In [1]:
import pandas as pd

# Data Cleaning
## Data cleaning in pandas is the process of fixing, correcting, or removing incorrect, inconsistent, or missing data in a DataFrame so that your dataset becomes accurate and useful for analysis or machine learning. In simple terms, data cleaning means fixing or removing incomplete, incorrect, or irrelevant data.
# ~75% of work done in Pandas is Data Cleaning

Real-world datasets often have:

- Missing values
- Duplicates
- Wrong data types
- Outliers
- Inconsistent formatting
- Noise or irrelevant rows

Cleaning makes the data reliable, structured, and ready for analysis.

In [2]:
# Read the raw dataset into a DataFrame and preview it.
# NOTE(review): this is an absolute path on one machine; the notebook only runs
# elsewhere if the file is placed at the same location.
pokemon_csv = r"C:\Users\Ryuk\Jupyter\Notes\importing\pokemon-150.csv"
df = pd.read_csv(pokemon_csv)
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   Bulbasaur    Grass  Poison     0.7     6.9          0
1      1   Bulbasaur    Grass  Poison     0.7     6.9          0
2      2     Ivysaur    Grass  Poison     1.0    13.0          0
3      3    Venusaur    Grass  Poison     2.0   100.0          0
4      4  Charmander     Fire     NaN     0.6     8.5          0
..   ...         ...      ...     ...     ...     ...        ...
147  147     Dratini   Dragon     NaN     1.8     3.3          0
148  148   Dragonair   Dragon     NaN     4.0    16.5          0
149  149   Dragonite   Dragon  Flying     2.2   210.0          0
150  150      Mewtwo  Psychic     NaN     2.0   122.0          1
151  150      Mewtwo  Psychic     NaN     2.0   122.0          1

[152 rows x 7 columns]


## Drop irrelevant columns

In [3]:
# drop() returns a new DataFrame without the listed columns; reassign to keep it.
unneeded_columns = ["Type2", "No"]
df = df.drop(columns=unneeded_columns)
print(df)

           Name    Type1  Height  Weight  Legendary
0     Bulbasaur    Grass     0.7     6.9          0
1     Bulbasaur    Grass     0.7     6.9          0
2       Ivysaur    Grass     1.0    13.0          0
3      Venusaur    Grass     2.0   100.0          0
4    Charmander     Fire     0.6     8.5          0
..          ...      ...     ...     ...        ...
147     Dratini   Dragon     1.8     3.3          0
148   Dragonair   Dragon     4.0    16.5          0
149   Dragonite   Dragon     2.2   210.0          0
150      Mewtwo  Psychic     2.0   122.0          1
151      Mewtwo  Psychic     2.0   122.0          1

[152 rows x 5 columns]


# Handling missing data
In our CSV file every Pokémon has a Type1, but not all of them have a Type2, so many of the Type2 values are empty. We are going to drop every row whose Type2 is null.

In [4]:
# Start again from the raw file: the previous cell removed the Type2 column
# that this example needs.
df = pd.read_csv(r"C:\Users\Ryuk\Jupyter\Notes\importing\pokemon-150.csv")
print(df)

# dropna = "drop not available": discard every row whose Type2 is missing.
df = df.dropna(subset=["Type2"]) 
# to_string() prints the whole frame instead of the shortened preview.
print(df.to_string())

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   Bulbasaur    Grass  Poison     0.7     6.9          0
1      1   Bulbasaur    Grass  Poison     0.7     6.9          0
2      2     Ivysaur    Grass  Poison     1.0    13.0          0
3      3    Venusaur    Grass  Poison     2.0   100.0          0
4      4  Charmander     Fire     NaN     0.6     8.5          0
..   ...         ...      ...     ...     ...     ...        ...
147  147     Dratini   Dragon     NaN     1.8     3.3          0
148  148   Dragonair   Dragon     NaN     4.0    16.5          0
149  149   Dragonite   Dragon  Flying     2.2   210.0          0
150  150      Mewtwo  Psychic     NaN     2.0   122.0          1
151  150      Mewtwo  Psychic     NaN     2.0   122.0          1

[152 rows x 7 columns]
      No        Name     Type1     Type2  Height  Weight  Legendary
0      1   Bulbasaur     Grass    Poison     0.7     6.9          0
1      1   Bulbasaur     Grass    Poison     0.7     6.9    

## Replacing missing values with `fillna` instead of dropping the rows

In [5]:
# Reload the raw file so the rows removed by dropna in the previous cell are back.
df = pd.read_csv(r"C:\Users\Ryuk\Jupyter\Notes\importing\pokemon-150.csv")

# fillna keeps every row and fills the gaps instead: missing Type2 entries
# become the text "None" (a string, not Python's None).
missing_type2_fill = {"Type2": "None"}
df = df.fillna(value=missing_type2_fill)
print(df.to_string())

      No        Name     Type1     Type2  Height  Weight  Legendary
0      1   Bulbasaur     Grass    Poison     0.7     6.9          0
1      1   Bulbasaur     Grass    Poison     0.7     6.9          0
2      2     Ivysaur     Grass    Poison     1.0    13.0          0
3      3    Venusaur     Grass    Poison     2.0   100.0          0
4      4  Charmander      Fire      None     0.6     8.5          0
5      5  Charmeleon      Fire      None     1.1    19.0          0
6      6   Charizard      Fire    Flying     1.7    90.5          0
7      7    Squirtle     Water      None     0.5     9.0          0
8      8   Wartortle     Water      None     1.0    22.5          0
9      9   Blastoise     Water      None     1.6    85.5          0
10    10    Caterpie       Bug      None     0.3     2.9          0
11    11     Metapod       Bug      None     0.7     9.9          0
12    12  Butterfree       Bug    Flying     1.1    32.0          0
13    13      Weedle       Bug    Poison     0.3

## Changing existing data
Use `replace` to fix inconsistent values in a column.


In [6]:
# Rewrite every Type1 value equal to "Grass" as "GRASS"; other values are untouched.
df["Type1"] = df["Type1"].replace("Grass", "GRASS")
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   Bulbasaur    GRASS  Poison     0.7     6.9          0
1      1   Bulbasaur    GRASS  Poison     0.7     6.9          0
2      2     Ivysaur    GRASS  Poison     1.0    13.0          0
3      3    Venusaur    GRASS  Poison     2.0   100.0          0
4      4  Charmander     Fire    None     0.6     8.5          0
..   ...         ...      ...     ...     ...     ...        ...
147  147     Dratini   Dragon    None     1.8     3.3          0
148  148   Dragonair   Dragon    None     4.0    16.5          0
149  149   Dragonite   Dragon  Flying     2.2   210.0          0
150  150      Mewtwo  Psychic    None     2.0   122.0          1
151  150      Mewtwo  Psychic    None     2.0   122.0          1

[152 rows x 7 columns]


In [7]:
# A dict maps several old values to new ones in a single replace() call.
# "Grass" was already rewritten by the previous cell, so only "Fire" changes here.
type1_renames = {"Grass": "GRASS", "Fire": "FIRE"}
df["Type1"] = df["Type1"].replace(type1_renames)
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   Bulbasaur    GRASS  Poison     0.7     6.9          0
1      1   Bulbasaur    GRASS  Poison     0.7     6.9          0
2      2     Ivysaur    GRASS  Poison     1.0    13.0          0
3      3    Venusaur    GRASS  Poison     2.0   100.0          0
4      4  Charmander     FIRE    None     0.6     8.5          0
..   ...         ...      ...     ...     ...     ...        ...
147  147     Dratini   Dragon    None     1.8     3.3          0
148  148   Dragonair   Dragon    None     4.0    16.5          0
149  149   Dragonite   Dragon  Flying     2.2   210.0          0
150  150      Mewtwo  Psychic    None     2.0   122.0          1
151  150      Mewtwo  Psychic    None     2.0   122.0          1

[152 rows x 7 columns]


# Standardize text

In [8]:
# .str applies the string method to every value in the column.
lowercase_names = df["Name"].str.lower()
df["Name"] = lowercase_names
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   bulbasaur    GRASS  Poison     0.7     6.9          0
1      1   bulbasaur    GRASS  Poison     0.7     6.9          0
2      2     ivysaur    GRASS  Poison     1.0    13.0          0
3      3    venusaur    GRASS  Poison     2.0   100.0          0
4      4  charmander     FIRE    None     0.6     8.5          0
..   ...         ...      ...     ...     ...     ...        ...
147  147     dratini   Dragon    None     1.8     3.3          0
148  148   dragonair   Dragon    None     4.0    16.5          0
149  149   dragonite   Dragon  Flying     2.2   210.0          0
150  150      mewtwo  Psychic    None     2.0   122.0          1
151  150      mewtwo  Psychic    None     2.0   122.0          1

[152 rows x 7 columns]


# fix or change data types

In [9]:
# Legendary is stored as 0/1; casting the column to bool turns it into False/True.
df = df.astype({"Legendary": bool})
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   bulbasaur    GRASS  Poison     0.7     6.9      False
1      1   bulbasaur    GRASS  Poison     0.7     6.9      False
2      2     ivysaur    GRASS  Poison     1.0    13.0      False
3      3    venusaur    GRASS  Poison     2.0   100.0      False
4      4  charmander     FIRE    None     0.6     8.5      False
..   ...         ...      ...     ...     ...     ...        ...
147  147     dratini   Dragon    None     1.8     3.3      False
148  148   dragonair   Dragon    None     4.0    16.5      False
149  149   dragonite   Dragon  Flying     2.2   210.0      False
150  150      mewtwo  Psychic    None     2.0   122.0       True
151  150      mewtwo  Psychic    None     2.0   122.0       True

[152 rows x 7 columns]


# Remove Duplicate values


In [10]:
# Inspect the frame before de-duplicating: rows 0 and 1, and rows 150 and 151,
# hold identical values.
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   bulbasaur    GRASS  Poison     0.7     6.9      False
1      1   bulbasaur    GRASS  Poison     0.7     6.9      False
2      2     ivysaur    GRASS  Poison     1.0    13.0      False
3      3    venusaur    GRASS  Poison     2.0   100.0      False
4      4  charmander     FIRE    None     0.6     8.5      False
..   ...         ...      ...     ...     ...     ...        ...
147  147     dratini   Dragon    None     1.8     3.3      False
148  148   dragonair   Dragon    None     4.0    16.5      False
149  149   dragonite   Dragon  Flying     2.2   210.0      False
150  150      mewtwo  Psychic    None     2.0   122.0       True
151  150      mewtwo  Psychic    None     2.0   122.0       True

[152 rows x 7 columns]


Duplicate entries for bulbasaur (rows 0 and 1) and mewtwo (rows 150 and 151) are present.

In [11]:
# drop_duplicates() removes rows that repeat an earlier row in every column.
# keep="first" (the default) retains the first occurrence; the surviving rows
# keep their original index labels.
df = df.drop_duplicates(keep="first")
print(df)

      No        Name    Type1   Type2  Height  Weight  Legendary
0      1   bulbasaur    GRASS  Poison     0.7     6.9      False
2      2     ivysaur    GRASS  Poison     1.0    13.0      False
3      3    venusaur    GRASS  Poison     2.0   100.0      False
4      4  charmander     FIRE    None     0.6     8.5      False
5      5  charmeleon     FIRE    None     1.1    19.0      False
..   ...         ...      ...     ...     ...     ...        ...
146  146     moltres     FIRE  Flying     2.0    60.0       True
147  147     dratini   Dragon    None     1.8     3.3      False
148  148   dragonair   Dragon    None     4.0    16.5      False
149  149   dragonite   Dragon  Flying     2.2   210.0      False
150  150      mewtwo  Psychic    None     2.0   122.0       True

[150 rows x 7 columns]
