**DATA CLEANING**

In [1]:
import pandas as pd

**data cleaning = the process of fixing/removing
incomplete, incorrect or irrelevant data.
-75 % of work done with pandas is data cleaning.**

In [33]:
df = pd.read_csv("pokemon.csv")
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  Grass  Poison  45      49       49       65       65   
1  2        Ivysaur  Grass  Poison  60      62       63       80       80   
2  3       Venusaur  Grass  Poison  80      82       83      100      100   
3  4  Mega Venusaur  Grass  Poison  80     100      123      122      120   
4  5     Charmander   Fire     NaN  39      52       43       60       50   

   Speed  Generation  Legendary  
0     45           1      False  
1     60           1      False  
2     80           1      False  
3     80           1      False  
4     65           1      False  


**1. DROP IRRELEVANT COLUMNS**

In [34]:
df = df.drop(columns=['Legendary'],errors='ignore')
print(df.head())


   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  Grass  Poison  45      49       49       65       65   
1  2        Ivysaur  Grass  Poison  60      62       63       80       80   
2  3       Venusaur  Grass  Poison  80      82       83      100      100   
3  4  Mega Venusaur  Grass  Poison  80     100      123      122      120   
4  5     Charmander   Fire     NaN  39      52       43       60       50   

   Speed  Generation  
0     45           1  
1     60           1  
2     80           1  
3     80           1  
4     65           1  


**2.HANDLE MISSING DATA**

In [18]:
df = df.dropna(subset=['Type 2'])
print(df.to_string())

       #                       Name    Type 1    Type 2   HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Generation
0      1                  Bulbasaur     Grass    Poison   45      49       49       65       65     45           1
1      2                    Ivysaur     Grass    Poison   60      62       63       80       80     60           1
2      3                   Venusaur     Grass    Poison   80      82       83      100      100     80           1
3      4              Mega Venusaur     Grass    Poison   80     100      123      122      120     80           1
6      7                  Charizard      Fire    Flying   78      84       78      109       85    100           1
7      8           Mega Charizard X      Fire    Dragon   78     130      111      130       85    100           1
8      9           Mega Charizard Y      Fire    Flying   78     104       78      159      115    100           1
15    16                 Butterfree       Bug    Flying   60      45       50   

In [35]:
df = df.fillna({"Type 2":0})
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  Grass  Poison  45      49       49       65       65   
1  2        Ivysaur  Grass  Poison  60      62       63       80       80   
2  3       Venusaur  Grass  Poison  80      82       83      100      100   
3  4  Mega Venusaur  Grass  Poison  80     100      123      122      120   
4  5     Charmander   Fire       0  39      52       43       60       50   

   Speed  Generation  
0     45           1  
1     60           1  
2     80           1  
3     80           1  
4     65           1  


**3. FIX INCONSISTENT VALUES**

In [23]:
df["Type 1"] = df["Type 1"].replace({"Grass":"GRASS"})
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  GRASS  Poison  45      49       49       65       65   
1  2        Ivysaur  GRASS  Poison  60      62       63       80       80   
2  3       Venusaur  GRASS  Poison  80      82       83      100      100   
3  4  Mega Venusaur  GRASS  Poison  80     100      123      122      120   
6  7      Charizard   Fire  Flying  78      84       78      109       85   

   Speed  Generation  
0     45           1  
1     60           1  
2     80           1  
3     80           1  
6    100           1  


**4.STANDARDIZE TEXT**

In [26]:
df["Type 1"] = df["Type 1"].str.lower()
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      bulbasaur  grass  Poison  45      49       49       65       65   
1  2        ivysaur  grass  Poison  60      62       63       80       80   
2  3       venusaur  grass  Poison  80      82       83      100      100   
3  4  mega venusaur  grass  Poison  80     100      123      122      120   
6  7      charizard   fire  Flying  78      84       78      109       85   

   Speed  Generation  
0     45           1  
1     60           1  
2     80           1  
3     80           1  
6    100           1  


**5. FIXING DATA TYPE**


In [28]:
df["Legendary"] = df["Legendary"].astype(int)
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  Grass  Poison  45      49       49       65       65   
1  2        Ivysaur  Grass  Poison  60      62       63       80       80   
2  3       Venusaur  Grass  Poison  80      82       83      100      100   
3  4  Mega Venusaur  Grass  Poison  80     100      123      122      120   
4  5     Charmander   Fire     NaN  39      52       43       60       50   

   Speed  Generation  Legendary  
0     45           1          0  
1     60           1          0  
2     80           1          0  
3     80           1          0  
4     65           1          0  


**6.REMOVE DUPLICATE VALUES**

In [32]:
df = df.drop_duplicates()
print(df.head())

   #           Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  \
0  1      Bulbasaur  Grass  Poison  45      49       49       65       65   
1  2        Ivysaur  Grass  Poison  60      62       63       80       80   
2  3       Venusaur  Grass  Poison  80      82       83      100      100   
3  4  Mega Venusaur  Grass  Poison  80     100      123      122      120   
4  5     Charmander   Fire     NaN  39      52       43       60       50   

   Speed  Generation  Legendary  
0     45           1          0  
1     60           1          0  
2     80           1          0  
3     80           1          0  
4     65           1          0  
