# Pokemon Data Cleaning

In [2]:
# Import relevant libraries.
import pandas as pd

# Location of the raw Pokémon games CSV.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
csv_path = "/Users/adnanhussain/Documents/Jupiter Notebook Projects/Pokemon Games/PokemonGamesData.csv"

# Load the file into a dataframe.
df = pd.read_csv(csv_path)

# Show the dataframe as the cell output.
df

Unnamed: 0,Game,Year,Units sold(in millions),GameRankings,Metacritic
0,Pokémon Red and Blue,1996,31.37[70],88%[71][72],-
1,Pokémon Yellow,1998,14.64[70],85%[73],-
2,Pokémon Gold and Silver,1999,23.73[74],90%[75][76],-
3,Pokémon Crystal,2000,6.39[70],80%[77],-
4,Pokémon Ruby and Sapphire,2002,16.22[78],84%[79][80],82/100[81]
5,Pokémon FireRed and LeafGreen,2004,12[78],81%[82][83],81/100[84][85]
6,Pokémon Emerald,2004,6.32[86],77%[87],76/100[88]
7,Pokémon Diamond and Pearl,2006,17.67[89],85%[90][91],85/100[92][93]
8,Pokémon Platinum,2008,7.06[94],83%[95],84/100[96]
9,Pokémon HeartGold and SoulSilver,2009,12.72[89],88%[97][98],87/100[99][100]


In [3]:
# None of these columns has the dedicated string dtype yet: pandas reports
# each of them as the generic `object` dtype. They still carry citation
# markers such as "[70]" that are stripped in a later cell.
columns_to_check = ['Units sold(in millions)', 'GameRankings', 'Metacritic']
x, y, z = (df[column].dtypes for column in columns_to_check)
print(x, y, z)

object object object


In [4]:
# Columns whose entries still contain citation markers; each one is mapped to
# pandas' dedicated string dtype.
text_columns = ('Units sold(in millions)', 'GameRankings', 'Metacritic')
convert_dict = dict.fromkeys(text_columns, 'string')

# Apply the conversion, then print the dtypes to confirm the change.
df = df.astype(convert_dict)
print(df.dtypes)


Game                               object
Year                                int64
Units sold(in millions)    string[python]
GameRankings               string[python]
Metacritic                 string[python]
dtype: object


In [5]:
# Mark the "-" placeholder (used where a game has no Metacritic score) as a
# missing value.
# pd.NA is passed instead of None: per the pandas documentation, releases
# before 1.4 silently ignored an explicit `value=None` and fell back to
# pad (forward-fill) replacement, which would copy the previous row's value
# into the cell rather than blanking it.
df = df.replace("-", pd.NA)

In [6]:
# Remove citation markers such as "[70]" or "[100]" and the "%" sign from
# every cell of the dataframe.
# `\d+` accepts a marker with any number of digits, so one-digit or four-digit
# citations are stripped as well as the two- and three-digit ones.
# (A former second pass over 'Metacritic' replaced the empty pattern with an
# empty string, which changed nothing, so it has been dropped.)
df = df.replace(to_replace=r"\[\d+\]|%", value="", regex=True)

In [7]:
# In Metacritic, convert "score/out-of" fractions (e.g. "82/100") into
# string-typed decimals (e.g. "0.82").
def _fraction_to_decimal_str(entry):
    """Return the fraction text in `entry` as a decimal string.

    Missing values are returned unchanged. The score is everything before the
    "/", so scores of any length ("9/100", "100/100") are parsed correctly
    instead of relying on a fixed two-character slice. An entry without a
    "/out-of" part is treated as a score out of 100.

    Raises:
        ValueError: if the score or the denominator is not an integer.
    """
    if pd.isna(entry):
        return entry
    numerator, _, denominator = str(entry).partition("/")
    scale = int(denominator) if denominator.strip() else 100
    return f"{int(numerator) / scale}"


# Map over the whole column rather than writing row by row through a manual
# counter, which only worked while the index was exactly 0..n-1. The cast
# keeps the column in the string dtype until the final dtype conversion.
df['Metacritic'] = df['Metacritic'].map(_fraction_to_decimal_str).astype('string')

df

Unnamed: 0,Game,Year,Units sold(in millions),GameRankings,Metacritic
0,Pokémon Red and Blue,1996,31.37,88.0,
1,Pokémon Yellow,1998,14.64,85.0,
2,Pokémon Gold and Silver,1999,23.73,90.0,
3,Pokémon Crystal,2000,6.39,80.0,
4,Pokémon Ruby and Sapphire,2002,16.22,84.0,0.82
5,Pokémon FireRed and LeafGreen,2004,12.0,81.0,0.81
6,Pokémon Emerald,2004,6.32,77.0,0.76
7,Pokémon Diamond and Pearl,2006,17.67,85.0,0.85
8,Pokémon Platinum,2008,7.06,83.0,0.84
9,Pokémon HeartGold and SoulSilver,2009,12.72,88.0,0.87


In [8]:
# Final dtype for each column: text for the title, nullable integers for the
# year and the GameRankings score, floats for sales and the Metacritic score.
nullable_int = pd.Int64Dtype()
convert_dict = {
    'Game': 'string',
    'Year': nullable_int,
    'Units sold(in millions)': float,
    'GameRankings': nullable_int,
    'Metacritic': float,
}

# Cast every listed column in one call, then print the dtypes to confirm.
df = df.astype(dtype=convert_dict)
print(df.dtypes)

Game                       string[python]
Year                                Int64
Units sold(in millions)           float64
GameRankings                        Int64
Metacritic                        float64
dtype: object


In [16]:
# Write the cleaned dataframe to a CSV file for later use; the row index is
# left out of the file.
# NOTE(review): absolute, machine-specific output path — consider making it
# configurable.
output_path = '/Users/adnanhussain/Documents/PokemonGamesCleaned.csv'
df.to_csv(output_path, index=False)