In [35]:
import pandas as pd

In [36]:
# Initialize string constants
gameOfOrigin = 'Game(s) of Origin'
generation = 'Generation'
legendaryStatus = 'Legendary Status'
legendaryType = 'Legendary Type'
evolutionDetails = 'Evolution Details'
evolutionLevel = 'Level of Evolution'
pokemonHeight = 'Pokemon Height'
pokemonWeight = 'Pokemon Weight'
primaryType = 'Primary Type'
secondaryType = 'Secondary Type'
primaryAbility = 'Primary Ability'
secondaryAbility = 'Secondary Ability'
hiddenAbility = 'Hidden Ability'
types = 'Types'
maleFemaleRatio = 'M:F Ratio'
baseHappiness = 'Base Happiness'
health = 'Health Stat'
attack = 'Attack Stat'
defense = 'Defense Stat'
specialAttack = 'Special Attack Stat'
specialDefense = 'Special Defense Stat'
speed = 'Speed Stat'
baseStatTotal = 'Base Stat Total'
catchRate = 'Catch Rate'
experienceGrowth = 'Experience Growth'
experienceGrowthTotal = 'Experience Growth Total'
primaryEggGroup =  'Primary Egg Group'
secondaryEggGroup =  'Secondary Egg Group'
eggCycleCount = 'Egg Cycle Count'
na = "NA"

# Variable lists
variables = [gameOfOrigin, generation, primaryType, secondaryType, types, legendaryStatus, legendaryType, evolutionDetails, evolutionLevel, pokemonHeight, pokemonWeight, maleFemaleRatio, baseHappiness, health, attack, defense, specialAttack, specialDefense, speed, baseStatTotal, catchRate, experienceGrowth, experienceGrowthTotal, primaryEggGroup, secondaryEggGroup, eggCycleCount]
categoricalVariables = [gameOfOrigin, generation, primaryType, secondaryType, types, legendaryStatus, legendaryType, evolutionDetails, maleFemaleRatio, experienceGrowth, experienceGrowthTotal, primaryEggGroup, secondaryEggGroup]
specialCategoricalVariables = [types, evolutionDetails]
categoricalVariablesWithNA = [secondaryType, evolutionDetails, legendaryType, secondaryAbility, hiddenAbility, secondaryEggGroup]
quantitativeVariables= [evolutionLevel, pokemonHeight, pokemonWeight, baseHappiness, health, attack, defense, specialAttack, specialDefense, speed, baseStatTotal, catchRate, eggCycleCount]

# Evolution details specific categories
levelingUpCategory = 'Leveling Up'
tradeOrFriendshipCategory = 'Trade or Friendship'
stoneCategory = "Evolution stone"
otherCategory = 'Other'

In [37]:
df1 = pd.read_csv('Datasets/Pokemon-1.csv')
df2 = pd.read_csv('Datasets/Pokemon-2.csv')

In [38]:
df1.rename(columns={'number':'Pokedex Number', 'legendary':'Legendary Status', 'generation': 'Generation'}, inplace=True)
df1_f = df1[['Pokedex Number', 'Generation', 'Legendary Status']]

# Remove duplicate rows based on the 'Name' column, keeping the first occurrence
df1_f = df1_f.drop_duplicates(subset='Pokedex Number', keep='first')

# Keep relevant attributes in df2
print(df2.columns)
df2 = df2[['Pokemon Id', 'Pokedex Number', 'Pokemon Name', 'Classification',
       'Alternate Form Name', 'Legendary Type',
       'Pokemon Height', 'Pokemon Weight', 'Primary Type', 'Secondary Type',
       'Primary Ability', 'Secondary Ability','Hidden Ability', 'Special Event Ability',
       'Male Ratio', 'Female Ratio', 'Base Happiness', 'Game(s) of Origin', 'Health Stat', 'Attack Stat',
       'Defense Stat', 'Special Attack Stat', 'Special Defense Stat',
       'Speed Stat', 'Base Stat Total', 'Health EV', 'Attack EV', 'Defense EV',
       'Special Attack EV', 'Special Defense EV', 'Speed EV', 'EV Yield Total',
       'Catch Rate', 'Experience Growth', 'Experience Growth Total',
       'Primary Egg Group', 'Secondary Egg Group', 'Egg Cycle Count',
       'Pre-Evolution Pokemon Id', 'Evolution Details']]

Index(['Pokemon Id', 'Pokedex Number', 'Pokemon Name', 'Classification',
       'Alternate Form Name', 'Original Pokemon ID', 'Legendary Type',
       'Pokemon Height', 'Pokemon Weight', 'Primary Type', 'Secondary Type',
       'Primary Ability', 'Primary Ability Description', 'Secondary Ability',
       'Secondary Ability Description', 'Hidden Ability',
       'Hidden Ability Description', 'Special Event Ability',
       'Special Event Ability Description', 'Male Ratio', 'Female Ratio',
       'Base Happiness', 'Game(s) of Origin', 'Health Stat', 'Attack Stat',
       'Defense Stat', 'Special Attack Stat', 'Special Defense Stat',
       'Speed Stat', 'Base Stat Total', 'Health EV', 'Attack EV', 'Defense EV',
       'Special Attack EV', 'Special Defense EV', 'Speed EV', 'EV Yield Total',
       'Catch Rate', 'Experience Growth', 'Experience Growth Total',
       'Primary Egg Group', 'Secondary Egg Group', 'Egg Cycle Count',
       'Pre-Evolution Pokemon Id', 'Evolution Details'],
     

In [39]:
df = pd.merge(df2, df1_f, on='Pokedex Number', how='inner')
df = df.sort_values(by='Pokedex Number', ascending=True)

In [40]:
# Remove the row with more NULL values in each group of duplicates
# Function to count NULL values in each row
def count_nulls(row):
    return row.isnull().sum()
# Removing the duplicates
df = df.groupby('Pokemon Id').apply(
    lambda group: group.loc[group.apply(count_nulls, axis=1).idxmin()]
).reset_index(drop=True)

# Strip out double quotes
for column in df.select_dtypes(include=['object']):  # 'object' is the dtype for text columns
    df[column] = df[column].str.strip('"')

# Dropping other duplicates
df = df.drop_duplicates()

  df = df.groupby('Pokemon Id').apply(


In [41]:
#Add the value "Trade of Friendship" for entries that aren't populated
condition = df['Pre-Evolution Pokemon Id'].notnull() & df['Evolution Details'].isnull()
df.loc[condition, 'Evolution Details'] = 'Trade or Friendship'

# Derive new variables
# Create new variable 'Level of Evolution' if Pokemon can evolve by leveling up
df['Level of Evolution'] = df['Evolution Details'].str.extract(r'^Level (\d+)$').astype(float)

# Combine the male and female ratios into a single attribute
df['M:F Ratio'] = df.apply(lambda row: f"{row['Male Ratio']}:{row['Female Ratio']}", axis=1)

# Reorder columns
print(df.columns)
df = df[['Pokemon Id', 'Pokedex Number', 'Pokemon Name', 'Game(s) of Origin', 'Generation',
        'Alternate Form Name', 'Legendary Status', 'Legendary Type', 'Pre-Evolution Pokemon Id',
        'Evolution Details', 'Level of Evolution', 'Pokemon Height',
        'Pokemon Weight', 'Primary Type', 'Secondary Type', 'Primary Ability',
        'Secondary Ability', 'Hidden Ability', 'Special Event Ability',
        'Male Ratio', 'Female Ratio', 'M:F Ratio', 'Base Happiness', 'Health Stat', 'Attack Stat', 'Defense Stat', 'Special Attack Stat',
        'Special Defense Stat', 'Speed Stat', 'Base Stat Total', 'Health EV',
        'Attack EV', 'Defense EV', 'Special Attack EV', 'Special Defense EV',
        'Speed EV', 'EV Yield Total', 'Catch Rate', 'Experience Growth',
        'Experience Growth Total', 'Primary Egg Group', 'Secondary Egg Group',
        'Egg Cycle Count']]

Index(['Pokemon Id', 'Pokedex Number', 'Pokemon Name', 'Classification',
       'Alternate Form Name', 'Legendary Type', 'Pokemon Height',
       'Pokemon Weight', 'Primary Type', 'Secondary Type', 'Primary Ability',
       'Secondary Ability', 'Hidden Ability', 'Special Event Ability',
       'Male Ratio', 'Female Ratio', 'Base Happiness', 'Game(s) of Origin',
       'Health Stat', 'Attack Stat', 'Defense Stat', 'Special Attack Stat',
       'Special Defense Stat', 'Speed Stat', 'Base Stat Total', 'Health EV',
       'Attack EV', 'Defense EV', 'Special Attack EV', 'Special Defense EV',
       'Speed EV', 'EV Yield Total', 'Catch Rate', 'Experience Growth',
       'Experience Growth Total', 'Primary Egg Group', 'Secondary Egg Group',
       'Egg Cycle Count', 'Pre-Evolution Pokemon Id', 'Evolution Details',
       'Generation', 'Legendary Status', 'Level of Evolution', 'M:F Ratio'],
      dtype='object')


In [42]:
# Filter out Pokemon from the 5th generation and before
df = df[(df['Generation'] <= 5) & (df['Generation'] != 0)]

# Remove Gigantamax Pokemon
df = df[df['Alternate Form Name'] != 'Gigantamax']

# Remove Pokemon from games after Black 2
df = df[df['Game(s) of Origin'] != 'Legends Arceus']
df = df[df['Game(s) of Origin'] != "Let's Go Pikachu"]
df = df[df['Game(s) of Origin'] != 'Scarlet']
df = df[df['Game(s) of Origin'] != 'Sun']
df = df[df['Game(s) of Origin'] != 'Sword']
df = df[df['Game(s) of Origin'] != 'X']
df = df[df['Game(s) of Origin'] != 'Omega Ruby']

# Remove spaces from values of certain attributes
df[primaryEggGroup] = df[primaryEggGroup].str.replace(' ', '', regex=False)
df[primaryEggGroup] = df[primaryEggGroup].str.replace('NoEggsDiscovered', 'No Eggs', regex=False)
df[primaryEggGroup] = df[primaryEggGroup].str.replace('Human-Like', 'Human Like', regex=False)
df[secondaryEggGroup] = df[secondaryEggGroup].str.replace(' ', '', regex=False)
df[secondaryEggGroup] = df[secondaryEggGroup].str.replace('Human-Like', 'Human Like', regex=False)
df[gameOfOrigin] = df[gameOfOrigin].str.replace(' ', '', regex=False)

# Assign category as NA if a categorical variable has no value
df[categoricalVariablesWithNA] = df[categoricalVariablesWithNA].fillna("NA")

# Updates categories for evolution details
# Iterate over the rows in the DataFrame
for idx, row in df.iterrows():
    # Get the evolution detail, ensuring it's a string and lowercase
    evolutionDetail = str(row[evolutionDetails]).lower()
    print(evolutionDetail)
    
    # Apply categorization based on the evolution detail
    if "level" in evolutionDetail:
        df.at[idx, evolutionDetails] = levelingUpCategory
        # print(df[idx, evolutionDetails])
    elif "stone" in evolutionDetail:
        df.at[idx, evolutionDetails] = stoneCategory
    elif "trade" in evolutionDetail:
        df.at[idx, evolutionDetails] = tradeOrFriendshipCategory
    elif evolutionDetail == na:
        continue
    else:
        df.at[idx, evolutionDetails] = na

    legendary_type_value = str(row[legendaryType]).lower()
    if "sub-legendary" in legendary_type_value:
        df.at[idx, legendaryType] = "Sub Legendary"

print(df.columns)

# Export the data
df.to_csv('new_data.csv', index=False, encoding='utf-8-sig')

na
level 16
level 32
na
level 16
level 36
na
level 16
level 36
level 16
moon stone
level 36
na
level 15
level 30
level 7
level 10
level 10
na
level 7
na
na
na
na
level 14
na
na
na
na
na
na
na
level 16
moon stone
na
trade or friendship
moon stone
na
fire stone
na
trade or friendship
moon stone
na
level 22
trade or friendship
na
level 21
leaf stone
sun stone
na
level 24
na
level 31
na
level 26
na
level 18
level 36
na
level 20
na
level 20
na
level 22
na
trade or friendship
thunder stone in kanto (lgpe) or ultra space (usum)
na
level 22
na
level 28
na
level 30
na
leaf stone
na
level 28
level 20 with attack > defense
level 20 with attack < defense
na
na
level 35
na
level 42
holding an oval stone during the day
na
na
na
level 32
na
level 33
na
water stone
knowing mimic
na
level 30
level 30
level 30
na
na
na
level 20
na
na
water stone
thunder stone
fire stone
na
na
level 40
na
level 40
na
trade or friendship
na
na
na
na
level 30
level 55
na
level 16
level 32
na
level 14
level 36
na
level 18
l