In [2]:
#IMPORTS
import pandas as pd

In [3]:
# Load the raw character tables of each publisher from their CSV files.
DC_CSV_PATH = 'DC_data.csv'
MARVEL_CSV_PATH = 'Marvel_data.csv'

DC = pd.read_csv(DC_CSV_PATH)
Marvel = pd.read_csv(MARVEL_CSV_PATH)

In [5]:
# Discard every row that holds at least one missing value.
# Both frames are modified in place, so DC and Marvel keep referring to the same objects.
for frame in (DC, Marvel):
    frame.dropna(inplace=True)

In [6]:
#As some characters appear in multiple universes, the website separated them with parentheses (e.g. Batman (The Batman tv series))
#The function below removes those parenthesised suffixes from the names

def remove_par(universe):
    """Strip the parenthesised suffix from every character name.

    A name such as ``'Batman (The Batman tv series)'`` becomes ``'Batman'``:
    everything from the first ``' ('`` onwards is discarded. Names without
    that separator (and non-string values) are left unchanged.

    Parameters
    ----------
    universe : pandas.DataFrame
        Frame with a ``'Names'`` column. The column is overwritten on the
        frame itself.

    Returns
    -------
    pandas.DataFrame
        The same ``universe`` object, so the call can be reassigned.
    """
    # Single pass over the column, assigned back through the frame itself.
    # Mutating the selected column with ``inplace=True`` is chained assignment,
    # which is not propagated to the parent frame under pandas Copy-on-Write.
    universe['Names'] = universe['Names'].map(
        lambda name: name.split(' (')[0] if isinstance(name, str) else name
    )
    return universe

# Strip the parenthesised suffixes from the names of both publishers.
DC = remove_par(universe=DC)
Marvel = remove_par(universe=Marvel)


In [7]:
# DC in-depth cleaning: rename known spelling variants to one canonical name
# so that the duplicates of a character share exactly the same 'Names' value.
dc_name_fixes = {
    # 1- One duplicate of 'A.M.A.Z.O' carries an additional dot at the end ...
    'A.M.A.Z.O.': 'A.M.A.Z.O',
    # ... and a further duplicate was spotted under a different spelling.
    'Amazo': 'A.M.A.Z.O',
    # 2- 'Joker', 'Flash' and 'Riddler' appear a second time with a leading "The".
    'The Joker': 'Joker',
    'The Flash': 'Flash',
    'The Riddler': 'Riddler',
}

# Apply the fixes one by one, in the order listed above (exact matches only).
for variant, canonical in dc_name_fixes.items():
    DC.loc[DC['Names'] == variant, 'Names'] = canonical


In [8]:
# Marvel in-depth cleaning
# 1- One duplicate of 'Ikaris' is misspelled as 'Ikari'; rename it so that
#    both rows carry the same name.
is_ikari_typo = Marvel['Names'] == 'Ikari'
Marvel.loc[is_ikari_typo, 'Names'] = 'Ikaris'

In [9]:
#The web-scraping source separated the characters by universe, so the same character can appear in several rows (duplicates).
#To resolve that issue, the stats of each character are averaged across its duplicates
#A new df is returned with the values rounded to 2 decimals

def duplicates(universe):
    """Collapse duplicated characters into one row holding their average stats.

    Rows are grouped by ``'Names'``; every numeric column is averaged per
    group and rounded to 2 decimals. The ``'Unnamed: 0'`` column (presumably
    an index saved by an earlier ``to_csv`` - confirm) and the ``'Tier'``
    column are excluded from the result.

    Parameters
    ----------
    universe : pandas.DataFrame
        Frame with a ``'Names'`` column plus the stat columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per name and ``'Names'`` as the first column.
    """
    # Remove the unwanted columns before averaging. errors='ignore' keeps the
    # function usable on frames that lack them, e.g. when the cell is re-run
    # on an already de-duplicated frame.
    stats = universe.drop(columns=['Unnamed: 0', 'Tier'], errors='ignore')
    # numeric_only=True: pandas >= 2.0 raises TypeError when asked to average
    # a text column instead of silently skipping it.
    averaged = stats.groupby(['Names']).mean(numeric_only=True).round(2)
    return averaged.reset_index()

# Replace each publisher's table with its de-duplicated, averaged version.
DC = duplicates(universe=DC)
Marvel = duplicates(universe=Marvel)

In [10]:
# Write the final per-publisher tables to disk, leaving out the index column.
# NOTE(review): the target paths have no '.csv' extension - confirm that
# whatever reads these files expects them to be named 'DC' and 'Marvel'.
for frame, out_path in ((DC, 'DC'), (Marvel, 'Marvel')):
    frame.to_csv(out_path, index=False)

In [28]:
# Stack both datasets into one table, "all_data".
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of DC and Marvel overlap, so a label lookup could match two rows.
all_data = pd.concat([DC, Marvel], ignore_index=True)
print(len(all_data))

1860


In [18]:
# Read every battle-simulation file (battle_results_1.csv .. battle_results_52.csv)
# and stack them with a single concat. Growing a frame inside the loop re-copies
# all previously loaded rows on each iteration, and seeding it with an empty
# DataFrame triggers a pandas FutureWarning about empty entries in concat.
battles = pd.concat(
    [pd.read_csv(f'All_battle_results/battle_results_{i}.csv') for i in range(1, 53)]
)

In [25]:
# Attach the character stats to each fighter and keep only the battles whose
# fighters are present in 'all_data': a left merge leaves missing values for
# unknown names, and dropna() then removes those rows (together with any other
# row containing a missing value, which cleans the data at the same time).
# On the second pass the stat columns already exist in 'battles', so pandas
# applies its default '_x' / '_y' suffixes to the overlapping column names.
for fighter_col in ('Name1', 'Name2'):
    merged = battles.merge(all_data, how='left', left_on=fighter_col, right_on='Names')
    battles = merged.dropna()


In [27]:
# Write the merged battle table to a CSV file, leaving out the index column.
battles.to_csv(path_or_buf='battles_data.csv', index=False)