In [1]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Importing the OS and JSON Modules
import os,json

In [2]:
# Load the hero -> powers table (one row per hero, powers stored as one comma-separated string).
powers_csv = 'Data/superhero_powers - superhero_powers.csv'
super_powers = pd.read_csv(powers_csv)
super_powers.head(3)

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."


In [3]:
# Load the hero attribute table (combined 'Hero|Publisher' column plus a stringified Measurements dict).
info_csv = 'Data/superhero_info - superhero_info.csv'
super_info = pd.read_csv(info_csv)
super_info.head(3)

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"


# The Task

Your task is two-fold:

I. Clean the files and combine them into one final DataFrame.

    This dataframe should have the following columns:
        Hero (Just the name of the Hero) 
        Publisher
        Gender
        Eye color
        Race
        Hair color
        Height (numeric)
        Skin color
        Alignment
        Weight (numeric)
        Plus, one-hot-encoded columns for every power that appears in the dataset. E.g.:
            Agility
            Flight
            Superspeed
            etc.

Hint: There is a space in "100 kg" or "52.5 cm"

## Separate the publisher from the hero name column

In [4]:

# 'Hero|Publisher' packs two fields into one string: split on the pipe,
# store each half in its own column, then discard the combined column.
# NOTE(review): the task statement asks for a column named 'Hero'; this cell
# names it 'Hero Name' -- confirm which label the later merge expects.
hero_and_publisher = super_info['Hero|Publisher'].str.split('|', expand=True)
super_info[['Hero Name', 'Publisher']] = hero_and_publisher
super_info = super_info.drop(columns=['Hero|Publisher'])
super_info.head(3)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero Name,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics


## Separate the measurements column into height and weight numerical columns

In [5]:
# Pull out a single raw Measurements value to check what type it is stored as.
measurements = super_info.at[0, 'Measurements']
print(type(measurements))
measurements

<class 'str'>


"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"

In [6]:
# JSON only accepts double-quoted strings, so swap every single quote for a double quote.
single_quote, double_quote = "'", '"'
measurements = measurements.replace(single_quote, double_quote)
measurements

'{"Height": "203.0 cm", "Weight": "441.0 kg"}'

In [7]:
## The string is now valid JSON, so json.loads can parse it into a dictionary.
## Printing the type confirms the str -> dict conversion before applying it to every row.
fixed_measurements = json.loads(measurements)
print(type(fixed_measurements))
fixed_measurements

<class 'dict'>


{'Height': '203.0 cm', 'Weight': '441.0 kg'}

In [8]:
# Apply the same two-step fix to the whole Measurements column:
#   1. replace all single quotes with double quotes so each value is valid JSON
#      (regex=False makes the literal replacement explicit, matching the unit
#      clean-up cell, instead of relying on the pandas-version-dependent default)
#   2. parse each string into a dictionary with json.loads
# Doing both steps in one assignment means the column is only overwritten once
# every row has parsed successfully.
super_info['Measurements'] = (
    super_info['Measurements']
    .str.replace("'", '"', regex=False)
    .apply(json.loads)
)
super_info['Measurements'].head(3)

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
Name: Measurements, dtype: object

## Unpack measurement dictionaries into Height and Weight numerical columns

In [9]:
# Turn the column of dictionaries into a frame with one column per key,
# keeping the original row index so it lines up with super_info.
measurement_dicts = super_info['Measurements'].tolist()
height_weight = pd.DataFrame(measurement_dicts, index=super_info.index)
height_weight

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [10]:
# Record the units in the column labels so the values themselves can be reduced to plain numbers.
labels_with_units = {'Height': 'Height (cm)', 'Weight': 'Weight (kg)'}
height_weight = height_weight.rename(columns=labels_with_units)
height_weight

Unnamed: 0,Height (cm),Weight (kg)
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [11]:
# Strip each column's own unit suffix and convert the result to floats.
# The task requires numeric Height/Weight; removing the text alone would leave
# the values as strings such as '203.0' (object dtype).
unit_suffixes = {'Height (cm)': ' cm', 'Weight (kg)': ' kg'}
for column, suffix in unit_suffixes.items():
    # astype(str) keeps this cell safe to re-run once the column is already numeric
    without_unit = height_weight[column].astype(str).str.replace(suffix, '', regex=False)
    # to_numeric raises on anything that is still not a number, so bad rows surface here
    height_weight[column] = pd.to_numeric(without_unit)

height_weight.head()

Unnamed: 0,Height (cm),Weight (kg)
0,203.0,441.0
1,191.0,65.0
2,185.0,90.0
3,203.0,441.0
4,193.0,122.0


In [12]:
# Swap the raw Measurements column for the cleaned height/weight columns:
# drop the old column, then append the new ones side by side (aligned on the row index).
super_info = pd.concat(
    [super_info.drop(columns=['Measurements']), height_weight],
    axis=1,
)
super_info.head(3)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero Name,Publisher,Height (cm),Weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0


## One-hot encode (OHE) the power sets

In [13]:
# Re-display the first rows of the powers table before transforming the Powers column.
super_powers.head(3)

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."


In [14]:
# Look at one full Powers string (the table preview truncates long values).
super_powers.at[0, 'Powers']

'Agility,Super Strength,Stamina,Super Speed'

In [15]:
# Build 'powers_split' by replacing every comma in Powers with a double quote.
# regex=False makes the literal replacement explicit (consistent with the unit
# clean-up cell) instead of relying on the pandas-version-dependent default.
# NOTE(review): despite its name this column is still a single string, not a
# list -- if the goal is one-hot encoding, .str.split(',') may be what was
# intended; confirm against the cells that consume 'powers_split'.
super_powers['powers_split'] = super_powers['Powers'].str.replace(",", '"', regex=False)
super_powers.head(3)

Unnamed: 0,hero_names,Powers,powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","Agility""Super Strength""Stamina""Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","Accelerated Healing""Durability""Longevity""Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","Agility""Accelerated Healing""Cold Resistance""Du..."


In [16]:
# Look at one full powers_split value to verify the replacement.
super_powers.at[0, 'powers_split']

'Agility"Super Strength"Stamina"Super Speed'