# Imports

In [1]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Importing the OS and JSON Modules
import os,json

In [2]:
info = pd.read_csv('superhero_info - superhero_info.csv')
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [3]:
powers = pd.read_csv('superhero_powers - superhero_powers.csv')
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


# The Task
Your task is two-fold:

I. Clean the files and combine them into one final DataFrame.

This dataframe should have the following columns:
* Hero (Just the name of the Hero)
* Publisher
* Gender
* Eye color
* Race
* Hair color
* Height (numeric)
* Skin color
* Alignment
* Weight (numeric)
* Plus, one-hot-encoded columns for every power that appears in the dataset. E.g.:
* Agility
* Flight
* Superspeed
* etc.

In [4]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
dtypes: object(8)
memory usage: 29.1+ KB


## Splitting Hero and Publisher

In [5]:
info['Hero|Publisher'] 

0               A-Bomb|Marvel Comics
1       Abe Sapien|Dark Horse Comics
2                 Abin Sur|DC Comics
3          Abomination|Marvel Comics
4        Absorbing Man|Marvel Comics
                   ...              
458       Yellowjacket|Marvel Comics
459    Yellowjacket II|Marvel Comics
460                Yoda|George Lucas
461                Zatanna|DC Comics
462                   Zoom|DC Comics
Name: Hero|Publisher, Length: 463, dtype: object

In [6]:
## adding expand=True
info[['Hero','Publisher']]=info['Hero|Publisher'].str.split('|',expand=True)
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [7]:
# Move "Hero" and "Publisher" columns to the beginning of the DataFrame
info.insert(0, 'Publisher', info.pop('Publisher'))
info.insert(0, 'Hero', info.pop('Hero'))

In [8]:
info.drop(columns = "Hero|Publisher", inplace = True)
info.head()

Unnamed: 0,Hero,Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb,Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien,Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur,DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination,Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man,Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


## Measurements

In [9]:
## examining a single value from the coordinates col
measurements = info.loc[0,"Measurements"]
print(type(measurements))
measurements

<class 'str'>


"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"

In [10]:
## use .str.replace to replace all single quotes
info['Measurements'] = info['Measurements'].str.replace("'",'"')
## Apply the json.loads to the full column
info['Measurements'] = info['Measurements'].apply(json.loads)
info['Measurements'].head()

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: Measurements, dtype: object

In [11]:
height_weight = info['Measurements'].apply(pd.Series)
height_weight

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [12]:
# concat long_lat with original dataframe
info = pd.concat((info, height_weight), axis = 1)
info.head(2)

Unnamed: 0,Hero,Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Height,Weight
0,A-Bomb,Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",203.0 cm,441.0 kg
1,Abe Sapien,Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",191.0 cm,65.0 kg


In [13]:
info=info.drop(columns=['Measurements'])

In [14]:
info.head()

Unnamed: 0,Hero,Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Height,Weight
0,A-Bomb,Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,203.0 cm,441.0 kg
1,Abe Sapien,Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,191.0 cm,65.0 kg
2,Abin Sur,DC Comics,Male,Ungaran,good,No Hair,blue,red,185.0 cm,90.0 kg
3,Abomination,Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,203.0 cm,441.0 kg
4,Absorbing Man,Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,193.0 cm,122.0 kg


In [15]:
# convert them to numeric
info['Weight'] = info['Weight'].str.replace(' kg', '').astype(float)
info['Height'] = info['Height'].str.replace(' cm', '').astype(float)

In [16]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Hero        463 non-null    object 
 1   Publisher   463 non-null    object 
 2   Gender      463 non-null    object 
 3   Race        463 non-null    object 
 4   Alignment   463 non-null    object 
 5   Hair color  463 non-null    object 
 6   Eye color   463 non-null    object 
 7   Skin color  463 non-null    object 
 8   Height      463 non-null    float64
 9   Weight      463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


## One Hot Encode Powers

In [17]:
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [18]:
powers.loc[2,'Powers']

'Agility,Accelerated Healing,Cold Resistance,Durability,Underwater breathing,Marksmanship,Weapons Master,Longevity,Intelligence,Super Strength,Telepathy,Stamina,Immortality,Reflexes,Enhanced Sight,Sub-Mariner'

In [19]:
# Split the comma-separated values into a list of powers
powers['Powers_Split'] = powers['Powers'].str.split(',')

In [20]:
powers['Powers_Split'].value_counts()


[Intelligence]                                                                                                                                                                                                                                                                          8
[Durability, Super Strength]                                                                                                                                                                                                                                                            5
[Agility, Stealth, Marksmanship, Weapons Master, Stamina]                                                                                                                                                                                                                               4
[Marksmanship]                                                                                                                                            

In [21]:
## exploding the column of lists
exploded = powers.explode('Powers_Split')
exploded[['hero_names','Powers','Powers_Split']].head(5)

Unnamed: 0,hero_names,Powers,Powers_Split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing


In [22]:
## saving the unique values from the exploded column
cols_to_make = exploded['Powers_Split'].dropna().unique()
cols_to_make

array(['Agility', 'Super Strength', 'Stamina', 'Super Speed',
       'Accelerated Healing', 'Durability', 'Longevity', 'Camouflage',
       'Self-Sustenance', 'Cold Resistance', 'Underwater breathing',
       'Marksmanship', 'Weapons Master', 'Intelligence', 'Telepathy',
       'Immortality', 'Reflexes', 'Enhanced Sight', 'Sub-Mariner',
       'Lantern Power Ring', 'Invulnerability', 'Animation',
       'Super Breath', 'Dimensional Awareness', 'Flight', 'Size Changing',
       'Teleportation', 'Magic', 'Dimensional Travel',
       'Molecular Manipulation', 'Energy Manipulation', 'Power Cosmic',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Heat Resistance',
       'Matter Absorption', 'Regeneration', 'Stealth', 'Power Suit',
       'Energy Blasts', 'Energy Beams', 'Heat Generation', 'Danger Sense',
       'Phasing', 'Force Fields', 'Hypnokinesis', 'Invisibility',
       'Enhanced Senses', 'Jump', 'Shapeshifting', 'Elasticity',
 

In [24]:
for col in cols_to_make:
    powers[col] = powers['Powers'].str.contains(col)

  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'].str.contains(col)
  powers[col] = powers['Powers'

In [25]:
# drop columns
powers = powers.drop(columns=['Powers','Powers_Split'])


In [None]:
powers.rename

In [26]:
powers.head()

Unnamed: 0,hero_names,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,Self-Sustenance,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,True,False,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,True,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Merge Data Frames

In [27]:
hero = pd.merge(info, powers, left_on='Hero', right_on='hero_names', how='inner')
hero.head()

Unnamed: 0,Hero,Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Height,Weight,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,A-Bomb,Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Abe Sapien,Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False
2,Abin Sur,DC Comics,Male,Ungaran,good,No Hair,blue,red,185.0,90.0,...,False,False,False,False,False,False,False,False,False,False
3,Abomination,Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Absorbing Man,Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,193.0,122.0,...,False,False,False,False,False,False,False,False,False,False


## Compare the average weight of super powers who have Super Speed to those who do not.


In [32]:
# Average weight for superheroes with Super Speed
avg_weight_super_speed = hero[hero['Super Speed'] == 1]['Weight'].mean()

# Average weight for superheroes without Super Speed
avg_weight_no_super_speed = hero[hero['Super Speed'] == 0]['Weight'].mean()

print("average weight for superheroes with Super Speed: ", avg_weight_super_speed,"Kg" )
print("average weight for superheroes without Super Speed: ", avg_weight_no_super_speed,"Kg" )

average weight for superheroes with Super Speed:  129.40404040404042 Kg
average weight for superheroes without Super Speed:  101.77358490566037 Kg


## What is the average height of heroes for each publisher?

In [33]:
average_height_by_publisher = hero.groupby('Publisher')['Height'].mean()

In [34]:
average_height_by_publisher

Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height, dtype: float64