# Applying Advanced Transformations (Core)- Amber Kutscher
- Data Enrichment


In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the OS and JSON Modules
import os,json

In [2]:
df_info = pd.read_csv('Data/superhero_info - superhero_info.csv')
df_info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [3]:
df_powers = pd.read_csv('Data/superhero_powers - superhero_powers.csv')
df_powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [4]:
# Clean and preprocess the data in df_info
# Split the 'Hero|Publisher' column into 'Hero' and 'Publisher'
df_info[['Hero', 'Publisher']] = df_info['Hero|Publisher'].str.split('|', expand=True)

# Remove the 'Hero|Publisher' column
df_info = df_info.drop(columns=['Hero|Publisher'])

In [5]:
# Extract the 'Height' and 'Weight' from the 'Measurements' column
df_info['Height (cm)'] = df_info['Measurements'].str.extract(r"'Height': '([0-9.]+) cm'")
df_info['Weight (kg)'] = df_info['Measurements'].str.extract(r"'Weight': '([0-9.]+) kg'")

# Convert 'Height' and 'Weight' to numeric
df_info['Height (cm)'] = pd.to_numeric(df_info['Height (cm)'])
df_info['Weight (kg)'] = pd.to_numeric(df_info['Weight (kg)'])

# Drop the 'Measurements' column
df_info = df_info.drop(columns=['Measurements'])

In [6]:
# Preview the df_info dataframe to ensure that columns were split and dropped accordingly
df_info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height (cm),Weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0


In [7]:
# Clean and preprocess the data in df_powers
# Rename the columns for merging
df_powers.rename(columns={'hero_names': 'Hero'}, inplace=True)

In [8]:
# Split the 'Powers' column at the comma to create new columns
df_powers['Powers'] = df_powers['Powers'].str.split(',')

In [9]:
# Merge the two dataframes on the 'Hero' column
merged_df = pd.merge(df_info, df_powers, on='Hero', how='left')

In [10]:
# Create binary columns for each unique power and initialize them to 0 in merged_df
unique_powers = set(power for powers in df_powers['Powers'] for power in powers)

# Initialize a dictionary to hold the binary columns
binary_columns = {power: [0] * len(merged_df) for power in unique_powers}

In [11]:
# Concatenate the binary columns to the merged_df DataFrame
merged_df = pd.concat([merged_df] + [pd.DataFrame(binary_columns)], axis=1)

In [12]:
# Set the binary columns to 1 for heroes with the corresponding power in merged_df
for index, row in df_powers.iterrows():
    hero = row['Hero']
    powers = row['Powers']
    merged_df.loc[merged_df['Hero'] == hero, powers] = 1

In [13]:
# Now, we have the final dataframe
final_dataframe = merged_df
final_dataframe.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height (cm),Weight (kg),...,Invisibility,Weapons Master,Power Cosmic,Electrokinesis,Darkforce Manipulation,Banish,Astral Travel,Accelerated Healing,Odin Force,Sub-Mariner
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,1,0,0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,0,1,0,0,0,0,0,1,0,1
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0,...,0,0,0,0,0,0,0,0,0,0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,1,0,0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0,...,0,0,0,0,0,0,0,0,0,0


## Compare the average weight of super powers who have Super Speed to those who do not.?

In [14]:
# Calculate the average weight for superheroes with and without "Super Speed"
average_weight_with_super_speed = final_dataframe[final_dataframe['Super Speed'] == 1]['Weight (kg)'].mean().round(2)
average_weight_without_super_speed = final_dataframe[final_dataframe['Super Speed'] == 0]['Weight (kg)'].mean().round(2)

print(f'Average Weight (kg) of Superheroes with Super Speed: {average_weight_with_super_speed}')
print(f'Average Weight (kg) of Superheroes without Super Speed: {average_weight_without_super_speed}')

Average Weight (kg) of Superheroes with Super Speed: 129.4
Average Weight (kg) of Superheroes without Super Speed: 101.77


## What is the average height of heroes for each publisher?

In [15]:
# Calculate the average height of heroes for each publisher
average_height_by_publisher = final_dataframe.groupby('Publisher')['Height (cm)'].mean().round(2)
print('\nAverage Height (cm) of Heroes for Each Publisher:')
print(average_height_by_publisher)


Average Height (cm) of Heroes for Each Publisher:
Publisher
DC Comics            181.92
Dark Horse Comics    176.91
George Lucas         159.60
Image Comics         211.00
Marvel Comics        191.55
Shueisha             171.50
Star Trek            181.50
Team Epic TV         180.75
Unknown              178.00
Name: Height (cm), dtype: float64
