# EDA and Data Cleaning of Marvel Characters' Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [14]:
df = pd.read_csv('marvelscraping/characters.csv')

In [4]:
df.head()

Unnamed: 0,Name,Gender,Marital_Status,Height,Weight,Eye_Color,Hair_Color,Living_Status,Reality,Birthplace,Identity,Citizenship,First_Appearance,Appearances
0,6-Ball (Earth-616),Male,Single,,,,,Alive,Earth-616,,Secret,American,"July, 1991",1.0
1,6R (Earth-616),Agender,,,,,,Deceased,Earth-616,Sentinel Headquarters,,,"July, 1969",1.0
2,762 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,,Secret,,"June, 2010",1.0
3,627 (Skullbot) (Earth-12041),Agender,,,,Blue,No Hair At All,Deceased,Earth-12041,,Public,,"June 27, 2017",1.0
4,749 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,Legion,Secret,,"June, 2011",1.0


In [8]:
df.shape

(74350, 14)

In [9]:
df.dtypes

Name                 object
Gender               object
Marital_Status       object
Height              float64
Weight              float64
Eye_Color            object
Hair_Color           object
Living_Status        object
Reality              object
Birthplace           object
Identity             object
Citizenship          object
First_Appearance     object
Appearances         float64
dtype: object

In [6]:
df.isna().sum() / len(df) * 100

Name                 0.000000
Gender               2.793544
Marital_Status      55.166106
Height              92.968393
Weight              93.513114
Eye_Color           54.711500
Hair_Color          25.973100
Living_Status        0.024210
Reality              0.217888
Birthplace          88.133154
Identity            14.707465
Citizenship         28.574311
First_Appearance     8.264963
Appearances          6.359112
dtype: float64

In [5]:
df['Gender'].unique()

array(['Male', 'Agender', 'Female', nan, 'Non-Binary', 'Genderfluid',
       'Male/Female', 'Gestalt', 'female', 'Single', 'Transgender'],
      dtype=object)

In [6]:
df['Gender'].replace({'female' : 'Female', 'Male/Female' : np.nan, 'Single' : np.nan}, inplace = True)
df['Gender'].unique()

array(['Male', 'Agender', 'Female', nan, 'Non-Binary', 'Genderfluid',
       'Gestalt', 'Transgender'], dtype=object)

In [7]:
df['Marital_Status'].unique()

array(['Single', nan, 'Married', 'Widowed', 'Separated', 'Engaged',
       'Divorced', 'Estranged', 'Windowed', 'single', 'Unknown.', 'Blond',
       'Unnamed', 'Divorces', 'Unknown',
       'Married\x7f\'"`UNIQ--ref-00000009-QINU`"\'\x7f'], dtype=object)

In [8]:
df['Marital_Status'].replace({'Windowed' : 'Widowed', 'single' : 'Single', 'Unknown.' : np.nan,
                            'Blond' : np.nan, 'Unnamed' : np.nan, 
                            'Divorces' : 'Divorced', 'Unknown' : np.nan, 
                            'Married\x7f\'"`UNIQ--ref-00000009-QINU`"\'\x7f' : 'Married'},
                    inplace = True)
df['Marital_Status'].unique()

array(['Single', nan, 'Married', 'Widowed', 'Separated', 'Engaged',
       'Divorced', 'Estranged'], dtype=object)

In [15]:
df['Eye_Color'].unique()

array([nan, 'Blue', 'Red', 'Yellow', 'Brown', 'No Eyes At All', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Dark', 'Amber', 'Fair', 'Violet', 'Silver',
       'black', 'no visible pupils', 'Magenta', 'Compound', 'No Visible',
       'Brown-Grey', 'blue', 'yellow', 'brown',
       'no visible pupil or iris', 'red', 'No Iris', 'white',
       'No visible Irises or Pupils', 'hazel'], dtype=object)

In [16]:
df['Eye_Color'] = df['Eye_Color'].apply(lambda x : str(x).capitalize())
df['Eye_Color'].unique()

array(['Nan', 'Blue', 'Red', 'Yellow', 'Brown', 'No eyes at all', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Dark', 'Amber', 'Fair', 'Violet', 'Silver',
       'No visible pupils', 'Magenta', 'Compound', 'No visible',
       'Brown-grey', 'No visible pupil or iris', 'No iris',
       'No visible irises or pupils'], dtype=object)

In [18]:
df['Eye_Color'].replace({'Nan' : np.nan, 'Fair' : np.nan, 'Dark' : np.nan, 
                        'No visible pupils' : 'No visible iris and/or pupils',
                        'No visible' : 'No visible iris and/or pupils',
                        'No visible pupil or iris' : 'No visible iris and/or pupils',
                        'No iris' : 'No visible iris and/or pupils',
                        'No visible irises or pupils' : 'No visible iris and/or pupils'},
                        inplace = True)
df['Eye_Color'].unique()

array([nan, 'Blue', 'Red', 'Yellow', 'Brown', 'No eyes at all', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Amber', 'Violet', 'Silver',
       'No visible iris and/or pupils', 'Magenta', 'Compound',
       'Brown-grey'], dtype=object)

In [25]:
def plot_bar_box(cat):
    fig, ax = plt.subplots(nrows = 2, sharex = False, sharey = False, figsize = (15,15))
    fig.suptitle(f"Marvel Character's {cat}")
    order = list(df[cat].value_counts().index) # orders the categories by count
    sns.countplot(data = df, x = cat, order = order, ax = ax[0])
    sns.boxplot(data = df, x = cat, y = 'Appearances', order = order, ax = ax[1])
    plt.show()