# EDA and Data Cleaning of Marvel Characters' Data

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [48]:
df = pd.read_csv('marvelscraping/characters.csv')

In [49]:
df.head()

Unnamed: 0,Name,Gender,Marital_Status,Height,Weight,Eye_Color,Hair_Color,Living_Status,Reality,Birthplace,Identity,Citizenship,First_Appearance,Appearances
0,6-Ball (Earth-616),Male,Single,,,,,Alive,Earth-616,,Secret,American,"July, 1991",1.0
1,6R (Earth-616),Agender,,,,,,Deceased,Earth-616,Sentinel Headquarters,,,"July, 1969",1.0
2,762 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,,Secret,,"June, 2010",1.0
3,627 (Skullbot) (Earth-12041),Agender,,,,Blue,No Hair At All,Deceased,Earth-12041,,Public,,"June 27, 2017",1.0
4,749 (Legion Personality) (Earth-616),Male,Single,,,,Black,Alive,Earth-616,Legion,Secret,,"June, 2011",1.0


In [50]:
df.shape

(74350, 14)

In [51]:
df.dtypes

Name                 object
Gender               object
Marital_Status       object
Height              float64
Weight              float64
Eye_Color            object
Hair_Color           object
Living_Status        object
Reality              object
Birthplace           object
Identity             object
Citizenship          object
First_Appearance     object
Appearances         float64
dtype: object

In [52]:
# Percentage of missing values in each column.
missing_counts = df.isna().sum()
missing_counts / len(df) * 100

Name                 0.000000
Gender               2.793544
Marital_Status      55.166106
Height              92.968393
Weight              93.513114
Eye_Color           54.711500
Hair_Color          25.973100
Living_Status        0.024210
Reality              0.217888
Birthplace          88.133154
Identity            14.707465
Citizenship         28.574311
First_Appearance     8.264963
Appearances          6.359112
dtype: float64

In [53]:
df['Gender'].unique()

array(['Male', 'Agender', 'Female', nan, 'Non-Binary', 'Genderfluid',
       'Male/Female', 'Gestalt', 'female', 'Single', 'Transgender'],
      dtype=object)

In [54]:
# Normalise the Gender labels: fix the lower-case 'female' and blank out the
# values 'Male/Female' and 'Single', which this notebook treats as unusable.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection: that form is chained assignment and does not update df
# when pandas Copy-on-Write is active.
gender_fixes = {'female': 'Female', 'Male/Female': np.nan, 'Single': np.nan}
df['Gender'] = df['Gender'].replace(gender_fixes)
df['Gender'].unique()

array(['Male', 'Agender', 'Female', nan, 'Non-Binary', 'Genderfluid',
       'Gestalt', 'Transgender'], dtype=object)

In [55]:
df['Marital_Status'].unique()

array(['Single', nan, 'Married', 'Widowed', 'Separated', 'Engaged',
       'Divorced', 'Estranged', 'Windowed', 'single', 'Unknown.', 'Blond',
       'Unnamed', 'Divorces', 'Unknown',
       'Married\x7f\'"`UNIQ--ref-00000009-QINU`"\'\x7f'], dtype=object)

In [56]:
# Normalise Marital_Status: correct misspellings and casing, strip the wiki
# reference residue attached to one 'Married' value, and turn labels that are
# not a marital status into NaN.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which does not update df under pandas Copy-on-Write.
marital_fixes = {
    'Windowed': 'Widowed',
    'single': 'Single',
    'Unknown.': np.nan,
    'Blond': np.nan,
    'Unnamed': np.nan,
    'Divorces': 'Divorced',
    'Unknown': np.nan,
    'Married\x7f\'"`UNIQ--ref-00000009-QINU`"\'\x7f': 'Married',
}
df['Marital_Status'] = df['Marital_Status'].replace(marital_fixes)
df['Marital_Status'].unique()

array(['Single', nan, 'Married', 'Widowed', 'Separated', 'Engaged',
       'Divorced', 'Estranged'], dtype=object)

In [57]:
df['Eye_Color'].unique()

array([nan, 'Blue', 'Red', 'Yellow', 'Brown', 'No Eyes At All', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Dark', 'Amber', 'Fair', 'Violet', 'Silver',
       'black', 'no visible pupils', 'Magenta', 'Compound', 'No Visible',
       'Brown-Grey', 'blue', 'yellow', 'brown',
       'no visible pupil or iris', 'red', 'No Iris', 'white',
       'No visible Irises or Pupils', 'hazel'], dtype=object)

In [58]:
# Unify the casing of the eye colours. str() also stringifies missing values,
# so NaN shows up as the literal 'Nan' after this step.
df['Eye_Color'] = df['Eye_Color'].map(lambda value: str(value).capitalize())
df['Eye_Color'].unique()

array(['Nan', 'Blue', 'Red', 'Yellow', 'Brown', 'No eyes at all', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Dark', 'Amber', 'Fair', 'Violet', 'Silver',
       'No visible pupils', 'Magenta', 'Compound', 'No visible',
       'Brown-grey', 'No visible pupil or iris', 'No iris',
       'No visible irises or pupils'], dtype=object)

In [59]:
# Collapse the different "no visible iris/pupil" spellings into one label and
# turn the stringified 'Nan' plus the non-colour values 'Fair' and 'Dark' back
# into real missing values.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which does not update df under pandas Copy-on-Write.
NO_VISIBLE_EYES = 'No visible iris and/or pupils'
eye_fixes = {
    'Nan': np.nan,
    'Fair': np.nan,
    'Dark': np.nan,
    'No visible pupils': NO_VISIBLE_EYES,
    'No visible': NO_VISIBLE_EYES,
    'No visible pupil or iris': NO_VISIBLE_EYES,
    'No iris': NO_VISIBLE_EYES,
    'No visible irises or pupils': NO_VISIBLE_EYES,
}
df['Eye_Color'] = df['Eye_Color'].replace(eye_fixes)
df['Eye_Color'].unique()

array([nan, 'Blue', 'Red', 'Yellow', 'Brown', 'No eyes at all', 'Grey',
       'White', 'Black', 'Hazel', 'Green', 'Gold', 'Purple', 'Pink',
       'Orange', 'Variable', 'Amber', 'Violet', 'Silver',
       'No visible iris and/or pupils', 'Magenta', 'Compound',
       'Brown-grey'], dtype=object)

In [60]:
df['Hair_Color'].unique()

array([nan, 'Black', 'No Hair At All', 'Blond', 'Bald', 'Grey', 'Brown',
       'White', '[1]', 'Red', 'Pink', 'Green', 'Auburn', 'Gold', 'Purple',
       'Blue', 'Strawberry Blond', 'Silver', 'Orange', 'Dyed', 'Variable',
       'Yellow', 'Shaved', 'Light Brown', 'green', 'Magenta', '[3]',
       'red', 'Platinum Blond', '[2]', 'bald', 'Hepzibah', 'black',
       'grey', 'brown', '[22]', '[37]', '[4]', '[11]', 'white', 'blue',
       '[9]', 'green hair', 'Orange-brown', 'dyed', 'pink', 'shaved'],
      dtype=object)

In [61]:
# Unify the casing of the hair colours. str() also stringifies missing values,
# so NaN shows up as the literal 'Nan' after this step.
df['Hair_Color'] = df['Hair_Color'].map(lambda value: str(value).capitalize())
df['Hair_Color'].unique()

array(['Nan', 'Black', 'No hair at all', 'Blond', 'Bald', 'Grey', 'Brown',
       'White', '[1]', 'Red', 'Pink', 'Green', 'Auburn', 'Gold', 'Purple',
       'Blue', 'Strawberry blond', 'Silver', 'Orange', 'Dyed', 'Variable',
       'Yellow', 'Shaved', 'Light brown', 'Magenta', '[3]',
       'Platinum blond', '[2]', 'Hepzibah', '[22]', '[37]', '[4]', '[11]',
       '[9]', 'Green hair', 'Orange-brown'], dtype=object)

In [62]:
# Values that are not hair colours: the stringified 'Nan', bracketed numbers
# (apparently leftover reference markers from scraping) and 'Hepzibah'.
hair_noise = ['Nan', '[1]', '[3]', '[2]', 'Hepzibah',
              '[22]', '[37]', '[4]', '[11]', '[9]']
hair_fixes = {label: np.nan for label in hair_noise}
hair_fixes['Green hair'] = 'Green'
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which does not update df under pandas Copy-on-Write.
df['Hair_Color'] = df['Hair_Color'].replace(hair_fixes)
df['Hair_Color'].unique()

array([nan, 'Black', 'No hair at all', 'Blond', 'Bald', 'Grey', 'Brown',
       'White', 'Red', 'Pink', 'Green', 'Auburn', 'Gold', 'Purple',
       'Blue', 'Strawberry blond', 'Silver', 'Orange', 'Dyed', 'Variable',
       'Yellow', 'Shaved', 'Light brown', 'Magenta', 'Platinum blond',
       'Orange-brown'], dtype=object)

In [63]:
df['Living_Status'].unique()

array(['Alive', 'Deceased', nan], dtype=object)

In [64]:
df['Reality'].unique()

array(['Earth-616', 'Earth-12041', 'Earth-12772', ..., 'Earth-89947',
       'Earth-14114', 'Earth-97449'], dtype=object)

In [65]:
# Realities whose name does not follow the 'Earth-<id>' pattern.
[name for name in df['Reality'].unique().tolist() if not str(name).startswith('Earth-')]

['Multiverse',
 'Ideaverse',
 'Unknown Reality',
 'Otherworld',
 nan,
 'Land of Cancelled Heroes',
 'Mojoverse',
 'Omniverse',
 'Styrakos',
 'First Cosmos',
 'Brilliant City',
 'Utopian Parallel',
 'Elsewhen']

In [66]:
# 'Unknown Reality' is a placeholder, so store it as a missing value.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which does not update df under pandas Copy-on-Write.
df['Reality'] = df['Reality'].replace({'Unknown Reality': np.nan})
[reality for reality in list(df['Reality'].unique()) if str(reality)[:6] != 'Earth-']

['Multiverse',
 'Ideaverse',
 nan,
 'Otherworld',
 'Land of Cancelled Heroes',
 'Mojoverse',
 'Omniverse',
 'Styrakos',
 'First Cosmos',
 'Brilliant City',
 'Utopian Parallel',
 'Elsewhen']

In [67]:
df['Birthplace'].unique()

array([nan, 'Sentinel Headquarters', 'Legion', ..., 'Ghudaza', 'Sumeria',
       'Meroê'], dtype=object)

In [68]:
df['Birthplace'].nunique()

1246

In [69]:
df['Identity'].unique()

array(['Secret', nan, 'Public', 'No Dual', 'Known to Authorities',
       'No  Dual', 'Human', '[1]'], dtype=object)

In [70]:
# Merge the two 'No Dual' spellings (single and double space) into one label
# and blank out the values 'Human' and '[1]', which are not identity types.
# The result is assigned back rather than using replace(inplace=True) on the
# column selection, which does not update df under pandas Copy-on-Write.
identity_fixes = {
    'No Dual': 'No Dual Identity',
    'No  Dual': 'No Dual Identity',
    'Human': np.nan,
    '[1]': np.nan,
}
df['Identity'] = df['Identity'].replace(identity_fixes)
df['Identity'].unique()

array(['Secret', nan, 'Public', 'No Dual Identity',
       'Known to Authorities'], dtype=object)

In [71]:
df['Citizenship'].unique()

array(['American', nan, 'Chronicoms', 'Turanian', 'Afghan',
       'Saudi Arabian', 'Belgian', 'British', 'Dutch', 'Polish', 'German',
       'Canadian', 'Canada', 'Latverian', 'Atlantean', 'Scottish',
       'English', 'Krakoan', 'South African', 'Wakandan', 'Egyptian',
       'Azerbaijani', 'Zamoran', 'Aqiria', 'Saudi Arabia', 'Indian',
       'Aakon', 'Vietnamese', 'Austrian', 'Japanese', 'Xandarian',
       'New Canaanite', 'Ethiopia', 'Tanzanian', 'African', 'Germany',
       'Alfheim', 'Kymellian', 'Spanish', 'Undying', 'Kree Empire',
       'Mycenaean', 'Greek', 'Symbiotes', 'Attilan', 'Australian',
       'French', 'Asgardian', 'Vanaheim', 'Jotunheim', 'Irish',
       "Shi'ar Empire", 'Nemedian', 'Skrull Empire', 'Italian', 'Polaria',
       'Japan', 'Wakanda', 'Shemite', 'Stygian', 'United States',
       'Southern Isles', 'Argive', 'Olympia', 'Kenya', 'Norway',
       'Madripoor', 'Deviant Lemuria', 'Mayan', 'Xibalba', 'Halwan',
       'Rejects', 'Breakworldians', 'The Hive',

In [72]:
df['Citizenship'].nunique()

732

In [73]:
# Trim surrounding whitespace. str() turns missing values into the text 'nan',
# so that text is mapped back to a real NaN in the same chain.
# The chained replace() result is assigned back rather than using
# replace(inplace=True) on the column selection, which does not update df
# under pandas Copy-on-Write.
df['Citizenship'] = (
    df['Citizenship']
    .apply(lambda x: str(x).strip())
    .replace({'nan': np.nan})
)
df['Citizenship'].nunique()

732

In [74]:
df['First_Appearance'].unique()

array(['July, 1991', 'July, 1969', 'June, 2010', ..., 'Damballah',
       'July 22, 2018', 'April 15, 2017'], dtype=object)

In [75]:
df['First_Appearance'].nunique()

2167

In [76]:
# Trim surrounding whitespace. str() turns missing values into the text 'nan',
# so that text is mapped back to a real NaN in the same chain.
# The chained replace() result is assigned back rather than using
# replace(inplace=True) on the column selection, which does not update df
# under pandas Copy-on-Write.
df['First_Appearance'] = (
    df['First_Appearance']
    .apply(lambda x: str(x).strip())
    .replace({'nan': np.nan})
)
df['First_Appearance'].nunique()

2167

In [77]:
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
# Also accept the "Month," spelling used by entries such as 'July, 1991'.
months += [f'{name},' for name in months]
# First-appearance values whose first word is not a month name.
[entry for entry in df['First_Appearance'].unique().tolist() if str(entry).split()[0] not in months]

[nan,
 'The Huntsman',
 'S2E04',
 'S1E08',
 'Yazatas',
 'S1E13',
 'S2E01',
 '2000',
 'Black Widow: Forever Red',
 'Amalgam Comics (Trading Cards)',
 '2001',
 'Ronin',
 'Monolith',
 'Khonshu',
 'Unknown',
 'S1E12',
 '1977',
 '1984',
 'S1E05',
 'Shuma-Gorath',
 'S1E03',
 '1980',
 'Iron Man',
 'The Bent Bullet Report',
 'S2E03',
 'Army of Darkness: Ashes 2 Ashes #1',
 'Demogorge',
 'Inhumans',
 'S1E04',
 'S1E21',
 'Totem',
 '2010',
 '1967',
 'S2E09',
 'Spider-Man',
 '1993',
 'Age of X Communiques: Berserker, Hellion, Nightmare',
 'S2E02',
 'S1E19',
 'Wolverine: Weapon X Vol 1 12',
 '2009',
 '2008',
 '2015',
 'S2E15',
 'Odin',
 'S2E16',
 '2014',
 'S3E12',
 'S1E25',
 'Breaker-Apart',
 '2003',
 'S1E02',
 'Loki',
 'S2E14',
 'Poisons',
 '1988',
 'S1E11',
 '#5',
 'S2E06',
 'Liberty Scouts Comics #2',
 '2005',
 'Apollo',
 'Cormac of Connacht',
 'Spider-Man: Arachnophobia',
 '2002',
 'S2E08',
 'S1E06',
 'S2E18',
 'Horus',
 '2007',
 'Age of X Communiques: Frenzy, Magma, Cypher',
 '(Vol. 4) #19',
 

In [78]:
def extract_year(entry, min_year=1900):
    """Return the year found at the end of a first-appearance entry.

    The last whitespace-separated token of ``entry`` is parsed as an integer
    and accepted only when it is at least ``min_year``; this rejects small
    trailing numbers such as the ``12`` in ``'Wolverine: Weapon X Vol 1 12'``.

    Parameters
    ----------
    entry : str or missing value
        Raw text, e.g. ``'July, 1991'``, ``'July 22, 2018'`` or ``'2000'``.
    min_year : int, default 1900
        Smallest integer accepted as a year.

    Returns
    -------
    int or float
        The year as an ``int``, or ``np.nan`` when no acceptable year is found.
    """
    try:
        year = int(entry.split()[-1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: entry is not a string (e.g. NaN / None).
        # IndexError: empty or whitespace-only string.
        # TypeError / ValueError: last token is not an integer literal.
        return np.nan
    return year if year >= min_year else np.nan

In [79]:
# Spot-check extract_year on a bare year, a month-and-year string, a full date,
# a missing value and a non-date string.
extract_year('2000'), extract_year('July, 1991'), extract_year('July 22, 2018'), extract_year(np.nan), extract_year('(Vol. 2) #2')

(2000, 1991, 2018, nan, nan)

In [80]:
# Derive the year of introduction from the free-text First_Appearance column.
df['Year_Introduced'] = df['First_Appearance'].map(extract_year)
df['Year_Introduced'].unique()

array([1991., 1969., 2010., 2017., 2011., 2015., 1997., 1993., 1994.,
       2019., 1995., 1999., 2013.,   nan, 1984., 1977., 2020., 1953.,
       1980., 1989., 2003., 2006., 2008., 2018., 2014., 2012., 2007.,
       1967., 1965., 1992., 2005., 2004., 2001., 1974., 1987., 1996.,
       1985., 1972., 2009., 1990., 1955., 1982., 1976., 2000., 1986.,
       1968., 2021., 1988., 1940., 1998., 1978., 1947., 1945., 1975.,
       1983., 1964., 1943., 1941., 2016., 1981., 1970., 1962., 2002.,
       1944., 1942., 1979., 1954., 1973., 1963., 1948., 1961., 1966.,
       1971., 1952., 1960., 1957., 1946., 1939., 1950., 1958., 1956.,
       1959., 1951., 1949., 1936.])

In [81]:
# First_Appearance has been condensed into Year_Introduced, so drop it.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
df = df.drop(columns='First_Appearance', errors='ignore')
df.shape

(74350, 14)

In [None]:
# TODO: Birthplace and Citizenship have not been cleaned yet (e.g. Citizenship still mixes 'Canada' and 'Canadian').

In [82]:
def plot_bar_box(cat):
    """Plot the distribution of a categorical column of the global ``df``.

    Draws two stacked panels on one 15x15 figure: a count plot of ``cat`` on
    top and, below it, a box plot of ``Appearances`` grouped by ``cat``.
    Both panels list the categories in the order given by ``value_counts()``.

    Parameters
    ----------
    cat : str
        Name of the column in ``df`` to plot.
    """
    figure, (count_ax, box_ax) = plt.subplots(
        nrows=2, sharex=False, sharey=False, figsize=(15, 15)
    )
    figure.suptitle(f"Marvel Character's {cat}")
    # Use the value_counts() ordering for the x axis of both panels.
    category_order = list(df[cat].value_counts().index)
    sns.countplot(data=df, x=cat, order=category_order, ax=count_ax)
    sns.boxplot(data=df, x=cat, y='Appearances', order=category_order, ax=box_ax)
    plt.show()