In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.plotly as ply
import plotly.offline as py
# Snapshot of seaborn's current color palette.
# NOTE(review): `color` is not referenced by any cell visible here — confirm it is still needed.
color = sns.color_palette()
import plotly.graph_objs as go
from plotly import tools
%matplotlib inline

In [2]:
# Activate plotly's offline notebook renderer before any figure is drawn.
py.init_notebook_mode(connected=True)

import warnings

# Blanket-suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [3]:
# Read the per-hero attribute table (name, gender, race, publisher, ...).
HERO_INFO_CSV = '../data/superhero/heroes_information.csv'
superhero_info = pd.read_csv(HERO_INFO_CSV)

In [4]:
# Read the superhero powers table.
# NOTE(review): no cell visible here uses `superhero_power` — confirm it is needed.
HERO_POWERS_CSV = '../data/superhero/super_hero_powers.csv'
superhero_power = pd.read_csv(HERO_POWERS_CSV)

In [5]:
# Discard the leading index-like column that came in with the CSV.
# The drop is guarded so the cell is safe to re-run: once the frame already
# starts with 'name' (as the preview below shows) there is nothing left to
# remove, whereas an unconditional positional drop would strip a real column
# on every extra execution.
if superhero_info.columns[0] != 'name':
    superhero_info = superhero_info.drop(columns=superhero_info.columns[0])

In [6]:
# Preview the first five rows to sanity-check the columns.
superhero_info.head(n=5)

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
0,A-Bomb,Male,yellow,Human,No Hair,203.0,Marvel Comics,-,good,441.0
1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,191.0,Dark Horse Comics,blue,good,65.0
2,Abin Sur,Male,blue,Ungaran,No Hair,185.0,DC Comics,red,good,90.0
3,Abomination,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,-,bad,441.0
4,Abraxas,Male,blue,Cosmic Entity,Black,-99.0,Marvel Comics,-,bad,-99.0


### Playing with hero information

In [7]:
# Number of heroes per publisher (value_counts orders by descending count).
publish = superhero_info.loc[:, 'Publisher'].value_counts()

In [8]:
# Donut chart of how many heroes each publisher contributes.
trace = go.Pie(
    labels=publish.index.tolist(),
    values=publish.tolist(),
    hole=0.4,
)
layout = go.Layout(
    title='Publications Distributions (Comic wise)',
    height=700,
    width=700,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
# Render with the offline renderer set up by py.init_notebook_mode(...), the
# same way the final cell of the notebook does. ply.iplot targets the hosted
# plotly service instead and needs account credentials.
py.iplot(fig, filename='Comic_wise_distribution')

The DC and Marvel universes appear to have more superheroes than any other publisher. Let's plot a pie chart comparing those two against all the others combined.

In [9]:
# Turn the publisher counts into a two-column frame: 'type' holds the
# publisher name and 'value' the number of heroes, in the same order as `publish`.
publisher = publish.rename_axis('type').reset_index(name='value')

In [10]:
# Donut chart of the two largest publishers against everything else.
# `publisher` is ordered by descending count, so its first two rows are the
# two biggest publishers; all remaining rows are summed into one 'Others' slice.
top_two = publisher.iloc[:2]
others_total = publisher['value'].iloc[2:].sum()

trace = go.Pie(
    labels=top_two['type'].tolist() + ['Others'],
    values=top_two['value'].tolist() + [others_total],
    hole=0.4,
)
layout = go.Layout(
    title='Publications Distributions (DC, Marvel and Others)',
    height=700,
    width=700,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Comic_wise_distribution(DC, Marvels and others)')

### Let's look at the gender distribution of superheroes

In [11]:
# Raw gender tallies, before the '-' placeholder is relabelled.
superhero_info.loc[:, 'Gender'].value_counts()

Male      505
Female    200
-          29
Name: Gender, dtype: int64

In [12]:
# Relabel the '-' entries (apparently the dataset's missing-value marker) as 'Other'.
# The result is assigned back to the column explicitly: replace(..., inplace=True)
# on a Series pulled out of the frame is a chained in-place update, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
superhero_info['Gender'] = superhero_info['Gender'].replace(to_replace='-', value='Other')

In [13]:
# Gender tallies after relabelling, most frequent first.
gender = superhero_info.loc[:, 'Gender'].value_counts()

In [14]:
# Donut chart of the overall gender split.
trace = go.Pie(
    labels=gender.index.tolist(),
    values=gender.tolist(),
    hole=0.4,
)
layout = go.Layout(
    title='Gender wise Distribution',
    height=500,
    width=500,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Gender wise')

Males seem to dominate the superhero universe.

### I think Alignment can reveal some more insights

In [15]:
# Raw alignment tallies, including the '-' placeholder rows.
superhero_info.loc[:, 'Alignment'].value_counts()

good       496
bad        207
neutral     24
-            7
Name: Alignment, dtype: int64

In [16]:
# Inspect the characters whose alignment is recorded only as '-'.
superhero_info.loc[superhero_info['Alignment'].eq('-')]

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight
33,Anti-Venom,Male,blue,Symbiote,Blond,229.0,Marvel Comics,-,-,358.0
110,Blackwulf,Male,red,Alien,White,188.0,Marvel Comics,-,-,88.0
138,Brundlefly,Male,-,Mutant,-,193.0,,-,-,-99.0
426,Man of Miracles,Other,blue,God / Eternal,Silver,-99.0,Image Comics,-,-,-99.0
535,Q,Male,-,God / Eternal,-,-99.0,Star Trek,-,-,-99.0
676,Trickster,Male,blue,Human,Blond,183.0,DC Comics,-,-,81.0
692,Venompool,Male,-,Symbiote,-,226.0,Marvel Comics,-,-,-99.0


**Venom and Trickster are not exactly villains but rather anti-heroes, so I will replace '-' with 'anti-hero'.**

In [17]:
# Relabel the '-' alignment entries as 'anti-hero'.
# The result is assigned back to the column explicitly: replace(..., inplace=True)
# on a Series pulled out of the frame is a chained in-place update, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
superhero_info['Alignment'] = superhero_info['Alignment'].replace(to_replace='-', value='anti-hero')

In [18]:
# Donut chart of the alignment split. The tally is computed once so labels and
# values are guaranteed to come from the same ordering.
alignment_counts = superhero_info['Alignment'].value_counts()

trace = go.Pie(
    labels=alignment_counts.index.tolist(),
    values=alignment_counts.tolist(),
    hole=0.4,
)
layout = go.Layout(
    title='Alignment of Superheroes',
    height=500,
    width=500,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Alignment pie')

In [19]:
# Gender head-counts inside each alignment group.
heroes_gender = superhero_info.loc[superhero_info['Alignment'] == 'good', 'Gender'].value_counts()
villian_gender = superhero_info.loc[superhero_info['Alignment'] == 'bad', 'Gender'].value_counts()
neutral_gender = superhero_info.loc[superhero_info['Alignment'] == 'neutral', 'Gender'].value_counts()
anti_hero_gender = superhero_info.loc[superhero_info['Alignment'] == 'anti-hero', 'Gender'].value_counts()


def _gender_share_pie(counts, name, x_domain, y_domain):
    """Build one donut trace showing each gender's percentage share of `counts`.

    counts   -- Series of head-counts indexed by gender label.
    name     -- trace name.
    x_domain -- [start, end] horizontal extent of the donut within the figure.
    y_domain -- [start, end] vertical extent of the donut within the figure.
    """
    return {
        "labels": list(counts.index),
        "values": list(counts / counts.sum() * 100),
        "type": "pie",
        "hole": 0.4,
        "name": name,
        "domain": {'x': x_domain, 'y': y_domain},
        "textinfo": "label",
    }


def _panel_caption(text, x, y):
    """Build the arrow-less, size-20 caption annotation placed at (x, y)."""
    return {
        "font": {"size": 20},
        "showarrow": False,
        "text": text,
        "x": x,
        "y": y,
    }


# 2x2 grid of donuts: heroes and villains on top, neutral and anti-hero below.
fig = {
    "data": [
        _gender_share_pie(heroes_gender, "Heroes", [0, 0.48], [0.51, 1]),
        _gender_share_pie(villian_gender, "Villians", [0.52, 1], [0.51, 1]),
        _gender_share_pie(neutral_gender, "Neutral characters", [0, 0.48], [0, 0.49]),
        _gender_share_pie(anti_hero_gender, "Anti-hero characters", [0.52, 1], [0, 0.49]),
    ],
    "layout": {
        "title": "Gender Distribution (Alignment wise)",
        "annotations": [
            _panel_caption("Heroes", 0, 1),
            _panel_caption("Villain", 0.6, 1),
            _panel_caption("Neutral", 0, 0.5),
            _panel_caption("Anti-Hero", 0.6, 0.5),
        ],
    },
}

# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Gender distribution')


### Distribution of Superheroes by Race

In [36]:
# Five most common race labels.
# NOTE(review): the saved output lists 'Unknown', a label that only exists
# after the '-' -> 'Unknown' replacement in the next cell, so this cell was
# last executed out of order.
superhero_info.loc[:, 'Race'].value_counts().head(n=5)

Unknown          304
Human            208
Mutant            63
God / Eternal     14
Cyborg            11
Name: Race, dtype: int64

In [21]:
# Relabel the '-' race entries as 'Unknown'.
# The result is assigned back to the column explicitly: replace(..., inplace=True)
# on a Series pulled out of the frame is a chained in-place update, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write.
superhero_info['Race'] = superhero_info['Race'].replace(to_replace='-', value='Unknown')

In [22]:
# Race column restricted to heroes whose race is not 'Unknown'.
known_race = superhero_info['Race'] != 'Unknown'
race_df = superhero_info.loc[known_race, 'Race']

In [23]:
# Head-counts of the 20 most common known races, computed once and reused for
# the axis labels, bar heights and bar colouring.
top_races = race_df.value_counts().head(20)

trace = go.Bar(
    x=top_races.index,
    y=top_races,
    name='Races',
    marker={
        'color': top_races,
        'colorscale': 'Viridis'
    }
)

layout = go.Layout(
    title='Distribution of heroes across races (Top 20)',
    # 'bar' is not one of plotly's barmode values (stack/group/overlay/relative)
    # and is rejected by validating plotly versions; 'group' is the default.
    barmode='group'
)

fig = go.Figure(data=[trace], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Distribution by Race')


In [24]:
# Two-way flag per hero: keep 'No Hair' as is, collapse every other value to 'Hair'.
hair_column = superhero_info['Hair color']
bald_or_not = hair_column.mask(hair_column != "No Hair", 'Hair')

In [25]:
# Bar chart of heroes with hair versus without; the tally is computed once so
# the x labels and y heights share one ordering.
bald_counts = bald_or_not.value_counts()

trace = go.Bar(
    x=bald_counts.index,
    y=bald_counts,
    name='Bald or Not',
    marker={
        # One colour per bar, in the order of `bald_counts`.
        "color": ['rgba(0,0,0,0.8)', 'rgba(192,0,192,0.9)']
    }
)

layout = go.Layout(
    title='Hair vs No-Hair',
    # 'bar' is not one of plotly's barmode values (stack/group/overlay/relative)
    # and is rejected by validating plotly versions; 'group' is the default.
    barmode='group'
)

fig = go.Figure(data=[trace], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Hair vs No-Hair')

In [26]:
# List every distinct hair-colour label present in the data.
pd.unique(superhero_info['Hair color'])

array(['No Hair', 'Black', 'Blond', 'Brown', '-', 'White', 'Purple',
       'Orange', 'Pink', 'Red', 'Auburn', 'Strawberry Blond', 'black',
       'Blue', 'Green', 'Magenta', 'Brown / Black', 'Brown / White',
       'blond', 'Silver', 'Red / Grey', 'Grey', 'Orange / White',
       'Yellow', 'Brownn', 'Gold', 'Red / Orange', 'Indigo',
       'Red / White', 'Black / Blue'], dtype=object)

In [27]:
# Hair-colour column without the 'No Hair' and '-' entries.
# NOTE(review): the unique() output above shows case/typo variants
# ('black', 'blond', 'Brownn') that are tallied as separate colours here.
has_hair_color = ~superhero_info['Hair color'].isin(['No Hair', '-'])
hair_df = superhero_info.loc[has_hair_color, 'Hair color']

In [28]:
# Head-counts of the 10 most common hair colours, computed once and reused for
# the axis labels, bar heights and bar colouring.
top_hair_colors = hair_df.value_counts().head(10)

data = go.Bar(
    x=top_hair_colors.index,
    y=top_hair_colors,
    name='Hair colors',
    marker={
        'color': top_hair_colors,
        'colorscale': 'Viridis'
    }
)

layout = go.Layout(
    title='Distribution by Hair Colors',
    # 'bar' is not one of plotly's barmode values (stack/group/overlay/relative)
    # and is rejected by validating plotly versions; 'group' is the default.
    barmode='group'
)

fig = go.Figure(data=[data], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Hair color distribution')

In [29]:
# Donut chart of the 10 most common hair colours, labelled with raw counts.
top_hair_colors = hair_df.value_counts().head(10)

# Slice colours are applied by position, i.e. in the order of `top_hair_colors`.
# NOTE(review): the list is longer than the 10 slices drawn, and the pairing
# with labels depends on the value_counts ordering — verify the slices get the
# intended colours.
hair_slice_colors = [
    'rgba(0,0,0,0.8)',
    'rgba(243,243,164,0.8)',
    'rgba(165,104,42,0.7)',
    'rgba(255,0,0,0.9)',
    'rgba(255,255,255,1)',
    'rgba(165,42,42,0.9)',
    'rgba(0,255,0,0.7)',
    'rgba(165,88,29,0.7)',
    'rgba(128,128,128,0.9)',
    'rgba(128,0,128,0.8)',
    'rgba(216,210,181,1)',
    'rgba(192,192,192,0.5)',
    'rgba(0,0,255,0.8)',
    'rgba(255,255,0,1)',
    'rgba(255,165,0,0.6)',
    'rgba(251,223,214,0.8)',
    'rgba(75,0,130,1)',
    'rgba(60,42,8,1)',
    'rgba(255,0,255,1)',
    'rgba(116,68,56,1)',
    'rgba(20,28,27,1)',
    'rgba(255,215,0,0.9)',
    'rgba(255,64,0,1)',
    'rgba(251,194,123,0.7)',
    'rgba(255,192,203,1)',
]

data = go.Pie(
    labels=top_hair_colors.index,
    values=top_hair_colors,
    hole=0.4,
    textinfo='value',
    marker=dict(
        colors=hair_slice_colors,
        line=dict(
            color='rgb(8,48,107)',
            width=0.3)),
)

layout = go.Layout(
    title='Distributions by Hair color',
    height=750,
    width=750
)

fig = go.Figure(data=[data], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='hair color distribution pie chart')

In [30]:
# Raw skin-colour tallies, including the '-' placeholder rows.
superhero_info.loc[:, 'Skin color'].value_counts()

-                 662
green              21
blue                9
red                 9
white               7
silver              5
grey                5
purple              3
gold                3
pink                2
yellow              2
blue-white          1
gray                1
orange / white      1
red / black         1
black               1
orange              1
Name: Skin color, dtype: int64

In [31]:
# Skin-colour tallies for heroes whose skin colour is not the '-' placeholder.
has_skin_color = ~superhero_info['Skin color'].isin(['-'])
valid_skin_color = superhero_info.loc[has_skin_color, 'Skin color'].value_counts()

In [32]:
# Display the filtered skin-colour tallies (the '-' rows are excluded).
valid_skin_color

green             21
red                9
blue               9
white              7
silver             5
grey               5
gold               3
purple             3
yellow             2
pink               2
orange / white     1
gray               1
orange             1
black              1
red / black        1
blue-white         1
Name: Skin color, dtype: int64

In [33]:
# Bar chart of the 10 most common skin colours.
# x, y and the marker colouring all come from the same 10-row slice; the
# previous version paired all 16 labels (and 16 colour values) with only the
# first 10 heights, so the arrays did not line up.
top_skin_colors = valid_skin_color.head(10)

data = go.Bar(
    x=top_skin_colors.index,
    y=top_skin_colors,
    name='Skin colors',
    marker={
        'color': top_skin_colors,
        'colorscale': 'Viridis'
    }
)

layout = go.Layout(
    title='Skin Colors',
    # 'bar' is not one of plotly's barmode values (stack/group/overlay/relative)
    # and is rejected by validating plotly versions; 'group' is the default.
    barmode='group'
)

fig = go.Figure(data=[data], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Skin color distribution')

In [34]:
# Donut chart of skin colours, each slice tinted with (roughly) the colour it names.
#
# Colours are looked up by label rather than by position. value_counts breaks
# ties in no guaranteed order, and the tally shown above lists green, red,
# blue while the positional list was green, blue, red, so slices were tinted
# with the wrong colour. The grey entry also carried a stray ')' that made it
# an invalid colour string.
# NOTE(review): the label -> colour pairing is inferred from the RGB values of
# the original list — confirm the ambiguous ones ('orange', 'orange / white',
# 'gray', 'pink').
SKIN_COLOR_RGBA = {
    'green': 'rgba(34, 226, 20,0.9)',
    'blue': 'rgba(0,0,255,0.8)',
    'red': 'rgba(255,0,0,1)',
    'white': 'rgba(255,255,255,0.5)',
    'grey': 'rgba(128,128,128,0.8)',
    'silver': 'rgba(192,192,192,1)',
    'purple': 'rgba(128,0,128,0.8)',
    'gold': 'rgba(255,215,0,1)',
    'yellow': 'rgba(255,255,0,0.7)',
    'pink': 'rgba(255,0,0,0.5)',
    'blue-white': 'rgba(0,255,255,0.6)',
    'orange': 'rgba(255,165,0,0.7)',
    'red / black': 'rgba(128,0,0,0.9)',
    'gray': 'rgba(128,128,128,0.6)',
    'orange / white': 'rgba(255,140,0,0.8)',
    'black': 'rgba(0,0,0,0.8)',
}
# Neutral tint for any label that has no entry in the mapping.
FALLBACK_SKIN_RGBA = 'rgba(200,200,200,0.5)'

skin_labels = list(valid_skin_color.index)
skin_slice_colors = [SKIN_COLOR_RGBA.get(label, FALLBACK_SKIN_RGBA) for label in skin_labels]

data = go.Pie(
    labels=skin_labels,
    values=list(valid_skin_color),
    hole=0.4,
    marker=dict(
        colors=skin_slice_colors,
        line=dict(
            color='rgb(8,48,107)',
            width=0.5)),
)
layout = go.Layout(
    title='Skin Color Distribution',
    height=700,
    width=700
)
fig = go.Figure(data=[data], layout=layout)
# Offline rendering, consistent with py.init_notebook_mode(...) at the top;
# ply.iplot would push the figure to the hosted plotly service.
py.iplot(fig, filename='Skin color pie chart')

In [48]:
# Eye-colour tallies for heroes whose eye colour is not the '-' placeholder.
has_eye_color = ~superhero_info['Eye color'].isin(['-'])
valid_eye = superhero_info.loc[has_eye_color, 'Eye color'].value_counts()

In [50]:
# Donut chart of eye colours. No slice colours are specified; only the thin
# outline around each slice is customised.
slice_outline = dict(color='rgb(8,48,107)', width=0.5)

data = go.Pie(
    labels=valid_eye.index,
    values=valid_eye,
    hole=0.4,
    marker=dict(line=slice_outline),
)
layout = go.Layout(title='Eye Color Distribution', height=700, width=700)
fig = go.Figure(data=[data], layout=layout)
py.iplot(fig, filename='Eye color pie chart')