# Working with the dataset

In [88]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("SuperheroDataset.csv")

In [134]:
print(df.columns)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Name', 'Url', 'Intelligence', 'Strength',
       'Speed', 'Durability', 'Power', 'Combat', 'Full name', 'Alter Egos',
       'Aliases', 'Place of birth', 'First appearance', 'Creator', 'Alignment',
       'Gender', 'Race', 'Height', 'Weight', 'Eye color', 'Hair color',
       'Occupation', 'Base', 'Team Affiliation', 'Relatives', 'Skin color',
       'Total Power'],
      dtype='object')


## Preprocessing the data
* extracting useful features 
* extracting alignment, useful as to determine a superhero is good or bad

In [155]:
df1 = df[df.columns[4:10]] # numerical values
df2 = df[df.columns[16:17]] # alignment
df3 = df[df.columns[17:18]] # Gender
df4 = df[df.columns[21:23]] # Eye and Hair Color

### New dataset
* df1, df3, df4 are inputs 
* combine df1, df3, df4
* df2 is the output

In [156]:
df1.reset_index(drop=True, inplace=True)
df3.reset_index(drop=True, inplace=True)
df4.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)

In [157]:
all_df = pd.concat([df1, df3, df4, df2], axis=1)

In [158]:
all_df = all_df.dropna()#drops all rows with NaN
all_df = all_df.reset_index(drop=True)
all_df

Unnamed: 0,Intelligence,Strength,Speed,Durability,Power,Combat,Gender,Eye color,Hair color,Alignment
0,80.0,35.0,45.0,35.0,25.0,55.0,Male,Brown,Grey,good
1,75.0,100.0,20.0,80.0,25.0,65.0,Male,Yellow,No Hair,good
2,95.0,30.0,35.0,65.0,100.0,85.0,Male,Blue,No Hair,good
3,80.0,90.0,55.0,65.0,100.0,65.0,Male,Blue,No Hair,good
4,85.0,80.0,55.0,90.0,65.0,95.0,Male,Green,No Hair,bad
5,100.0,100.0,85.0,100.0,100.0,55.0,Male,Blue,Black,bad
6,75.0,80.0,25.0,100.0,100.0,65.0,Male,Blue,No Hair,bad
7,85.0,10.0,15.0,100.0,100.0,65.0,Male,Blue,Blond,good
8,85.0,10.0,35.0,40.0,40.0,50.0,Male,Blue,Blond,good
9,80.0,40.0,45.0,50.0,45.0,85.0,Female,Blue,Blond,good


In [170]:
#extracting x[inputs] and y[output] from all_df
X = all_df[all_df.columns[:9]]
Y = all_df[all_df.columns[9:10]]

### One Hot Encode the Non-numerical data

In [93]:
from sklearn.preprocessing import LabelEncoder

In [94]:
#create fit encode funtion
def fit_encoder(some_df):
    #assign label function
    encoder = LabelEncoder()
    #fit the labels
    encoder.fit(some_df)
    #transforms into encodes
    some_df = encoder.transform(some_df)
    return some_df #return encodeable values

In [95]:
#create one-hot-encoding function
def one_hot_encode(labels):
    #labels count
    n_labels = len(labels)
    #unique labels count
    n_unique_labels = len(np.unique(labels))
    #one hot encode
    one_hot_encode = np.zeros((n_labels, n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

#### One hot encode the Output data

In [96]:
#y = fit_encoder(y)

In [171]:
#Y = one_hot_encode(y)
#using inbuilt method to one hot encode
encoder = LabelEncoder()
Y['Alignment'] = encoder.fit_transform(Y['Alignment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


#### One hot encode the data in inputs

In [172]:
X['Gender'] = encoder.fit_transform(X['Gender'])
X['Eye color'] = encoder.fit_transform(X['Eye color'])
X['Hair color'] = encoder.fit_transform(X['Hair color'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [173]:
#shape
print(X.shape)
print(Y.shape)

(665, 9)
(665, 1)


### Separating train and test data

In [174]:
from sklearn.model_selection import train_test_split 

In [175]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=415)

In [176]:
print(train_x.shape)
print(train_y.shape)

(532, 9)
(532, 1)


### Defining the model 
* MLP - multi-layer perceptron model

In [177]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

In [178]:
#define model
model = Sequential()
model.add(Dense(12, input_dim=9, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [179]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [180]:
model.fit(train_x, train_y, epochs=50, batch_size=10)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fadfc631630>

## Evaluate the model

In [181]:
_, accuracy = model.evaluate(test_x, test_y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 25.56


## Making predictions

In [184]:
predictions = model.predict(train_x)
rounded = [round(x[0]) for x in predictions]

In [185]:
predicts = model.predict_classes(train_x)

In [186]:
predicts

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    