In [1]:
#setup
import csv
import numpy
import pandas as pd
from matplotlib import pyplot
from sklearn.preprocessing import Binarizer
from numpy import set_printoptions
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix


# --- Load data and create a dataframe ----------------------------------------

# Fixed seed so the train/test split and the MLP weight initialisation are
# reproducible from one "Restart & Run All" to the next.
SEED = 42

filename = 'train_imperson_without4n7_balanced_data.csv'

df = pd.read_csv(filename, header=0, na_values=['nan'])
print(df.head())
df.info()  # info() prints its own report and returns None, so do not wrap it in print()
print(df.describe())

# --- Delete columns that carry no information --------------------------------

# A standard deviation of exactly 0 means the column holds one repeated value.
# Note: a column that is entirely NaN has std == NaN, which does not compare
# equal to 0, so such a column is kept by this filter.
# (DataFrame.iteritems() was removed in pandas 2.0; iterate the column labels.)
columnsToDelete = [key for key in df.columns if df[key].std() == 0]

df = df.drop(columnsToDelete, axis=1)
print(len(columnsToDelete), 'constant columns deleted')  # 74 in the recorded run

print(df.head())

# --- Build feature matrix and target -----------------------------------------

# Rename the remaining columns 1..n. The count is derived from the frame so
# the cell keeps working if a different number of columns survives the filter.
df.columns = list(range(1, df.shape[1] + 1))

X = df.iloc[:, :-1]  # every column except the last one is a feature
y = df.iloc[:, -1]   # the last column is the target

print(y.head())

# --- Train / evaluate ---------------------------------------------------------

# 75% training data, 25% test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

# Two hidden layers of 5 neurons each, at most 1000 training iterations.
mlp = MLPClassifier(hidden_layer_sizes=(5, 5), max_iter=1000, random_state=SEED)
mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)  # predictions on the held-out test set

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))


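# Only a handful of test samples are misclassified (see the confusion matrix). NOTE: the original remark read "7 out of 3", which cannot be correct -- TODO restate the exact counts from the printed matrix.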

   1  2  3         5         6         8         9  10  11  12  ...  146  147  \
0  0  0  0  0.000066  0.000066  0.009150  0.009150   0   0   0  ...  0.0    0   
1  0  0  0  0.000014  0.000014  0.000000  0.000000   0   0   0  ...  0.0    0   
2  0  0  0  0.035528  0.035528  0.070588  0.070588   0   0   0  ...  0.0    0   
3  0  0  0  0.005128  0.005128  0.094771  0.094771   0   0   0  ...  0.0    0   
4  0  0  0  0.035116  0.035116  0.070588  0.070588   0   0   0  ...  0.0    0   

   148  149  150  151  152  153  154  155  
0    0    0    0    0    0    0  0.0    0  
1    0    0    0    0    0    0  0.0    0  
2    0    0    0    0    0    0  0.0    0  
3    0    0    0    0    0    0  0.0    0  
4    0    0    0    0    0    0  0.0    0  

[5 rows x 153 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97044 entries, 0 to 97043
Columns: 153 entries, 1 to 155
dtypes: float64(48), int64(105)
memory usage: 113.3 MB
None
             1        2        3             5            

In [3]:
#save model using pickle
from pickle import dump

# Save the trained model to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; the bare open() used before was never closed.
# Reminder: only unpickle this file from a trusted location, because
# loading a pickle can execute arbitrary code.
modelname = 'my_model.sav'
with open(modelname, 'wb') as model_file:
    dump(mlp, model_file)