# Ghouls, Goblins, and Ghosts
## Multiclass Classification Task

Falconi Nicasio

April 23, 2019

In [1]:
# Read the training and test sets from the data directory.
import pandas as pd

train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [2]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical `color` feature as integer labels.
# The encoder is fitted on the training set only and then reused for the test
# set, so a given color always maps to the same integer in both frames.
# Re-fitting on the test set would silently renumber the codes whenever the
# two files do not contain exactly the same set of colors.
# NOTE: `transform` raises ValueError if the test set holds a color that never
# appears in the training data, which surfaces the mismatch instead of hiding it.
gle = LabelEncoder()
train_color_labels = gle.fit_transform(train_data['color'])
test_color_labels = gle.transform(test_data['color'])
train_data['color_labels'] = train_color_labels
test_data['color_labels'] = test_color_labels

In [3]:
# Keep the target aside, then discard the columns the model does not use:
# the row identifier and the raw color string (its encoded form lives in
# `color_labels`). The test set has no `type` column to remove.
Y = train_data['type']
train_data = train_data.drop(columns=['type', 'id', 'color'])
test_data = test_data.drop(columns=['id', 'color'])

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
train_data.describe()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color_labels
count,371.0,371.0,371.0,371.0,371.0
mean,0.43416,0.506848,0.529114,0.471392,3.404313
std,0.132833,0.146358,0.169902,0.176129,1.615259
min,0.061032,0.095687,0.1346,0.009402,0.0
25%,0.340006,0.414812,0.407428,0.348002,3.0
50%,0.434891,0.501552,0.538642,0.466372,3.0
75%,0.517223,0.603977,0.647244,0.60061,5.0
max,0.817001,0.932466,1.0,0.935721,5.0


In [5]:
# Pairwise correlation matrix of the feature columns.
train_data.corr()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color_labels
bone_length,1.0,-0.041716,0.353881,0.381675,-0.03383
rotting_flesh,-0.041716,1.0,-0.220353,-0.132051,-0.041714
hair_length,0.353881,-0.220353,1.0,0.474835,0.009093
has_soul,0.381675,-0.132051,0.474835,1.0,-0.025546
color_labels,-0.03383,-0.041714,0.009093,-0.025546,1.0


In [6]:
# standardize values
# The scaling statistics come from the training set only and are applied to
# both frames. Scaling the test set with its own mean/std would place it in a
# different feature space from the one the model is trained on.
# Both statistics are captured before `train_data` is overwritten below.
feature_means = train_data.mean()
feature_stds = train_data.std()
train_data = (train_data - feature_means) / feature_stds
test_data = (test_data - feature_means) / feature_stds

Cross-validate parameters for One vs Rest and One vs One

In [7]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base model: one L2-regularised logistic regression per class.
# `random_state` is pinned because the 'sag' solver in the grid below shuffles
# the data; without a fixed seed the search is not reproducible across runs.
model_to_set = OneVsRestClassifier(
    LogisticRegression(penalty = 'l2', max_iter = 1000, random_state = 0)
)

# The `estimator__` prefix routes each setting to the wrapped LogisticRegression.
parameters = {
    "estimator__C": [0.1, 0.5, 0.7, 1.0, 1.2, 2, 5, 10, 20, 100],
    "estimator__solver": ['newton-cg', 'lbfgs', 'sag'],
    "estimator__multi_class" : ['multinomial', 'ovr']
}

# Exhaustive search over the grid with 2-fold cross-validation.
model_tunning = GridSearchCV(model_to_set, param_grid=parameters,
                             cv = 2)

model_tunning.fit(train_data, Y)

# Best mean cross-validated score and the parameter combination that produced it.
print(model_tunning.best_score_)
print(model_tunning.best_params_)

0.738544474393531
{'estimator__C': 0.1, 'estimator__multi_class': 'ovr', 'estimator__solver': 'newton-cg'}


In [10]:
# Build the final model from the grid-search result instead of hand-copying
# the winning values, so it cannot drift from what was actually tuned. The
# base estimator matches the searched one (L2 penalty, max_iter=1000);
# `random_state` only matters if a stochastic solver such as 'sag' is selected.
ovr = OneVsRestClassifier(LogisticRegression(penalty = "l2", max_iter = 1000, random_state = 0))
ovr.set_params(**model_tunning.best_params_)

# Train on the full training set and predict the creature type for the test set.
ovr_fitted = ovr.fit(train_data, Y)
res = ovr_fitted.predict(test_data)

In [11]:
# Write the submission file: start from the sample submission template,
# replace its `type` column with the predictions, and save without the index.
# Predictions are matched to rows by position.
submission_template = pd.read_csv("../data/sample_submission.csv")
sample_data = submission_template.assign(type=res)
sample_data.to_csv('../prediction.csv', index=False)