In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [22]:
ds = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')

In [23]:
ds = ds.apply(LabelEncoder().fit_transform)
X = ds.loc[:, 'gender':'n15']
std = StandardScaler()
X = std.fit_transform(X)
y = ds['label']
trainX, testX, trainY, testY = train_test_split(X, y, test_size=.25, random_state=1927)

In [24]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

In [25]:
accuracy, precision, recall, auc = {}, {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(trainX, trainY)
    
    # Prediction 
    predictions = models[key].predict(testX)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = metrics.accuracy_score(predictions, testY)
    precision[key] = metrics.precision_score(predictions, testY)
    recall[key] = metrics.recall_score(predictions, testY)
    fpr, tpr, thresholds = metrics.roc_curve(predictions, testY)
    auc[key] = metrics.auc(fpr, tpr)

    print(key)

Logistic Regression




Support Vector Machines
Decision Trees
Random Forest
Naive Bayes
K-Nearest Neighbor


In [26]:
df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall', 'Auc'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()
df_model['Auc'] = auc.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall,Auc
Logistic Regression,0.880667,0.481356,0.709114,0.805831
Support Vector Machines,0.878689,0.458475,0.710907,0.8049
Decision Trees,0.817822,0.484746,0.45614,0.675899
Random Forest,0.884762,0.466949,0.746612,0.823724
Naive Bayes,0.848609,0.638136,0.538627,0.731745
K-Nearest Neighbor,0.867109,0.477119,0.634724,0.767556


In [27]:
cl = LinearSVC(dual=False)
cl.fit(X, y)

In [28]:
ids = final_test['id']
final_test = final_test.apply(LabelEncoder().fit_transform)
final_x = final_test.loc[:, 'gender':'n15']
final_x = std.fit_transform(final_x)

In [29]:
pred = cl.predict(final_x)
#pred = (pred > .5)

In [30]:
df_ans = pd.DataFrame(columns=['id', 'label'])
df_ans['id'] = ids
df_ans['id'] = df_ans['id'].astype('object')
df_ans['label'] = pred

df_ans

Unnamed: 0,id,label
0,b'gAAAAABinOi328DZcweGB4_nOyHA3Dy6o1YKYKyf3COx...,1
1,b'gAAAAABinOikutEIBjkUXl9lYTg4RI6jc4NfiMUCcVsn...,0
2,b'gAAAAABinOjBM70jBXOroAlUSq5lNXMd_oP0PU7jLQE5...,0
3,b'gAAAAABinOimitAnqlgOcqnD_LeNL3WEbXNGvjd3QVPi...,0
4,b'gAAAAABinOi3W9p3Oka5MV_dc2TeorZUcIWOnnODSx7E...,0
...,...,...
85060,b'gAAAAABinOjbnJVk2-nOVQsYB9p4DK26fTLLik_UR2H0...,0
85061,b'gAAAAABinOi7ixyXrlKYlx8D9i0-TIPD5elP2k-vuekn...,0
85062,b'gAAAAABinOi31zWSlD0OMhbBd3_weh7Kq6aPeO4yYqns...,0
85063,b'gAAAAABinOjIe7jFVk9k7jiH8Y3rdpUHDTZG2T2isunp...,1


In [33]:
sum(pred)

9436

In [32]:
df_ans.to_csv('answer.csv',index=False)