In [196]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
import math
import pandas as pd
import numpy as np

In [169]:
train_data = pd.read_csv("train.csv", usecols=["class", "viewCount", "commentCount", "likeCount", "dislikeCount"]) 
test_data = pd.read_csv("test_1.csv", usecols=["ID","viewCount", "commentCount", "likeCount", "dislikeCount"])

Y_train = train_data["class"]

X_train = train_data.drop("class", axis=1)

X_test = test_data.drop("ID", axis=1)

#X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

X_train = X_train.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
#X_val = X_val.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
X_test = X_test.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

In [170]:
Y_train.describe()

count      7105
unique        2
top       False
freq       3557
Name: class, dtype: object

In [171]:
train_data.head()

Unnamed: 0,viewCount,likeCount,dislikeCount,commentCount,class
0,10509398.0,945921.0,5614.0,58844,True
1,4829.0,57.0,81.0,22,False
2,1015456.0,36679.0,492.0,3739,True
3,64629.0,2111.0,24.0,151,False
4,206468.0,1335.0,96.0,470,False


In [172]:
scaler = StandardScaler()
scaler.fit(X_train)
sc_transform = scaler.transform(X_train)
X_train = pd.DataFrame(sc_transform)

scaler.fit(X_val)
sc_val_transform = scaler.transform(X_val)
X_val = pd.DataFrame(sc_val_transform)

In [173]:
X_train.head()

Unnamed: 0,0,1,2,3
0,1.878598,4.063041,0.2356,2.145346
1,-0.422865,-0.295712,-0.197023,-0.279667
2,-0.201445,-0.12695,-0.164887,-0.126428
3,-0.409763,-0.286247,-0.20148,-0.274348
4,-0.378688,-0.289823,-0.19585,-0.261197


In [174]:
X_val.head()

Unnamed: 0,0,1,2,3
0,-0.413014,-0.271104,-0.241256,-0.257085
1,-0.261337,-0.203567,-0.156429,-0.254071
2,-0.001008,-0.019026,-0.016738,-0.038561
3,-0.294585,-0.174212,-0.102656,-0.085054
4,-0.415968,-0.274386,-0.242149,-0.260137


In [175]:
def f1(y_true, y_pred):
    return f1_score

def fit_and_evaluate(model):
    
    model.fit(X_train, Y_train)
    
    model_pred = model.predict(X_val)
    model_f1 = f1(Y_val, model_pred)
    
    return model_f1

In [176]:
svc = SVC()
#svc_f1 = fit_and_evaluate(svc)
svc.fit(X_train, Y_train)
model_pred = svc.predict(X_val)
model_f1 = f1_score(Y_val, model_pred)

print('Support Vector Classifier F1 score: %f' % model_f1)

Support Vector Classifier F1 score: 0.579216


In [177]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
model_pred = dt.predict(X_val)
model_f1 = f1_score(Y_val, model_pred)

print('Decision Tree Classifier F1 score: %f' % model_f1)

Decision Tree Classifier F1 score: 0.631949


In [178]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
model_pred = lr.predict(X_val)
model_f1 = f1_score(Y_val, model_pred)

print('Logistic Regression F1 score: %f' % model_f1)

Logistic Regression F1 score: 0.522622


In [179]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
model_pred = knn.predict(X_val)
model_f1 = f1_score(Y_val, model_pred)

print('K Nearest Neighbors F1 score: %f' % model_f1)

K Nearest Neighbors F1 score: 0.676775


In [180]:
nb = GaussianNB()
nb.fit(X_train, Y_train)
model_pred = nb.predict(X_val)
model_f1 = f1_score(Y_val, model_pred)

print('Naive Bayes F1 score: %f' % model_f1)

Naive Bayes F1 score: 0.272076


In [197]:
knn_cv = KNeighborsClassifier()

In [199]:
cv_scores = cross_val_score(knn_cv, X_train, Y_train, cv=5)
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.74824191 0.7742616  0.75580577 0.75915493 0.73802817]
cv_scores mean:0.7550984770700094


In [219]:
neighbors = list(range(1,30))

hyperparameters = dict(n_neighbors=neighbors)

In [220]:
knn_2 = KNeighborsClassifier()

clf = GridSearchCV(knn_2, hyperparameters, cv=5)

best_model = clf.fit(X_train, Y_train)

In [221]:
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

Best leaf_size: 25
Best n_neighbors: 10


In [222]:
neighbors = best_model.best_estimator_.get_params()['n_neighbors']

In [223]:
knn_3 = KNeighborsClassifier(n_neighbors = neighbors)
knn_3.fit(X_train, Y_train)
model_pred = knn_3.predict(X_train)
model_f1 = f1_score(Y_train, model_pred)

print('Tuned KNN Model F1 score: %f' % model_f1)

Tuned KNN Model F1 score: 0.804928


In [224]:
Y_pred = knn_3.predict(X_test)

test_data["class"] = Y_pred
#test_data["class"] = test_data["class"].map(lambda x: "True" if x==1 else "False")
result = test_data[["ID","class"]]
result.to_csv("result.csv", index=False)
result.head()

Unnamed: 0,ID,class
0,oRB8lJynqBA,False
1,of-UPoEnw_w,False
2,d28cz00HHto,False
3,Tftg_LnwTW0,False
4,qR0mkm65Whk,False
