In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
train_data = pd.read_csv("train.csv", usecols=["class", "viewCount", "commentCount", "likeCount", "dislikeCount"]) 
test_data = pd.read_csv("test_1.csv", usecols=["ID","viewCount", "commentCount", "likeCount", "dislikeCount"])

Y_train = train_data["class"]

X_train = train_data.drop("class", axis=1)

X_test = test_data.drop("ID", axis=1)

#X_train = X_train.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
#X_test = X_test.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

In [12]:
Y_train.describe()

count      7105
unique        2
top       False
freq       3557
Name: class, dtype: object

In [13]:
train_data.head()

Unnamed: 0,viewCount,likeCount,dislikeCount,commentCount,class
0,10509398.0,945921.0,5614.0,58844,True
1,4829.0,57.0,81.0,22,False
2,1015456.0,36679.0,492.0,3739,True
3,64629.0,2111.0,24.0,151,False
4,206468.0,1335.0,96.0,470,False


In [14]:
test_data.head()

Unnamed: 0,ID,viewCount,likeCount,dislikeCount,commentCount
0,oRB8lJynqBA,137551.0,2810.0,184.0,242
1,of-UPoEnw_w,1158511.0,26708.0,572.0,425
2,d28cz00HHto,19861.0,326.0,40.0,34
3,Tftg_LnwTW0,1562674.0,45858.0,887.0,3328
4,qR0mkm65Whk,4873500.0,59900.0,1490.0,7388


In [15]:
X_test.head()

Unnamed: 0,viewCount,likeCount,dislikeCount,commentCount
0,137551.0,2810.0,184.0,242
1,1158511.0,26708.0,572.0,425
2,19861.0,326.0,40.0,34
3,1562674.0,45858.0,887.0,3328
4,4873500.0,59900.0,1490.0,7388


In [79]:
scaler = StandardScaler()
scaler.fit(X_train)
train_transform = scaler.transform(X_train)
X_train = pd.DataFrame(train_transform)
test_transform = scaler.transform(X_test)
X_test = pd.DataFrame(test_transform)

In [80]:
X_train.head()

Unnamed: 0,0,1,2,3
0,1.878598,4.063041,0.2356,2.145346
1,-0.422865,-0.295712,-0.197023,-0.279667
2,-0.201445,-0.12695,-0.164887,-0.126428
3,-0.409763,-0.286247,-0.20148,-0.274348
4,-0.378688,-0.289823,-0.19585,-0.261197


In [81]:
X_test.head()

Unnamed: 0,0,1,2,3
0,-0.565513,-0.536548,-0.387235,-0.478988
1,-0.223277,-0.279967,-0.309646,-0.462042
2,-0.604964,-0.563218,-0.416031,-0.498249
3,-0.087798,-0.074362,-0.246655,-0.193222
4,1.022024,0.0764,-0.126072,0.182737


In [82]:
svc = SVC()
svc.fit(X_train, Y_train)
model_pred = svc.predict(X_train)

svc_scores = cross_val_score(svc, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(svc_scores)
print('cv_scores mean:{}'.format(np.mean(svc_scores)))

[0.52422145 0.56458512 0.60204082 0.52867384 0.55017301]
cv_scores mean:0.5539388461206235


In [83]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
model_pred = dt.predict(X_train)

dt_scores = cross_val_score(dt, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(dt_scores)
print('cv_scores mean:{}'.format(np.mean(dt_scores)))

[0.69957687 0.71398155 0.71700776 0.71988796 0.72384937]
cv_scores mean:0.7148607012944526


In [84]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
model_pred = lr.predict(X_train)

lr_scores = cross_val_score(lr, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(lr_scores)
print('cv_scores mean:{}'.format(np.mean(lr_scores)))

[0.46828358 0.5018315  0.53418414 0.49953575 0.51239669]
cv_scores mean:0.5032463328284499


In [85]:
nb = GaussianNB()
nb.fit(X_train, Y_train)
model_pred = nb.predict(X_train)

nb_scores = cross_val_score(nb, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(nb_scores)
print('cv_scores mean:{}'.format(np.mean(nb_scores)))

[0.23095238 0.25059102 0.27610209 0.25977011 0.30665163]
cv_scores mean:0.2648134470668429


In [104]:
dt2 = DecisionTreeClassifier()
#cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

In [105]:
hyperparameters = dict()

hyperparameters["min_samples_split"] = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
hyperparameters["max_depth"] = list(range(1,30))

In [106]:
search = RandomizedSearchCV(dt2, hyperparameters, scoring='f1', cv=5)

In [107]:
result = search.fit(X_train, Y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.6727010542511725
Best Hyperparameters: {'min_samples_split': 0.3, 'max_depth': 9}


In [108]:
depth = result.best_params_['max_depth']
min_samples = result.best_params_['min_samples_split']

In [109]:
dt_final = DecisionTreeClassifier(max_depth = depth, min_samples_split = min_samples)
dt_final.fit(X_train, Y_train)
model_pred = dt_final.predict(X_train)
model_f1 = f1_score(Y_train, model_pred)

model_f1

0.6902415980564179

In [110]:
model_pred

array([ True, False, False, ...,  True,  True, False])

In [111]:
model_pred_df = pd.DataFrame(data=model_pred)
model_pred_df.describe()

Unnamed: 0,0
count,7105
unique,2
top,True
freq,3861


In [112]:
Y_pred = dt_final.predict(X_test)
Y_pred_df = pd.DataFrame(data=Y_pred)
test_data["class"] = Y_pred_df[0]

In [113]:
result = test_data[["ID","class"]]
result.to_csv("resultDT.csv", index=False)
result.head()

Unnamed: 0,ID,class
0,oRB8lJynqBA,False
1,of-UPoEnw_w,False
2,d28cz00HHto,False
3,Tftg_LnwTW0,False
4,qR0mkm65Whk,False


In [114]:
result.describe()

Unnamed: 0,ID,class
count,646,646
unique,646,2
top,DxBqgmxdgu8,False
freq,1,463
