In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
train_data = pd.read_csv("train.csv", usecols=["class", "viewCount", "commentCount", "likeCount", "dislikeCount", "title", "description"]) 
test_data = pd.read_csv("test_1.csv", usecols=["ID","viewCount", "commentCount", "likeCount", "dislikeCount", "title", "description"])

In [4]:
titlesTrain = train_data["title"]

uppercase = []
lowercase = []
exclamations = []

for i in titlesTrain:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
train_data["uppercaseTitle"] = uppercase
train_data["lowercaseTitle"] = lowercase
train_data["exclamationsTitle"] = exclamations

train_data["percentUpperTitle"] = train_data["uppercaseTitle"] / (train_data["uppercaseTitle"] + train_data["lowercaseTitle"])

In [5]:
titlesTest = test_data["title"]

uppercase = []
lowercase = []
exclamations = []
for i in titlesTest:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
test_data["uppercaseTitle"] = uppercase
test_data["lowercaseTitle"] = lowercase
test_data["exclamationsTitle"] = exclamations

test_data["percentUpperTitle"] = test_data["uppercaseTitle"] / (test_data["uppercaseTitle"] + test_data["lowercaseTitle"])

In [6]:
descriptionsTrain = train_data["description"]

uppercase = []
lowercase = []
exclamations = []

for i in descriptionsTrain:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
train_data["uppercaseDescription"] = uppercase
train_data["lowercaseDescription"] = lowercase
train_data["exclamationsDescription"] = exclamations

train_data["percentUpperDescription"] = train_data["uppercaseDescription"] / (train_data["uppercaseDescription"] + train_data["lowercaseDescription"])

In [7]:
descriptionsTest = test_data["description"]

uppercase = []
lowercase = []
exclamations = []

for i in descriptionsTest:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
test_data["uppercaseDescription"] = uppercase
test_data["lowercaseDescription"] = lowercase
test_data["exclamationsDescription"] = exclamations

test_data["percentUpperDescription"] = test_data["uppercaseDescription"] / (test_data["uppercaseDescription"] + test_data["lowercaseDescription"])

In [8]:
Y_train = train_data["class"]

X_train = train_data[["viewCount","likeCount","dislikeCount","commentCount","percentUpperTitle","exclamationsTitle","percentUpperDescription"]]

X_test = test_data[["viewCount","likeCount","dislikeCount","commentCount","percentUpperTitle","exclamationsTitle","percentUpperDescription"]]

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)
train_transform = scaler.transform(X_train)
X_train = pd.DataFrame(train_transform)
test_transform = scaler.transform(X_test)
X_test = pd.DataFrame(test_transform)

In [10]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.878598,4.063041,0.2356,2.145346,2.038774,-0.428953,-0.310951
1,-0.422865,-0.295712,-0.197023,-0.279667,-0.59998,-0.428953,0.008567
2,-0.201445,-0.12695,-0.164887,-0.126428,2.038774,1.217568,2.463338
3,-0.409763,-0.286247,-0.20148,-0.274348,-0.674904,-0.428953,-0.500957
4,-0.378688,-0.289823,-0.19585,-0.261197,-0.68836,-0.428953,-0.568155


In [11]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.393787,-0.283026,-0.18897,-0.270597,-0.400126,-0.428953,-1.161442
1,-0.170103,-0.172898,-0.158632,-0.263052,-0.490859,-0.428953,-0.605232
2,-0.419572,-0.294472,-0.200229,-0.279172,-0.605322,-0.428953,-0.907562
3,-0.081554,-0.084651,-0.134002,-0.143372,-0.304857,1.217568,-0.0452
4,0.64382,-0.019942,-0.086854,0.024006,-0.500159,1.217568,-0.088318


In [12]:
svc = SVC()
svc.fit(X_train, Y_train)
model_pred = svc.predict(X_train)

svc_scores = cross_val_score(svc, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(svc_scores)
print('cv_scores mean:{}'.format(np.mean(svc_scores)))

[0.86311239 0.86004351 0.85610465 0.8519329  0.87168459]
cv_scores mean:0.8605756072787042


In [13]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
model_pred = dt.predict(X_train)

dt_scores = cross_val_score(dt, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(dt_scores)
print('cv_scores mean:{}'.format(np.mean(dt_scores)))

[0.89393939 0.8924581  0.88235294 0.89728959 0.88951049]
cv_scores mean:0.8911101022980583


In [14]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
model_pred = lr.predict(X_train)

lr_scores = cross_val_score(lr, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(lr_scores)
print('cv_scores mean:{}'.format(np.mean(lr_scores)))

[0.76494024 0.75493421 0.75666936 0.73158756 0.77201258]
cv_scores mean:0.7560287901838825


In [15]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
model_pred = knn.predict(X_train)

knn_scores = cross_val_score(knn, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(knn_scores)
print('cv_scores mean:{}'.format(np.mean(knn_scores)))

[0.90587403 0.90092659 0.88809694 0.9002849  0.90459364]
cv_scores mean:0.8999552175560759


In [16]:
nb = GaussianNB()
nb.fit(X_train, Y_train)
model_pred = nb.predict(X_train)

nb_scores = cross_val_score(nb, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(nb_scores)
print('cv_scores mean:{}'.format(np.mean(nb_scores)))

[0.71573604 0.68556244 0.70940171 0.69595177 0.71332209]
cv_scores mean:0.7039948102867447


In [17]:
knn2 = KNeighborsClassifier()

In [18]:
hyperparameters = dict()

hyperparameters["n_neighbors"] = list(range(1,30))
hyperparameters["leaf_size"] = list(range(10,50))

In [19]:
search = RandomizedSearchCV(knn, hyperparameters, scoring='f1', cv=5)

In [20]:
result = search.fit(X_train, Y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9005458599827774
Best Hyperparameters: {'n_neighbors': 7, 'leaf_size': 26}


In [21]:
n_neighbors = result.best_params_['n_neighbors']
leaf_size = result.best_params_['leaf_size']

In [22]:
knn_final = KNeighborsClassifier(n_neighbors = n_neighbors, leaf_size = leaf_size)
knn_final.fit(X_train, Y_train)
model_pred = knn_final.predict(X_train)
model_f1 = f1_score(Y_train, model_pred)

model_f1

0.922028121005539

In [23]:
model_pred_df = pd.DataFrame(data=model_pred)
model_pred_df.describe()

Unnamed: 0,0
count,7105
unique,2
top,False
freq,3612


In [24]:
Y_pred = knn_final.predict(X_test)
Y_pred_df = pd.DataFrame(data=Y_pred)
test_data["class"] = Y_pred_df[0]

In [25]:
Y_pred_df

Unnamed: 0,0
0,True
1,False
2,True
3,False
4,False
5,False
6,True
7,False
8,False
9,False


In [26]:
result = test_data[["ID","class"]]
result.to_csv("resultKNN.csv", index=False)
result.head()

Unnamed: 0,ID,class
0,oRB8lJynqBA,True
1,of-UPoEnw_w,False
2,d28cz00HHto,True
3,Tftg_LnwTW0,False
4,qR0mkm65Whk,False


In [27]:
result.describe()

Unnamed: 0,ID,class
count,646,646
unique,646,2
top,4224hp9wAl4,False
freq,1,330
