In [67]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [68]:
train_data = pd.read_csv("train.csv", usecols=["class", "viewCount", "commentCount", "likeCount", "dislikeCount", "title", "description"]) 
test_data = pd.read_csv("test_1.csv", usecols=["ID","viewCount", "commentCount", "likeCount", "dislikeCount", "title", "description"])

In [69]:
titlesTrain = train_data["title"]

uppercase = []
lowercase = []
exclamations = []

for i in titlesTrain:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
train_data["uppercaseTitle"] = uppercase
train_data["lowercaseTitle"] = lowercase
train_data["exclamationsTitle"] = exclamations

train_data["percentUpperTitle"] = train_data["uppercaseTitle"] / (train_data["uppercaseTitle"] + train_data["lowercaseTitle"])

In [70]:
titlesTest = test_data["title"]

uppercase = []
lowercase = []
exclamations = []
for i in titlesTest:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
test_data["uppercaseTitle"] = uppercase
test_data["lowercaseTitle"] = lowercase
test_data["exclamationsTitle"] = exclamations

test_data["percentUpperTitle"] = test_data["uppercaseTitle"] / (test_data["uppercaseTitle"] + test_data["lowercaseTitle"])

In [71]:
descriptionsTrain = train_data["description"]

uppercase = []
lowercase = []
exclamations = []

for i in descriptionsTrain:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
train_data["uppercaseDescription"] = uppercase
train_data["lowercaseDescription"] = lowercase
train_data["exclamationsDescription"] = exclamations

train_data["percentUpperDescription"] = train_data["uppercaseDescription"] / (train_data["uppercaseDescription"] + train_data["lowercaseDescription"])

In [72]:
descriptionsTrain.describe()

count                                                  7105
unique                                                 6309
top       Click the link to subscribe: http://bit.ly/Fac...
freq                                                    125
Name: description, dtype: object

In [73]:
descriptionsTest = test_data["description"]

uppercase = []
lowercase = []
exclamations = []

for i in descriptionsTest:
    isupper = sum(1 for c in i if c.isupper())
    islower = sum(1 for c in i if c.islower())
    isexclamation = sum(1 for c in i if c == "!")
    uppercase.append(isupper)
    lowercase.append(islower)
    exclamations.append(isexclamation)
    
test_data["uppercaseDescription"] = uppercase
test_data["lowercaseDescription"] = lowercase
test_data["exclamationsDescription"] = exclamations

test_data["percentUpperDescription"] = test_data["uppercaseDescription"] / (test_data["uppercaseDescription"] + test_data["lowercaseDescription"])

In [74]:
train_data.head()

Unnamed: 0,title,description,viewCount,likeCount,dislikeCount,commentCount,class,uppercaseTitle,lowercaseTitle,exclamationsTitle,percentUpperTitle,uppercaseDescription,lowercaseDescription,exclamationsDescription,percentUpperDescription
0,MARZIA HAS LIGMA LWIAY #0044,Become Sponsor: https://youtube.com/pewdiepie/...,10509398.0,945921.0,5614.0,58844,True,19,0,0,1.0,109,777,2,0.123025
1,This Slinky Montage Is Bizarrely Satisfying to...,Happy National Slinky Day! We got our hands on...,4829.0,57.0,81.0,22,False,7,38,0,0.155556,78,416,1,0.157895
2,MAKING HER DREAM COME TRUE! (MAKE A WISH),It was so nice to meet you Trinity 😊\nGET NEW ...,1015456.0,36679.0,492.0,3739,True,31,0,1,1.0,175,236,4,0.425791
3,Science Journalism: Crash Course Statistics #11,We’ve talked a lot in this series about how of...,64629.0,2111.0,24.0,151,False,5,33,0,0.131579,143,1255,2,0.102289
4,"Michelin and General Motors Unveil Airless, Pu...",Michelin and General Motors are betting on nea...,206468.0,1335.0,96.0,470,False,7,48,0,0.127273,64,610,0,0.094955


In [75]:
Y_train = train_data["class"]

X_train = train_data[["viewCount","likeCount","dislikeCount","commentCount","percentUpperTitle","exclamationsTitle","percentUpperDescription"]]

X_test = test_data[["viewCount","likeCount","dislikeCount","commentCount","percentUpperTitle","exclamationsTitle","percentUpperDescription"]]

In [76]:
Y_train.describe()

count      7105
unique        2
top       False
freq       3557
Name: class, dtype: object

In [77]:
X_train.head()

Unnamed: 0,viewCount,likeCount,dislikeCount,commentCount,percentUpperTitle,exclamationsTitle,percentUpperDescription
0,10509398.0,945921.0,5614.0,58844,1.0,0,0.123025
1,4829.0,57.0,81.0,22,0.155556,0,0.157895
2,1015456.0,36679.0,492.0,3739,1.0,1,0.425791
3,64629.0,2111.0,24.0,151,0.131579,0,0.102289
4,206468.0,1335.0,96.0,470,0.127273,0,0.094955


In [78]:
test_data.head()

Unnamed: 0,ID,title,description,viewCount,likeCount,dislikeCount,commentCount,uppercaseTitle,lowercaseTitle,exclamationsTitle,percentUpperTitle,uppercaseDescription,lowercaseDescription,exclamationsDescription,percentUpperDescription
0,oRB8lJynqBA,Fisherman Catches Never-Before-Seen Sea Creatu...,Fishing results in some lucky catches and some...,137551.0,2810.0,184.0,242,18,64,0,0.219512,29,931,1,0.030208
1,of-UPoEnw_w,Card Throwing Speed Test,"In this bonus footage, Dan takes on Rick Smith...",1158511.0,26708.0,572.0,425,4,17,0,0.190476,11,110,0,0.090909
2,d28cz00HHto,Nintendo Labo for Switch | Circuit Breaker Live,"On the season finale of Circuit Breaker, we ar...",19861.0,326.0,40.0,34,6,33,0,0.153846,30,488,0,0.057915
3,Tftg_LnwTW0,Gym Logic! (These Make No Sense),Send Video Ideas: ReactionTimeVideos@gmail.com...,1562674.0,45858.0,887.0,3328,6,18,1,0.25,45,251,0,0.152027
4,qR0mkm65Whk,Worst Things That Happened At Walmart!,Send Video Ideas: ReactionTimeVideos@gmail.com...,4873500.0,59900.0,1490.0,7388,6,26,1,0.1875,33,191,0,0.147321


In [79]:
X_test.head()

Unnamed: 0,viewCount,likeCount,dislikeCount,commentCount,percentUpperTitle,exclamationsTitle,percentUpperDescription
0,137551.0,2810.0,184.0,242,0.219512,0,0.030208
1,1158511.0,26708.0,572.0,425,0.190476,0,0.090909
2,19861.0,326.0,40.0,34,0.153846,0,0.057915
3,1562674.0,45858.0,887.0,3328,0.25,1,0.152027
4,4873500.0,59900.0,1490.0,7388,0.1875,1,0.147321


In [80]:
scaler = StandardScaler()
scaler.fit(X_train)
train_transform = scaler.transform(X_train)
X_train = pd.DataFrame(train_transform)
test_transform = scaler.transform(X_test)
X_test = pd.DataFrame(test_transform)

In [81]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.878598,4.063041,0.2356,2.145346,2.038774,-0.428953,-0.310951
1,-0.422865,-0.295712,-0.197023,-0.279667,-0.59998,-0.428953,0.008567
2,-0.201445,-0.12695,-0.164887,-0.126428,2.038774,1.217568,2.463338
3,-0.409763,-0.286247,-0.20148,-0.274348,-0.674904,-0.428953,-0.500957
4,-0.378688,-0.289823,-0.19585,-0.261197,-0.68836,-0.428953,-0.568155


In [82]:
X_train.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,7105.0,7105.0,7105.0,7105.0,7105.0,7105.0,7105.0
mean,1.6649440000000002e-17,-1.711822e-17,-3.341217e-17,3.437707e-18,1.288515e-16,-1.701196e-16,-1.371489e-16
std,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007
min,-0.4236695,-0.2958871,-0.2033565,-0.2801613,-1.086067,-0.4289527,-1.438246
25%,-0.3879138,-0.2786984,-0.1908461,-0.2657321,-0.6396609,-0.4289527,-0.7555479
50%,-0.2643178,-0.208082,-0.1497965,-0.2179508,-0.450506,-0.4289527,-0.2717479
75%,0.02717043,-0.03977583,-0.05080845,-0.05255096,0.36203,-0.4289527,0.4322676
max,45.46,40.90645,62.73173,47.8364,2.038774,7.803649,5.873761


In [83]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.393787,-0.283026,-0.18897,-0.270597,-0.400126,-0.428953,-1.161442
1,-0.170103,-0.172898,-0.158632,-0.263052,-0.490859,-0.428953,-0.605232
2,-0.419572,-0.294472,-0.200229,-0.279172,-0.605322,-0.428953,-0.907562
3,-0.081554,-0.084651,-0.134002,-0.143372,-0.304857,1.217568,-0.0452
4,0.64382,-0.019942,-0.086854,0.024006,-0.500159,1.217568,-0.088318


In [84]:
X_test.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,646.0,646.0,646.0,646.0,646.0,646.0,646.0
mean,-0.02417,-0.052734,-0.037559,-0.057349,0.052728,0.00944,9.5e-05
std,0.654102,0.429543,0.391307,0.44555,1.024601,0.992167,0.954082
min,-0.423425,-0.295882,-0.2032,-0.280038,-1.086067,-0.428953,-1.438246
25%,-0.367949,-0.269224,-0.186546,-0.260909,-0.623127,-0.428953,-0.751356
50%,-0.240144,-0.189195,-0.141782,-0.201357,-0.433544,-0.428953,-0.232727
75%,0.054204,-0.019904,-0.052959,-0.044275,0.462884,-0.428953,0.463221
max,7.603707,4.388907,4.350331,6.073429,2.038774,6.157128,3.679909


In [85]:
svc = SVC()
svc.fit(X_train, Y_train)
model_pred = svc.predict(X_train)

svc_scores = cross_val_score(svc, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(svc_scores)
print('cv_scores mean:{}'.format(np.mean(svc_scores)))

[0.86311239 0.86004351 0.85610465 0.8519329  0.87168459]
cv_scores mean:0.8605756072787042


In [86]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
model_pred = dt.predict(X_train)

dt_scores = cross_val_score(dt, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(dt_scores)
print('cv_scores mean:{}'.format(np.mean(dt_scores)))

[0.90076336 0.89308176 0.88563459 0.89477452 0.89263158]
cv_scores mean:0.8933771608235007


In [87]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
model_pred = lr.predict(X_train)

lr_scores = cross_val_score(lr, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(lr_scores)
print('cv_scores mean:{}'.format(np.mean(lr_scores)))

[0.76494024 0.75493421 0.75666936 0.73158756 0.77201258]
cv_scores mean:0.7560287901838825


In [88]:
nb = GaussianNB()
nb.fit(X_train, Y_train)
model_pred = nb.predict(X_train)

nb_scores = cross_val_score(nb, X_train, Y_train, cv=5, scoring = make_scorer(f1_score))
print(nb_scores)
print('cv_scores mean:{}'.format(np.mean(nb_scores)))

[0.71573604 0.68556244 0.70940171 0.69595177 0.71332209]
cv_scores mean:0.7039948102867447


In [89]:
hyperparameters = dict()

hyperparameters["min_samples_leaf"] = list(range(1,50))
hyperparameters["max_depth"] = list(range(1,30))

In [90]:
search = RandomizedSearchCV(dt, hyperparameters, scoring='f1', cv=5)

In [91]:
result = search.fit(X_train, Y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.8844946203960633
Best Hyperparameters: {'min_samples_leaf': 16, 'max_depth': 11}


In [92]:
depth = result.best_params_['max_depth']
min_samples = result.best_params_['min_samples_leaf']

In [93]:
dt_final = DecisionTreeClassifier(max_depth = depth, min_samples_split = min_samples)
dt_final.fit(X_train, Y_train)
model_pred = dt_final.predict(X_train)
model_f1 = f1_score(Y_train, model_pred)

model_f1

0.9529361581117588

In [94]:
model_pred

array([ True, False,  True, ...,  True, False, False])

In [95]:
model_pred_df = pd.DataFrame(data=model_pred)
model_pred_df.describe()

Unnamed: 0,0
count,7105
unique,2
top,False
freq,3620


In [96]:
Y_pred = dt_final.predict(X_test)
Y_pred_df = pd.DataFrame(data=Y_pred)
test_data["class"] = Y_pred_df[0]

In [97]:
result = test_data[["ID","class"]]
result.to_csv("resultDT.csv", index=False)
result.head()

Unnamed: 0,ID,class
0,oRB8lJynqBA,True
1,of-UPoEnw_w,False
2,d28cz00HHto,False
3,Tftg_LnwTW0,False
4,qR0mkm65Whk,False


In [98]:
result.describe()

Unnamed: 0,ID,class
count,646,646
unique,646,2
top,bEzq53BG3g0,False
freq,1,332


In [116]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier(max_depth = depth, min_samples_split = min_samples), learning_rate=0.5, random_state=0)

clf.fit(X_train,Y_train)

#Y_predicted = clf.predict(X_test)
#Y_pred_df = pd.DataFrame(data=Y_predicted)
#test_data["class"] = Y_pred_df[0]

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=16,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=100, random_state=0)

In [117]:
model_pred = clf.predict(X_train)
model_f1 = f1_score(Y_train, model_pred)

model_f1

1.0

In [118]:
result = test_data[["ID","class"]]
result.to_csv("resultDT.csv", index=False)
result.head()

Unnamed: 0,ID,class
0,oRB8lJynqBA,True
1,of-UPoEnw_w,False
2,d28cz00HHto,False
3,Tftg_LnwTW0,False
4,qR0mkm65Whk,False


In [119]:
result.describe()

Unnamed: 0,ID,class
count,646,646
unique,646,2
top,bEzq53BG3g0,True
freq,1,330
