In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [3]:
# !pip install pandas_ml
from sklearn import metrics
from pandas_ml import ConfusionMatrix
from matplotlib import pyplot as plt
import itertools

In [4]:
dataframe = pd.read_csv("C:/Users/Administrator/Downloads/v5/datasets-v5/task-1/task1.train.txt",delimiter="\t")

In [5]:
dataframe.to_csv("C:/Users/Administrator/Downloads/v5/new_tasks.train.csv", encoding='utf-8', index=False)

In [6]:
df = pd.read_csv("C:/Users/Administrator/Downloads/v5/new_tasks.train.csv", header=None)

In [7]:
df.head(1)

Unnamed: 0,0,1,2
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda


In [8]:
df.columns =['Title_Content','article_id','label']

In [9]:
df.head(5)

Unnamed: 0,Title_Content,article_id,label
0,"Et tu, Rhody? A recent editorial in the Provi...",727600136,non-propaganda
1,A recent post in The Farmington Mirror — our t...,731714618,non-propaganda
2,"President Donald Trump, as he often does while...",731714635,non-propaganda
3,"February is Black History Month, and nothing l...",728627182,non-propaganda
4,"The snow was so heavy, whipped up by gusting w...",728627443,non-propaganda


In [10]:
df['label'].value_counts()

non-propaganda    31965
propaganda         4021
Name: label, dtype: int64

In [11]:
df.shape

(35986, 3)

In [12]:
df.isnull().sum()

Title_Content    0
article_id       0
label            0
dtype: int64

In [13]:
df.isna().sum()

Title_Content    0
article_id       0
label            0
dtype: int64

In [14]:
df.shape

(35986, 3)

In [15]:
df = df.set_index("article_id")

In [16]:
df.head(3)

Unnamed: 0_level_0,Title_Content,label
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
727600136,"Et tu, Rhody? A recent editorial in the Provi...",non-propaganda
731714618,A recent post in The Farmington Mirror — our t...,non-propaganda
731714635,"President Donald Trump, as he often does while...",non-propaganda


In [17]:
y=df['label']
df1 = df.drop('label',axis=1)
df1.head(2)

Unnamed: 0_level_0,Title_Content
article_id,Unnamed: 1_level_1
727600136,"Et tu, Rhody? A recent editorial in the Provi..."
731714618,A recent post in The Farmington Mirror — our t...


In [18]:
df.isnull().sum()

Title_Content    0
label            0
dtype: int64

In [19]:
X_train,X_test,y_train,y_test = train_test_split(df1['Title_Content'],y,test_size =0.3, random_state= 100)

In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)

In [21]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_train = tfidf_vectorizer.transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [22]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score_pa_tfidf = metrics.accuracy_score(y_test, pred)
score_pa_tfidf = round(score_pa_tfidf,3)
print("accuracy:   %0.3f" % score_pa_tfidf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_pa = (2*P*R)/(P+R)
print(F1_pa)



accuracy:   0.960
0.8020257826887661


In [23]:
lr = LogisticRegression()
lr.fit(tfidf_train, y_train)
pred = lr.predict(tfidf_test)
score_lr_tfidf = metrics.accuracy_score(y_test, pred)
score_lr_tfidf = round(score_lr_tfidf,3)
print("accuracy:   %0.3f" % score_lr_tfidf)
# cm_lr_tfidf = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda'])
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_lr = (2*P*R)/(P+R)
print(F1_lr)

accuracy:   0.943
0.6644880174291938


In [24]:
clf = MultinomialNB(alpha=0.01)

clf.fit(tfidf_train, y_train)

pred = clf.predict(tfidf_test)
score_nb_tfidf = metrics.accuracy_score(y_test, pred)
score_nb_tfidf = round(score_nb_tfidf,3)
print("accuracy:   %0.3f" % score_nb_tfidf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_nb = (2*P*R)/(P+R)
print(F1_nb)

accuracy:   0.946
0.7298665439484584


In [25]:
rf = RandomForestClassifier(random_state=100)
rf.fit(tfidf_train, y_train)
pred = rf.predict(tfidf_test)
score_rf_tfidf = metrics.accuracy_score(y_test, pred)
score_rf_tfidf = round(score_rf_tfidf,3)
print("accuracy:   %0.3f" % score_rf_tfidf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_rf = (2*P*R)/(P+R)
print(F1_rf)

accuracy:   0.912
0.34396671289875175


In [26]:
dt = DecisionTreeClassifier(random_state=100)
dt.fit(tfidf_train, y_train)
pred = dt.predict(tfidf_test)
score_dt_tfidf = metrics.accuracy_score(y_test, pred)
score_dt_tfidf = round(score_dt_tfidf,3)
print("accuracy:   %0.3f" % score_dt_tfidf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_dt = (2*P*R)/(P+R)
print(F1_dt)

accuracy:   0.922
0.641860465116279


In [27]:
ada = AdaBoostClassifier()
ada.fit(tfidf_train, y_train)
pred = ada.predict(tfidf_test)
score_ada_tfidf = metrics.accuracy_score(y_test, pred)
score_ada_tfidf = round(score_ada_tfidf,3)
print("accuracy:   %0.3f" % score_ada_tfidf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_ada = (2*P*R)/(P+R)
print(F1_ada)

accuracy:   0.939
0.678082191780822


In [28]:
mlp = MLPClassifier(hidden_layer_sizes=(5,5,4)) 
mlp.fit(tfidf_train, y_train)
pred = mlp.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_mlp_tfidf1=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_mlp = (2*P*R)/(P+R)
print(F1_mlp)

accuracy:   0.961
0.8001885014137606


In [29]:
knn = KNeighborsClassifier()
knn.fit(tfidf_train, y_train)
pred = knn.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_knn_tfidf=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_knn = (2*P*R)/(P+R)
print(F1_knn)

accuracy:   0.925
0.5534246575342466


In [None]:
# from sklearn.preprocessing import normalize
# tfidf_train=normalize(tfidf_train)
# tfidf_test=normalize(tfidf_test)
svm1 = SVC(C=1.0,gamma=0.1,kernel='rbf',random_state=50)
svm1.fit(tfidf_train, y_train)
pred = svm1.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score_svm_tfidf = round(score,3)
print("accuracy:   %0.3f" % score)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_svm = (2*P*R)/(P+R)
print(F1_svm)

In [67]:
# acc = {'Passive Aggresive':score_pa_tfidf,'Logistic':score_lr_tfidf,'AdaBoost':score_ada_tfidf,'NB':score_nb_tfidf,
#        'RF':score_rf_tfidf,'DT':score_dt_tfidf,'KNN':score_knn_tfidf,"MLP_NN":score_mlp_tfidf1,"SVM":score_svm_tfidf}
# acc1 = pd.DataFrame([acc])
# acc1.index=['Model Accuracy']
# acc1

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
Model Accuracy,0.939,0.922,0.925,0.943,0.961,0.891,0.961,0.912,0.891


In [110]:
acc2 = {'Passive Aggresive':score_pa_tfidf,'Logistic':score_lr_tfidf,'AdaBoost':score_ada_tfidf,'NB':score_nb_tfidf,
       'RF':score_rf_tfidf,'DT':score_dt_tfidf,'KNN':score_knn_tfidf,"MLP_NN":score_mlp_tfidf1,"SVM":score_svm_tfidf}
F1_tfidf = {'Passive Aggresive':F1_pa,'Logistic':F1_lr,'AdaBoost':F1_ada,'NB':F1_nb,
      'RF':F1_rf,'DT':F1_dt,'KNN':F1_knn,"MLP_NN":F1_mlp,"SVM":F1_svm}
acc_tfidf = pd.DataFrame([acc2,F1_tfidf])
acc_tfidf.index=['Model Accuracy_tfidf','F1 Score_tfidf']
acc_tfidf

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
Model Accuracy_tfidf,0.939,0.922,0.925,0.943,0.961,0.946,0.96,0.912,0.936
F1 Score_tfidf,0.678082,0.64186,0.553425,0.664488,0.811738,0.729867,0.79963,0.343967,0.589001


In [None]:
# tn,fp,fn,tp = confusion_matrix(test_dy,test_pred).ravel() 
# sensitivity = (tp/(tp+fn))*100 # recall
# specificity = (tn/(tn+fp))*100
# accuracy = ((tp+tn)/(tp+tn+fp+fn))*100

# tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
# R = (tp/(tp+fn))*100 # recall
# P = (tp/(tp+fp))*100 # precision
# F1 = (2*P*R)/(P+R)
# print(tn,fp,fn,tp,sensitivity,specificity, F1)

### Building Count Vectorizer

In [44]:
# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

In [45]:
count_train = count_vectorizer.fit(X_train)
count_train = count_vectorizer.transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [78]:
## Fitting Naive Baye's Classifier for Multinomial Model
clf = MultinomialNB(alpha=0.01)
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score_cnt_nb = metrics.accuracy_score(y_test, pred)
score_cnt_nb = round(score_cnt_nb,3)
print("accuracy:   %0.3f" % score_cnt_nb)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_nb_cnt = (2*P*R)/(P+R)
print(F1_nb_cnt)

accuracy:   0.925
0.6986964618249535


In [81]:
## Fitting Passive Aggresive Classifier Model

linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(count_train, y_train)
pred = linear_clf.predict(count_test)
score_cnt_pa = metrics.accuracy_score(y_test, pred)
score_cnt_pa = round(score_cnt_pa,3)
print("accuracy:   %0.3f" % score_cnt_pa)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_pa_cnt = (2*P*R)/(P+R)
print(F1_pa_cnt)



accuracy:   0.953
0.7724631202503353


In [82]:
rf = RandomForestClassifier(random_state=100)
rf.fit(count_train, y_train)
pred = rf.predict(count_test)
score_cnt_rf = metrics.accuracy_score(y_test, pred)
score_cnt_rf = round(score_cnt_rf,3)
print("accuracy:   %0.3f" % score_cnt_rf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_rf_cnt = (2*P*R)/(P+R)
print(F1_rf_cnt)

accuracy:   0.914
0.3796109993293092


In [83]:
dt = DecisionTreeClassifier(random_state=100,max_depth=2)
dt.fit(count_train, y_train)
pred = dt.predict(count_test)
score_cnt_dt = metrics.accuracy_score(y_test, pred)
score_cnt_dt = round(score_cnt_dt,3)
print("accuracy:   %0.3f" % score_cnt_dt)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_dt_cnt = (2*P*R)/(P+R)
print(F1_dt_cnt)

accuracy:   0.908
0.2875536480686695


In [84]:
lr = LogisticRegression()
lr.fit(count_train, y_train)
pred = lr.predict(count_test)
score_cnt_lr = metrics.accuracy_score(y_test, pred)
score_cnt_lr = round(score_cnt_lr,3)
print("accuracy:   %0.3f" % score_cnt_lr)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_lr_cnt = (2*P*R)/(P+R)
print(F1_lr_cnt)

accuracy:   0.956
0.7812354853692521


In [85]:
ada = AdaBoostClassifier()
ada.fit(count_train, y_train)
pred = ada.predict(count_test)
score_cnt_ada = metrics.accuracy_score(y_test, pred)
score_cnt_ada = round(score_cnt_ada,3)
print("accuracy:   %0.3f" % score_cnt_ada)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_ada_cnt = (2*P*R)/(P+R)
print(F1_ada_cnt)

accuracy:   0.940
0.6796019900497512


In [55]:
mlp = MLPClassifier(hidden_layer_sizes=(5,5,2)) # initially(554_0.787)(573_0.556)(544_96.1)
mlp.fit(count_train, y_train)
pred = mlp.predict(count_test)
score= metrics.accuracy_score(y_test, pred)
score_cnt_mlp=round(score,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_mlp_cnt = (2*P*R)/(P+R)
print(F1_mlp_cnt)

accuracy:   0.954
0.7805309734513275


In [117]:
svm1 = SVC(C=1.0,gamma=0.1,kernel='sigmoid',random_state=50)
svm1.fit(count_train, y_train)
pred = svm1.predict(count_test)
score_cnt_svm1 = metrics.accuracy_score(y_test, pred)
score_cnt_svm1 = round(score_cnt_svm1,3)
print("accuracy:   %0.3f" % score_cnt_svm1)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_svm_cnt = (2*P*R)/(P+R)
print(F1_svm_cnt)

accuracy:   0.801
0.11303630363036304


In [87]:
knn = KNeighborsClassifier()
knn.fit(count_train, y_train)
pred = knn.predict(count_test)
score_cnt_knn = metrics.accuracy_score(y_test, pred)
score_cnt_knn = round(score_cnt_knn,3)
print("accuracy:   %0.3f" % score_cnt_knn)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_knn_cnt = (2*P*R)/(P+R)
print(F1_knn_cnt)

accuracy:   0.898
0.16067329762815608


In [118]:
acc1 = {'Passive Aggresive':score_cnt_pa,'Logistic':score_cnt_lr,'AdaBoost':score_cnt_ada,'NB':score_cnt_nb,
       'RF':score_cnt_rf,'DT':score_cnt_dt,'KNN':score_cnt_knn,"MLP_NN":score_cnt_mlp,"SVM":score_cnt_svm1}

F1_cnt = {'Passive Aggresive':F1_pa_cnt,'Logistic':F1_lr_cnt,'AdaBoost':F1_ada_cnt,'NB':F1_nb_cnt,
          'RF':F1_rf_cnt,'DT':F1_dt_cnt,'KNN':F1_knn_cnt,"MLP_NN":F1_mlp_cnt,"SVM":F1_svm_cnt}
acc_cnt = pd.DataFrame([acc1,F1_cnt])
acc_cnt.index=['Model Accuracy_cnt','F1 Score_cnt']
acc_cnt

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
Model Accuracy_cnt,0.94,0.908,0.898,0.956,0.954,0.925,0.953,0.914,0.801
F1 Score_cnt,0.679602,0.287554,0.160673,0.781235,0.780531,0.698696,0.772463,0.379611,0.113036


In [63]:
hashing_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)

In [69]:
hash_train = hashing_vectorizer.fit(X_train)
hash_train = hashing_vectorizer.transform(X_train)
hash_test = hashing_vectorizer.transform(X_test)



In [90]:
ada = AdaBoostClassifier()
ada.fit(hash_train, y_train)
pred = ada.predict(hash_test)
score_hash_ada = metrics.accuracy_score(y_test, pred)
score_hash_ada = round(score_hash_ada,3)
print("accuracy:   %0.3f" % score_hash_ada)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_ada_hash = (2*P*R)/(P+R)
print(F1_ada_hash)

accuracy:   0.941
0.6874695270599707


In [92]:
dt = DecisionTreeClassifier(random_state=100,max_depth=2)
dt.fit(hash_train, y_train)
pred = dt.predict(hash_test)
score_hash_dt = metrics.accuracy_score(y_test, pred)
score_hash_dt = round(score_hash_dt,3)
print("accuracy:   %0.3f" % score_hash_dt)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_dt_hash = (2*P*R)/(P+R)
print(F1_dt_hash)

accuracy:   0.908
0.2875536480686695


In [93]:
knn = KNeighborsClassifier()
knn.fit(hash_train, y_train)
pred = knn.predict(hash_test)
score_hash_knn = metrics.accuracy_score(y_test, pred)
score_hash_knn = round(score_hash_knn,3)
print("accuracy:   %0.3f" % score_hash_knn)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_knn_hash = (2*P*R)/(P+R)
print(F1_knn_hash)

accuracy:   0.929
0.5949367088607594


In [94]:
lr = LogisticRegression()
lr.fit(hash_train, y_train)
pred = lr.predict(hash_test)
score_hash_lr = metrics.accuracy_score(y_test, pred)
score_hash_lr = round(score_hash_lr,3)
print("accuracy:   %0.3f" % score_hash_lr)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_lr_hash = (2*P*R)/(P+R)
print(F1_lr_hash)

accuracy:   0.944
0.6842105263157894


In [95]:
mlp = MLPClassifier(hidden_layer_sizes=(5,4,4)) # initially(554_0.787)(573_0.556)(544_96.1)
mlp.fit(hash_train, y_train)
pred = mlp.predict(hash_test)
score_hash_mlp= metrics.accuracy_score(y_test, pred)
score_hash_mlp=round(score_hash_mlp,3)
#print(score_knn_tfidf)
print("accuracy:   %0.3f" % score_hash_mlp)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_mlp_hash = (2*P*R)/(P+R)
print(F1_mlp_hash)

accuracy:   0.956
0.7643248629795715




In [96]:
clf = MultinomialNB(alpha=0.01)
clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score_hash_nb = metrics.accuracy_score(y_test, pred)
score_hash_nb = round(score_hash_nb,3)
print("accuracy:   %0.3f" % score)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_nb_hash = (2*P*R)/(P+R)
print(F1_nb_hash)

accuracy:   0.891
0.49092047589229804


In [72]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)

linear_clf.fit(hash_train, y_train)
pred = linear_clf.predict(hash_test)
score_hash_pa = metrics.accuracy_score(y_test, pred)
score_hash_pa = round(score_hash_pa,3)
print("accuracy:   %0.3f" % score_hash_pa)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_pa_hash = (2*P*R)/(P+R)
print(F1_pa_hash)



accuracy:   0.957
0.7887323943661971


In [73]:
rf = RandomForestClassifier(random_state=100)
rf.fit(hash_train, y_train)
pred = rf.predict(hash_test)
score_hash_rf = metrics.accuracy_score(y_test, pred)
score_hash_rf = round(score_hash_rf,3)
print("accuracy:   %0.3f" % score_hash_rf)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_rf_hash = (2*P*R)/(P+R)
print(F1_rf_hash)

accuracy:   0.906
0.25387453874538746


In [112]:
svm1 = SVC(C=1.0,gamma=0.1,kernel='rbf',random_state=50)
svm1.fit(hash_train, y_train)
pred = svm1.predict(hash_test)
score_hash_svm1 = metrics.accuracy_score(y_test, pred)
score_hash_svm1 = round(score_hash_svm1,3)
print("accuracy:   %0.3f" % score_hash_svm1)
tn,fp,fn,tp = metrics.confusion_matrix(y_test, pred, labels=['non-propaganda', 'propaganda']).ravel()
R = (tp/(tp+fn)) # recall/sensitivity
P = (tp/(tp+fp)) # precision
F1_svm_hash = (2*P*R)/(P+R)
print(F1_svm_hash)

accuracy:   0.938
0.6186392224128072


In [127]:
acc3 = {'Passive Aggresive':score_hash_pa,'Logistic':score_hash_lr,'AdaBoost':score_hash_ada,'NB':score_hash_nb,
       'RF':score_hash_rf,'DT':score_hash_dt,'KNN':score_hash_knn,"MLP_NN":score_hash_mlp,"SVM":score_hash_svm1}

F1_hash = {'Passive Aggresive':F1_pa_hash,'Logistic':F1_lr_hash,'AdaBoost':F1_ada_hash,'NB':F1_nb_hash,
          'RF':F1_rf_hash,'DT':F1_dt_hash,'KNN':F1_knn_hash,"MLP_NN":F1_mlp_hash,"SVM":F1_svm_hash}
acc_hash = pd.DataFrame([acc3,F1_hash])
acc_hash.index=['Model Accuracy_hash','F1 Score_hash']
acc_hash

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
Model Accuracy_hash,0.941,0.908,0.929,0.944,0.956,0.925,0.957,0.906,0.938
F1 Score_hash,0.68747,0.287554,0.594937,0.684211,0.764325,0.49092,0.788732,0.253875,0.618639


In [142]:
performance = acc_cnt
performance=performance.append(acc_tfidf, ignore_index=False)
p=performance.append(acc_hash, ignore_index=False)
p

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
Model Accuracy_cnt,0.94,0.908,0.898,0.956,0.954,0.925,0.953,0.914,0.801
F1 Score_cnt,0.679602,0.287554,0.160673,0.781235,0.780531,0.698696,0.772463,0.379611,0.113036
Model Accuracy_tfidf,0.939,0.922,0.925,0.943,0.961,0.946,0.96,0.912,0.936
F1 Score_tfidf,0.678082,0.64186,0.553425,0.664488,0.811738,0.729867,0.79963,0.343967,0.589001
Model Accuracy_hash,0.941,0.908,0.929,0.944,0.956,0.925,0.957,0.906,0.938
F1 Score_hash,0.68747,0.287554,0.594937,0.684211,0.764325,0.49092,0.788732,0.253875,0.618639


In [140]:
p.to_csv("C:/Users/Administrator/Downloads/v5/table1.train.csv", encoding='utf-8')

In [128]:
F1_hash = {'Passive Aggresive':F1_pa_hash,'Logistic':F1_lr_hash,'AdaBoost':F1_ada_hash,'NB':F1_nb_hash,
          'RF':F1_rf_hash,'DT':F1_dt_hash,'KNN':F1_knn_hash,"MLP_NN":F1_mlp_hash,"SVM":F1_svm_hash}
acc_hash1 = pd.DataFrame([F1_hash])
acc_hash1.index=['F1 Score_hash']
acc_hash1

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
F1 Score_hash,0.68747,0.287554,0.594937,0.684211,0.764325,0.49092,0.788732,0.253875,0.618639


In [129]:
F1_cnt = {'Passive Aggresive':F1_pa_cnt,'Logistic':F1_lr_cnt,'AdaBoost':F1_ada_cnt,'NB':F1_nb_cnt,
          'RF':F1_rf_cnt,'DT':F1_dt_cnt,'KNN':F1_knn_cnt,"MLP_NN":F1_mlp_cnt,"SVM":F1_svm_cnt}
acc_cnt1 = pd.DataFrame([F1_cnt])
acc_cnt1.index=['F1 Score_cnt']
acc_cnt1

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
F1 Score_cnt,0.679602,0.287554,0.160673,0.781235,0.780531,0.698696,0.772463,0.379611,0.113036


In [130]:

F1_tfidf = {'Passive Aggresive':F1_pa,'Logistic':F1_lr,'AdaBoost':F1_ada,'NB':F1_nb,
      'RF':F1_rf,'DT':F1_dt,'KNN':F1_knn,"MLP_NN":F1_mlp,"SVM":F1_svm}
acc_tfidf1 = pd.DataFrame([F1_tfidf])
acc_tfidf1.index=['F1 Score_tfidf']
acc_tfidf1

Unnamed: 0,AdaBoost,DT,KNN,Logistic,MLP_NN,NB,Passive Aggresive,RF,SVM
F1 Score_tfidf,0.678082,0.64186,0.553425,0.664488,0.811738,0.729867,0.79963,0.343967,0.589001


In [138]:
performance1 = acc_cnt1
performance1=performance1.append(acc_tfidf1, ignore_index=False)
p1=performance1.append(acc_hash1, ignore_index=False)


In [141]:
p1.to_csv("C:/Users/Administrator/Downloads/v5/table2.train.csv", encoding='utf-8')