In [4]:
import io
import pickle
import nltk
import numpy as np
import random

import copy

from nltk.stem.porter import *
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier,LinearRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:

# Load pre-trained word embeddings from a whitespace-separated text file.
# Each non-empty line is read as "<token> <v1> <v2> ... <vn>" and stored as
# word2vec[token] -> 1-D float array of the remaining fields.
# NOTE(review): the file is opened with the platform default encoding, as
# before — confirm it matches the encoding the embeddings were written with.
fname="./data/word2vec.txt"

word2vec={}

# The context manager closes the file even if a line fails to parse, and
# iterating the handle streams the file instead of loading it all at once.
with open(fname,"r") as myfile:
    for line in myfile:
        parts=line.split()
        if not parts:
            # tolerate blank lines (e.g. a trailing empty line)
            continue
        word2vec[parts[0]]=np.fromiter(map(float, parts[1:]),float)

# Load the pickled bundle of article collections and pull out the ones used below.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open("./data/clean_data.p", "rb") as clean_data_file:
    all_dicts = pickle.load(clean_data_file)

article_text_dict_positive=all_dicts["article_text_dict_positive"]
iter1_BBC_text_dict_neg=all_dicts["iter1_BBC_text_dict_neg"]
iter2_BBC_text_dict_neg=all_dicts["iter2_BBC_text_dict_neg"]
iter1_CNN_neg_text=all_dicts["iter1_CNN_neg_text"]



In [5]:
# Pair every article with its label: 1 for the positive collection, 0 for the
# negative ones.
pos_examp=[(artc,1) for artc in article_text_dict_positive.values()]
neg_examp_train=[(artc,0) for artc in iter2_BBC_text_dict_neg.values()]
# neg_examp_test is built and shuffled here but is not used in the split below.
neg_examp_test=[(artc,0) for artc in iter1_CNN_neg_text]

# Seed once, then shuffle each pool; the order of these shuffles determines
# the random state seen by the later shuffles, so it must not be rearranged.
random.seed(a=2)
for examples in (pos_examp,neg_examp_train,neg_examp_test):
    random.shuffle(examples)

# Fraction of each pool that goes into the training set.
percentage=0.8

cut_point=int(len(pos_examp)*percentage)
train_pos,test_pos=pos_examp[:cut_point],pos_examp[cut_point:]

cut_point=int(len(neg_examp_train)*percentage)
train_neg,test_neg=neg_examp_train[:cut_point],neg_examp_train[cut_point:]

print("train_pos",len(train_pos),"test_pos",len(test_pos))
print("train_neg",len(train_neg),"test_neg",len(test_neg))

# Merge positives and negatives, shuffle, then separate texts from labels.
XY_train=train_pos+train_neg
random.shuffle(XY_train)
X_train=[text for text,_ in XY_train]
Y_train=[label for _,label in XY_train]

XY_test=test_pos+test_neg
random.shuffle(XY_test)
X_test=[text for text,_ in XY_test]
Y_test=[label for _,label in XY_test]

print("X_train",len(X_train),"Y_train",len(Y_train),"X_test",len(X_test),"Y_test",len(Y_test))



train_pos 829 test_pos 208
train_neg 834 test_neg 209
X_train 1663 Y_train 1663 X_test 417 Y_test 417


In [6]:
# Convert both label lists to NumPy arrays.
Y_train, Y_test = np.array(Y_train), np.array(Y_test)

In [7]:
# Turn each document string into a single fixed-size vector by averaging the
# word vectors of its tokens:
#   document_vec = sum(word2vec[token] for token in tokenize(text)) / n_tokens

def document_vector(text, embeddings, dim=300):
    """Return the mean embedding of the tokens of ``text``.

    Parameters
    ----------
    text : str
        Document to tokenize with ``nltk.word_tokenize``.
    embeddings : dict
        Maps a token to its vector (a 1-D array of length ``dim``).
    dim : int
        Length of the embedding vectors; defaults to 300.

    Returns
    -------
    numpy.ndarray
        Average of the vectors of all tokens found in ``embeddings``.
        Tokens missing from ``embeddings`` are skipped instead of raising
        KeyError, and a document with no known tokens yields the zero
        vector instead of a NaN vector from a 0/0 division.
    """
    total=np.zeros(dim,)
    count=0
    for word in nltk.word_tokenize(text):
        vector=embeddings.get(word)
        if vector is None:
            continue
        total+=vector
        count+=1
    if count:
        total/=count
    return total


# One shared helper replaces the two copy-pasted loops for train and test.
X_train_vec = [document_vector(X, word2vec) for X in X_train]

X_test_vec = [document_vector(X, word2vec) for X in X_test]


In [8]:
def run_experiment_w_features(X_train,X_test,y_train,y_test,theclassifiers):
    """Fit each classifier class on the training data and print its test scores.

    Parameters
    ----------
    X_train, X_test
        Feature collections passed unchanged to ``fit`` and ``predict``.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.
    theclassifiers
        A classifier class, or a list/tuple of classifier classes. Each class
        is instantiated with its default arguments, except ``MLPClassifier``,
        which is created with ``max_iter=1000``.

    Returns
    -------
    None
        For every classifier this prints a colourised header line, the
        ``classification_report`` and the ``accuracy_score`` on the test set.
    """
    # Accept a single class as well as a list or tuple of classes.
    if not isinstance(theclassifiers,(list,tuple)):
        theclassifiers=[theclassifiers]

    for theclassifier in theclassifiers:
        # initialise classifier
        if theclassifier is MLPClassifier:
            clf=theclassifier(max_iter=1000)
        else:
            clf=theclassifier()

        # create model with training data
        model=clf.fit(X_train,y_train)
        # predict test set
        y_preds=model.predict(X_test)
        # create the report
        report=classification_report(y_test,y_preds)

        # Use the class name directly rather than parsing str(class) with a
        # regex: no dependency on `re`, and names containing digits work.
        clf_name=theclassifier.__name__

        # ANSI escapes: bold title, green feature name, blue classifier name.
        result_text="\033[1m Performance report of \033[0m \033[92m" + "word2vec" +"\033[0m "
        result_text+=" with \033[94m"+clf_name+"\033[0m"

        print(result_text)
        print(report)
        print(accuracy_score(y_test,y_preds))


In [9]:
# Classifier classes (not instances) to evaluate.
classifiers_list=[
    LinearSVC,
    GaussianNB,
    RandomForestClassifier,
    AdaBoostClassifier,
    MLPClassifier,
]

In [10]:
# run_experiment_w_features accepts a list of classifier classes and iterates
# over it itself, so no outer loop or per-call [:] copies are needed.
run_experiment_w_features(X_train_vec,X_test_vec,Y_train,Y_test,classifiers_list)
              precision    recall  f1-score 

# LinearSVC     0.94      0.94      0.94
# GaussianNB    0.93      0.92      0.92 
# RandomForest  0.93      0.92      0.92
# AdaBoost      0.94      0.94      0.94
# MLPClassifier 0.97      0.97      0.97

[1m Performance report of [0m [92mword2vec[0m  with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       209
           1       0.98      0.89      0.94       208

   micro avg       0.94      0.94      0.94       417
   macro avg       0.94      0.94      0.94       417
weighted avg       0.94      0.94      0.94       417

0.9400479616306955
[1m Performance report of [0m [92mword2vec[0m  with [94mGaussianNB[0m
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       209
           1       0.97      0.87      0.92       208

   micro avg       0.92      0.92      0.92       417
   macro avg       0.93      0.92      0.92       417
weighted avg       0.93      0.92      0.92       417

0.920863309352518
[1m Performance report of [0m [92mword2vec[0m  with [94mRandomForestClassifier[0m
              precision    recall  f1-score   support

           0   