In [1]:
import json
import nltk 
import string
import copy
import random
from os import listdir
import pickle



import numpy as np

import string
import re

from nltk.stem.porter import *
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier,LinearRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score


from pos_tagger import tag

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
all_dicts = pickle.load( open( "./data/clean_data.p", "rb" ) )

article_text_dict_positive=all_dicts["article_text_dict_positive"]
iter1_BBC_text_dict_neg=all_dicts["iter1_BBC_text_dict_neg"]
iter2_BBC_text_dict_neg=all_dicts["iter2_BBC_text_dict_neg"]
iter1_CNN_neg_text=all_dicts["iter1_CNN_neg_text"]

In [3]:
print("",len(iter1_BBC_text_dict_neg))
print("",len(iter2_BBC_text_dict_neg))
print("",len(iter1_CNN_neg_text))
print("",len(article_text_dict_positive))

 1038
 1043
 948
 1037


In [4]:
lowercase=' abcdefghijklmnoprstuvyzğöıüşç'
stemmer = TurkishStemmer()

# There are different tokenization functions

# first one is tokenizes with nltk, lowers text 
# and applies stemming to each word
def tokenize_stem(text):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems text. Returns a list of them"""
    text=nltk.word_tokenize(text.lower().strip())
    tokens = [stemmer.stem(t) for t in text]
    return tokens


# lowers text
# generates part of speech tags  
# returns "word+tag" 
# tokenization made with tagging library
def pos_tokenize(text):
    text=text.lower().strip()
    tokens=[]
    tags=tag(text)
    for a_tag in tags:
        tokens.append(a_tag[0]+"+"+a_tag[1])
    return tokens

# lowers text
# splits text to it's characters
def char_tokenize(text):
    text = text.lower().strip()
    tokens = [t for t in text]
    return tokens



In [5]:
# list of feature functions to be tested 
features={"Pos_tags":pos_tokenize,"characters":char_tokenize,"tokenize":nltk.word_tokenize,"tokenize_stem":tokenize_stem}


In [6]:
def run_experiment_w_features(X_train,X_test,y_train,y_test,feature,n_range,theclassifiers,vectorizer=None):
    if vectorizer==None:
        vectorizer = TfidfVectorizer(
            tokenizer= features[feature],
#             cleaning itself
#             preprocessor=word_process_clean,
            ngram_range=n_range,
            use_idf=True,
            min_df=0.003,
            norm=None, )
        
    
    X_train = vectorizer.fit_transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()

    if type(theclassifiers)!=list:
        theclassifiers=[theclassifiers]
    for theclassifier in theclassifiers:
#       initialise classifier
        clf= theclassifier()
#     create model with tranining data
        model = clf.fit(X_train, y_train)
#     predict test set
        y_preds = model.predict(X_test)
#     create the report
        report = classification_report( y_test, y_preds )
    
#     find name of the classifier for printing
        match=re.search(r"\.([A-z]*)'>",str(theclassifier))
        match=match.group(1)

        result_text="\033[1m Performance report of \033[0m \033[92m" + feature +"\033[0m "
        count=0
        for i in range(n_range[0],n_range[1]+1):
            if count!=0:
                result_text+=" and "
            result_text=result_text+"\033[91m"+str(i)+"-gram\033[0m"
            count+=1
        result_text+= " with \033[94m"+match+"\033[0m"
    #     print("Performance report of {} {}-gram".format(feature,n_range[1]))
        print (result_text)
#         prnt_scores(report)
        print(report)
        print(accuracy_score( y_test, y_preds))


In [7]:
pos_examp=[]
neg_examp_train=[]
neg_examp_test=[]

print("pos examples",len(article_text_dict_positive))
print("BBC",len(iter2_BBC_text_dict_neg))
print("CNN",len(iter1_CNN_neg_text))

# some articles includes short text 
for artc in article_text_dict_positive.values():
    pos_examp.append((artc,1))

for artc in iter2_BBC_text_dict_neg.values():
    neg_examp_train.append((artc,0))

for artc in iter1_CNN_neg_text.values():
    neg_examp_test.append((artc,0))

print("pos examples",len(pos_examp))
print("BBC",len(pos_examp))
print("CNN",len(neg_examp_test))

# combine positive and negative samples then shuffle
# XY=pos_examp+neg_examp_train
random.seed(a=2)
random.shuffle(pos_examp)
random.shuffle(neg_examp_train)
random.shuffle(neg_examp_test)

percentage=0.8
cut_point=int(len(pos_examp)*percentage)
train_pos=pos_examp[:cut_point]
test_pos=pos_examp[cut_point:]

cut_point=int(len(neg_examp_train)*percentage)
train_neg=neg_examp_train[:cut_point]
cut_point=int(len(neg_examp_test)*percentage)
test_neg=neg_examp_test[cut_point:]

print("train_pos",len(train_pos),"test_pos",len(test_pos))
print("train_neg",len(train_neg),"test_neg",len(test_neg))

XY_train=train_pos+train_neg
random.shuffle(XY_train)
X_train=[k[0] for k in XY_train]
Y_train=[k[1] for k in XY_train]

XY_test=test_pos+test_neg
random.shuffle(XY_test)
X_test=[k[0] for k in XY_test]
Y_test=[k[1] for k in XY_test]


print("X_train",len(X_train),"Y_train",len(Y_train),"X_test",len(X_test),"Y_test",len(Y_test))



pos examples 1037
BBC 1043
CNN 948
pos examples 1037
BBC 1037
CNN 948
train_pos 829 test_pos 208
train_neg 834 test_neg 190
X_train 1663 Y_train 1663 X_test 398 Y_test 398


In [8]:
classifiers_list=[LinearSVC,MultinomialNB,RandomForestClassifier,
                  AdaBoostClassifier]
tryfeatures={"tokenize":nltk.word_tokenize,"tokenize_stem":tokenize_stem,"Pos_tags":pos_tokenize,"characters":char_tokenize}



In [13]:
for clasifier in classifiers_list:
    for feature in tryfeatures:
        for i in range(1,4):
            for k in range(i,4):
                run_experiment_w_features(X_train[:],X_test[:],Y_train[:],Y_test[:],feature,(i,k),clasifier)




[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.94      0.70      0.80       190
           1       0.78      0.96      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.86      0.83      0.83       398
weighted avg       0.86      0.84      0.83       398

0.8366834170854272




[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.69      0.80       190
           1       0.77      0.97      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.87      0.83      0.83       398
weighted avg       0.86      0.84      0.83       398

0.8366834170854272




[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m and [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.69      0.80       190
           1       0.78      0.97      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.87      0.83      0.83       398
weighted avg       0.86      0.84      0.84       398

0.8391959798994975




[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.81      0.42      0.55       190
           1       0.63      0.91      0.75       208

   micro avg       0.68      0.68      0.68       398
   macro avg       0.72      0.66      0.65       398
weighted avg       0.72      0.68      0.65       398

0.6758793969849246




[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.83      0.42      0.56       190
           1       0.64      0.92      0.75       208

   micro avg       0.68      0.68      0.68       398
   macro avg       0.73      0.67      0.66       398
weighted avg       0.73      0.68      0.66       398

0.6834170854271356




[1m Performance report of [0m [92mtokenize[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.75      0.22      0.33       190
           1       0.57      0.93      0.70       208

   micro avg       0.59      0.59      0.59       398
   macro avg       0.66      0.57      0.52       398
weighted avg       0.65      0.59      0.53       398

0.5904522613065326
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.72      0.82       190
           1       0.79      0.98      0.87       208

   micro avg       0.85      0.85      0.85       398
   macro avg       0.88      0.85      0.85       398
weighted avg       0.87      0.85      0.85       398

0.8517587939698492
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
        



[1m Performance report of [0m [92mtokenize_stem[0m [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.86      0.55      0.67       190
           1       0.69      0.92      0.79       208

   micro avg       0.74      0.74      0.74       398
   macro avg       0.78      0.74      0.73       398
weighted avg       0.77      0.74      0.73       398

0.7437185929648241




[1m Performance report of [0m [92mtokenize_stem[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.86      0.55      0.67       190
           1       0.69      0.92      0.79       208

   micro avg       0.74      0.74      0.74       398
   macro avg       0.77      0.73      0.73       398
weighted avg       0.77      0.74      0.73       398

0.7412060301507538




[1m Performance report of [0m [92mtokenize_stem[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.71      0.28      0.40       190
           1       0.58      0.89      0.70       208

   micro avg       0.60      0.60      0.60       398
   macro avg       0.64      0.59      0.55       398
weighted avg       0.64      0.60      0.56       398

0.6005025125628141




[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.94      0.70      0.80       190
           1       0.78      0.96      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.86      0.83      0.83       398
weighted avg       0.86      0.84      0.83       398

0.8366834170854272




[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.69      0.80       190
           1       0.77      0.97      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.87      0.83      0.83       398
weighted avg       0.86      0.84      0.83       398

0.8366834170854272




[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m and [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.69      0.80       190
           1       0.78      0.97      0.86       208

   micro avg       0.84      0.84      0.84       398
   macro avg       0.87      0.83      0.83       398
weighted avg       0.86      0.84      0.84       398

0.8391959798994975




[1m Performance report of [0m [92mPos_tags[0m [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.81      0.42      0.55       190
           1       0.63      0.91      0.75       208

   micro avg       0.68      0.68      0.68       398
   macro avg       0.72      0.66      0.65       398
weighted avg       0.72      0.68      0.65       398

0.6758793969849246




[1m Performance report of [0m [92mPos_tags[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.83      0.42      0.56       190
           1       0.64      0.92      0.75       208

   micro avg       0.68      0.68      0.68       398
   macro avg       0.73      0.67      0.66       398
weighted avg       0.73      0.68      0.66       398

0.6834170854271356




[1m Performance report of [0m [92mPos_tags[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.75      0.22      0.33       190
           1       0.57      0.93      0.70       208

   micro avg       0.59      0.59      0.59       398
   macro avg       0.66      0.57      0.52       398
weighted avg       0.65      0.59      0.53       398

0.5904522613065326




[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       190
           1       0.76      0.84      0.80       208

   micro avg       0.78      0.78      0.78       398
   macro avg       0.78      0.78      0.78       398
weighted avg       0.78      0.78      0.78       398

0.7788944723618091




[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.94      0.61      0.74       190
           1       0.73      0.97      0.83       208

   micro avg       0.79      0.79      0.79       398
   macro avg       0.84      0.79      0.78       398
weighted avg       0.83      0.79      0.79       398

0.7939698492462312




[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m and [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.64      0.77       190
           1       0.75      0.98      0.85       208

   micro avg       0.81      0.81      0.81       398
   macro avg       0.85      0.81      0.81       398
weighted avg       0.85      0.81      0.81       398

0.8140703517587939




[1m Performance report of [0m [92mcharacters[0m [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.92      0.63      0.75       190
           1       0.74      0.95      0.83       208

   micro avg       0.80      0.80      0.80       398
   macro avg       0.83      0.79      0.79       398
weighted avg       0.83      0.80      0.79       398

0.7964824120603015




[1m Performance report of [0m [92mcharacters[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.95      0.64      0.77       190
           1       0.75      0.97      0.85       208

   micro avg       0.81      0.81      0.81       398
   macro avg       0.85      0.81      0.81       398
weighted avg       0.85      0.81      0.81       398

0.8140703517587939
[1m Performance report of [0m [92mcharacters[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.97      0.66      0.79       190
           1       0.76      0.98      0.86       208

   micro avg       0.83      0.83      0.83       398
   macro avg       0.87      0.82      0.82       398
weighted avg       0.86      0.83      0.82       398

0.8291457286432161
[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m with [94mMultinomialNB[0m
          

[1m Performance report of [0m [92mPos_tags[0m [91m3-gram[0m with [94mMultinomialNB[0m
              precision    recall  f1-score   support

           0       0.76      0.35      0.48       190
           1       0.60      0.90      0.72       208

   micro avg       0.64      0.64      0.64       398
   macro avg       0.68      0.62      0.60       398
weighted avg       0.68      0.64      0.60       398

0.635678391959799
[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m with [94mMultinomialNB[0m
              precision    recall  f1-score   support

           0       0.73      0.77      0.75       190
           1       0.78      0.75      0.76       208

   micro avg       0.76      0.76      0.76       398
   macro avg       0.76      0.76      0.76       398
weighted avg       0.76      0.76      0.76       398

0.7587939698492462
[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m and [91m2-gram[0m with [94mMultinomialNB[0m
   

[1m Performance report of [0m [92mtokenize_stem[0m [91m3-gram[0m with [94mRandomForestClassifier[0m
              precision    recall  f1-score   support

           0       0.74      0.29      0.42       190
           1       0.58      0.91      0.71       208

   micro avg       0.61      0.61      0.61       398
   macro avg       0.66      0.60      0.56       398
weighted avg       0.66      0.61      0.57       398

0.6130653266331658
[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m with [94mRandomForestClassifier[0m
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       190
           1       0.85      0.86      0.85       208

   micro avg       0.85      0.85      0.85       398
   macro avg       0.85      0.85      0.85       398
weighted avg       0.85      0.85      0.85       398

0.8467336683417085
[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m and [91m2-gram[0m with [94mR

[1m Performance report of [0m [92mtokenize[0m [91m3-gram[0m with [94mAdaBoostClassifier[0m
              precision    recall  f1-score   support

           0       0.89      0.08      0.15       190
           1       0.54      0.99      0.70       208

   micro avg       0.56      0.56      0.56       398
   macro avg       0.72      0.54      0.43       398
weighted avg       0.71      0.56      0.44       398

0.5577889447236181
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m with [94mAdaBoostClassifier[0m
              precision    recall  f1-score   support

           0       0.83      0.61      0.70       190
           1       0.71      0.88      0.79       208

   micro avg       0.75      0.75      0.75       398
   macro avg       0.77      0.74      0.74       398
weighted avg       0.77      0.75      0.75       398

0.7512562814070352
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m and [91m2-gram[0m with [94mAdaB

[1m Performance report of [0m [92mcharacters[0m [91m3-gram[0m with [94mAdaBoostClassifier[0m
              precision    recall  f1-score   support

           0       0.91      0.45      0.60       190
           1       0.66      0.96      0.78       208

   micro avg       0.72      0.72      0.72       398
   macro avg       0.78      0.70      0.69       398
weighted avg       0.78      0.72      0.69       398

0.7160804020100503


In [14]:
slow_classifiers_list=[DecisionTreeClassifier,GaussianNB,MLPClassifier]
for clasifier in slow_classifiers_list:
    for feature in tryfeatures:
        for i in range(1,4):
            run_experiment_w_features(X_train[:],X_test[:],Y_train[:],Y_test[:],feature,(i,i),clasifier)

[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m with [94mDecisionTreeClassifier[0m
              precision    recall  f1-score   support

           0       0.76      0.61      0.68       190
           1       0.70      0.83      0.76       208

   micro avg       0.72      0.72      0.72       398
   macro avg       0.73      0.72      0.72       398
weighted avg       0.73      0.72      0.72       398

0.7236180904522613
[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m with [94mDecisionTreeClassifier[0m
              precision    recall  f1-score   support

           0       0.64      0.47      0.54       190
           1       0.61      0.75      0.67       208

   micro avg       0.62      0.62      0.62       398
   macro avg       0.62      0.61      0.61       398
weighted avg       0.62      0.62      0.61       398

0.6180904522613065
[1m Performance report of [0m [92mtokenize[0m [91m3-gram[0m with [94mDecisionTreeClassifier[0m

[1m Performance report of [0m [92mPos_tags[0m [91m2-gram[0m with [94mGaussianNB[0m
              precision    recall  f1-score   support

           0       0.78      0.55      0.64       190
           1       0.68      0.86      0.76       208

   micro avg       0.71      0.71      0.71       398
   macro avg       0.73      0.70      0.70       398
weighted avg       0.73      0.71      0.70       398

0.7110552763819096
[1m Performance report of [0m [92mPos_tags[0m [91m3-gram[0m with [94mGaussianNB[0m
              precision    recall  f1-score   support

           0       0.83      0.21      0.33       190
           1       0.57      0.96      0.72       208

   micro avg       0.60      0.60      0.60       398
   macro avg       0.70      0.58      0.52       398
weighted avg       0.69      0.60      0.53       398

0.6005025125628141
[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m with [94mGaussianNB[0m
              precision    recall

In [15]:
# y_preds,y_test=run_experiment_w_features(X_train[:],X_test[:],Y_train[:],Y_test[:],"tokenize",(3,3),DecisionTreeClassifier)



