In [34]:
import json
import nltk 
import string
import copy
import random
from os import listdir
import pickle



import numpy as np

import string
import re

from nltk.stem.porter import *
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier,LinearRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score


from pos_tagger import tag

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [35]:
# import sys
# !conda install --yes --prefix {sys.prefix} scikit-learn
# import zipfile
# zip_ref = zipfile.ZipFile("TurkishStemmer.zip", 'r')
# zip_ref.extractall("./")
# zip_ref.close()

In [36]:
# Load the pre-cleaned article dictionaries from disk. The context manager
# closes the file handle even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that were produced by a trusted source.
with open("./data/clean_data.p", "rb") as clean_data_file:
    all_dicts = pickle.load(clean_data_file)

# Pull the individual corpora out of the loaded dictionary.
article_text_dict_positive=all_dicts["article_text_dict_positive"]
iter1_BBC_text_dict_neg=all_dicts["iter1_BBC_text_dict_neg"]
iter2_BBC_text_dict_neg=all_dicts["iter2_BBC_text_dict_neg"]
iter1_CNN_neg_text=all_dicts["iter1_CNN_neg_text"]

In [37]:
# Report how many entries each loaded corpus holds, one count per line.
for corpus in (
    iter1_BBC_text_dict_neg,
    iter2_BBC_text_dict_neg,
    iter1_CNN_neg_text,
    article_text_dict_positive,
):
    print(len(corpus))

1030
1031
948
1036


In [38]:
# A space followed by the 29 lowercase letters of the Turkish alphabet
# (not referenced by any other cell visible here).
lowercase=' abcdefghijklmnoprstuvyzğöıüşç'
# Shared stemmer instance used by tokenize_stem.
stemmer = TurkishStemmer()

# Several alternative tokenization functions follow.

# The first one lowercases the text, tokenizes it with nltk,
# and applies stemming to each token.
def tokenize_stem(text):
    """Lowercase, tokenize and stem a piece of text.

    The text is lowercased and stripped of leading/trailing whitespace,
    split into tokens with nltk.word_tokenize, and every token is passed
    through the module-level stemmer. No tokens are filtered out: whatever
    nltk.word_tokenize emits is stemmed and returned.

    Returns a list with one stemmed string per token.
    """
    words = nltk.word_tokenize(text.lower().strip())
    return [stemmer.stem(word) for word in words]


# Lowercases the text,
# generates part-of-speech tags with the tagging library (which also tokenizes),
# and returns one "word+tag" string per token.
def pos_tokenize(text):
    """Tokenize text into "word+tag" strings using the POS tagger.

    The text is lowercased and stripped before being handed to tag(); the
    first two items of every entry that tag() returns are joined with "+".

    Returns a list with one "word+tag" string per tagged entry.
    """
    tagged = tag(text.lower().strip())
    return [entry[0] + "+" + entry[1] for entry in tagged]

# Lowercases the text
# and splits it into its individual characters.
def char_tokenize(text):
    """Return the characters of the lowercased, stripped text as a list.

    Inner whitespace is kept as its own token; only leading and trailing
    whitespace is removed before splitting.
    """
    return list(text.lower().strip())



In [39]:
# Maps a feature name to the tokenizer callable that produces it;
# run_experiment_w_features looks tokenizers up here by name.
features={
    "Pos_tags":pos_tokenize,
    "characters":char_tokenize,
    "tokenize":nltk.word_tokenize,
    "tokenize_stem":tokenize_stem,
}


In [54]:
def run_experiment_w_features(X_train,X_test,y_train,y_test,feature,n_range,theclassifiers,vectorizer=None):
    """Vectorize the texts, then train and evaluate each classifier on them.

    For every classifier a colored heading, the classification report and the
    accuracy on the test set are printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : sequences of raw documents handed to the vectorizer.
    y_train, y_test : labels aligned with X_train / X_test.
    feature : key into the module-level ``features`` dict. It selects the
        tokenizer when no vectorizer is supplied and is always shown in the
        printed heading.
    n_range : (min_n, max_n) pair passed as ``ngram_range`` and used to build
        the "1-gram and 2-gram ..." part of the heading.
    theclassifiers : a classifier class, or a list/tuple of classifier
        classes. Each class is instantiated with no arguments, except
        LinearSVC which gets ``max_iter=3000``.
    vectorizer : optional pre-configured vectorizer. When omitted, a
        TfidfVectorizer is built from ``feature`` and ``n_range``. It is
        fitted on X_train only.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(
            tokenizer=features[feature],
            ngram_range=n_range,
            use_idf=True,
            min_df=0.003,
            norm=None,
        )

    # Fit the vocabulary on the training texts only and reuse it for the test
    # texts. Dense arrays are used for every classifier (GaussianNB, for
    # example, does not accept sparse input).
    train_matrix = vectorizer.fit_transform(X_train).toarray()
    test_matrix = vectorizer.transform(X_test).toarray()

    # Accept a single classifier class as well as a list/tuple of them.
    if not isinstance(theclassifiers, (list, tuple)):
        theclassifiers = [theclassifiers]

    # "1-gram and 2-gram ..." part of the heading; identical for all classifiers.
    gram_labels = " and ".join(
        "\033[91m" + str(n) + "-gram\033[0m"
        for n in range(n_range[0], n_range[1] + 1)
    )

    for theclassifier in theclassifiers:
        # LinearSVC gets a larger iteration budget than its default.
        if theclassifier is LinearSVC:
            clf = theclassifier(max_iter=3000)
        else:
            clf = theclassifier()

        # Train on the training split and predict the test split.
        model = clf.fit(train_matrix, y_train)
        y_preds = model.predict(test_matrix)
        report = classification_report(y_test, y_preds)

        # Use the class name directly instead of parsing str(cls) with a
        # regex, which failed for names containing characters such as digits.
        classifier_name = getattr(theclassifier, "__name__", type(theclassifier).__name__)

        result_text = (
            "\033[1m Performance report of \033[0m \033[92m" + feature + "\033[0m "
            + gram_labels
            + " with \033[94m" + classifier_name + "\033[0m"
        )
        print(result_text)
        print(report)
        print(accuracy_score(y_test, y_preds))


In [55]:
# Pair every article with its label: 1 for positive examples, 0 for negatives.
pos_examp=[(artc,1) for artc in article_text_dict_positive.values()]
neg_examp_train=[(artc,0) for artc in iter2_BBC_text_dict_neg.values()]
neg_examp_test=[(artc,0) for artc in iter1_CNN_neg_text.values()]

# Fixed seed so the shuffles (and therefore the split) are repeatable.
# The three pools are shuffled in this order to keep the random stream stable.
random.seed(a=2)
for examples in (pos_examp,neg_examp_train,neg_examp_test):
    random.shuffle(examples)

# Fraction of each pool that goes to the training set.
percentage=0.8

cut_point=int(len(pos_examp)*percentage)
train_pos,test_pos=pos_examp[:cut_point],pos_examp[cut_point:]

# Both the negative training and test examples are cut from neg_examp_train;
# neg_examp_test is prepared above but not used in this split.
cut_point=int(len(neg_examp_train)*percentage)
train_neg,test_neg=neg_examp_train[:cut_point],neg_examp_train[cut_point:]

print("train_pos",len(train_pos),"test_pos",len(test_pos))
print("train_neg",len(train_neg),"test_neg",len(test_neg))

# Mix positives and negatives, then separate texts from labels.
XY_train=train_pos+train_neg
random.shuffle(XY_train)
X_train=[text for text,label in XY_train]
Y_train=[label for text,label in XY_train]

XY_test=test_pos+test_neg
random.shuffle(XY_test)
X_test=[text for text,label in XY_test]
Y_test=[label for text,label in XY_test]

print("X_train",len(X_train),"Y_train",len(Y_train),"X_test",len(X_test),"Y_test",len(Y_test))



train_pos 828 test_pos 208
train_neg 824 test_neg 207
X_train 1652 Y_train 1652 X_test 415 Y_test 415


In [56]:
# Classifier classes evaluated over every n-gram range.
classifiers_list=[
    LinearSVC,
    MultinomialNB,
    RandomForestClassifier,
    AdaBoostClassifier,
]

# Feature name -> tokenizer, in the order the experiments iterate over them.
tryfeatures={
    "tokenize":nltk.word_tokenize,
    "tokenize_stem":tokenize_stem,
    "Pos_tags":pos_tokenize,
    "characters":char_tokenize,
}



In [57]:
# Sanity check: size of the negative pool built from iter2_BBC_text_dict_neg.
len(neg_examp_train)


1031

In [58]:
# Every contiguous n-gram range (low, high) with 1 <= low <= high <= 3.
ngram_ranges=[(low,high) for low in range(1,4) for high in range(low,4)]

# Evaluate each classifier on each feature type and each n-gram range.
for clasifier in classifiers_list:
    for feature in tryfeatures:
        for n_range in ngram_ranges:
            run_experiment_w_features(X_train[:],X_test[:],Y_train[:],Y_test[:],feature,n_range,clasifier)


[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       207
           1       0.98      0.96      0.97       208

   micro avg       0.97      0.97      0.97       415
   macro avg       0.97      0.97      0.97       415
weighted avg       0.97      0.97      0.97       415

0.9686746987951808
[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       207
           1       0.98      0.97      0.97       208

   micro avg       0.97      0.97      0.97       415
   macro avg       0.97      0.97      0.97       415
weighted avg       0.97      0.97      0.97       415

0.9734939759036144
[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m and [91m2-gram[0m and [91m3-gram[0m wi



[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.91      0.84      0.87       207
           1       0.85      0.91      0.88       208

   micro avg       0.87      0.87      0.87       415
   macro avg       0.88      0.87      0.87       415
weighted avg       0.88      0.87      0.87       415

0.8746987951807229
[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.92      0.84      0.87       207
           1       0.85      0.92      0.88       208

   micro avg       0.88      0.88      0.88       415
   macro avg       0.88      0.88      0.88       415
weighted avg       0.88      0.88      0.88       415

0.8795180722891566




[1m Performance report of [0m [92mtokenize[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.90      0.58      0.70       207
           1       0.69      0.93      0.79       208

   micro avg       0.76      0.76      0.76       415
   macro avg       0.79      0.76      0.75       415
weighted avg       0.79      0.76      0.75       415

0.7566265060240964
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       207
           1       0.96      0.98      0.97       208

   micro avg       0.97      0.97      0.97       415
   macro avg       0.97      0.97      0.97       415
weighted avg       0.97      0.97      0.97       415

0.9686746987951808
[1m Performance report of [0m [92mtokenize_stem[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
        



[1m Performance report of [0m [92mtokenize_stem[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.86      0.63      0.72       207
           1       0.71      0.89      0.79       208

   micro avg       0.76      0.76      0.76       415
   macro avg       0.78      0.76      0.76       415
weighted avg       0.78      0.76      0.76       415

0.7614457831325301
[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       207
           1       0.98      0.96      0.97       208

   micro avg       0.97      0.97      0.97       415
   macro avg       0.97      0.97      0.97       415
weighted avg       0.97      0.97      0.97       415

0.9686746987951808
[1m Performance report of [0m [92mPos_tags[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
             



[1m Performance report of [0m [92mPos_tags[0m [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.92      0.84      0.87       207
           1       0.85      0.92      0.88       208

   micro avg       0.88      0.88      0.88       415
   macro avg       0.88      0.88      0.88       415
weighted avg       0.88      0.88      0.88       415

0.8795180722891566




[1m Performance report of [0m [92mPos_tags[0m [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.90      0.58      0.70       207
           1       0.69      0.93      0.79       208

   micro avg       0.76      0.76      0.76       415
   macro avg       0.79      0.76      0.75       415
weighted avg       0.79      0.76      0.75       415

0.7566265060240964




[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       207
           1       0.88      0.88      0.88       208

   micro avg       0.88      0.88      0.88       415
   macro avg       0.88      0.88      0.88       415
weighted avg       0.88      0.88      0.88       415

0.8819277108433735




[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m and [91m2-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       207
           1       0.91      0.96      0.93       208

   micro avg       0.93      0.93      0.93       415
   macro avg       0.93      0.93      0.93       415
weighted avg       0.93      0.93      0.93       415

0.9325301204819277
[1m Performance report of [0m [92mcharacters[0m [91m1-gram[0m and [91m2-gram[0m and [91m3-gram[0m with [94mLinearSVC[0m
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       207
           1       0.95      0.98      0.96       208

   micro avg       0.96      0.96      0.96       415
   macro avg       0.96      0.96      0.96       415
weighted avg       0.96      0.96      0.96       415

0.9614457831325302
[1m Performance report of [0m [92mcharacters[0m [91m2-gram

In [59]:
# These classifiers are only evaluated on single n-gram sizes (n, n).
slow_classifiers_list=[DecisionTreeClassifier,GaussianNB,MLPClassifier]
for clasifier in slow_classifiers_list:
    for feature in tryfeatures:
        for n in (1,2,3):
            run_experiment_w_features(X_train[:],X_test[:],Y_train[:],Y_test[:],feature,(n,n),clasifier)

[1m Performance report of [0m [92mtokenize[0m [91m1-gram[0m with [94mDecisionTreeClassifier[0m
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       207
           1       0.82      0.83      0.83       208

   micro avg       0.82      0.82      0.82       415
   macro avg       0.82      0.82      0.82       415
weighted avg       0.82      0.82      0.82       415

0.8240963855421687
[1m Performance report of [0m [92mtokenize[0m [91m2-gram[0m with [94mDecisionTreeClassifier[0m
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       207
           1       0.80      0.75      0.78       208

   micro avg       0.78      0.78      0.78       415
   macro avg       0.78      0.78      0.78       415
weighted avg       0.78      0.78      0.78       415

0.7831325301204819
[1m Performance report of [0m [92mtokenize[0m [91m3-gram[0m with [94mDecisionTreeClassifier[0m

In [None]:
# Read the saved experiment output; the context manager closes the file
# even if reading fails part-way.
with open("results/4-(BBC-CNN).txt") as myfile:
    lines=myfile.readlines()

In [None]:
# Peek at the second and third 9-line chunks of the results file
# (presumably one performance report each).
print(*lines[9:18])
print(*lines[18:27])
# [23:] drops the first 23 characters of the stripped heading line; the
# remainder is split in different ways to try out the parsing of a heading.
print(lines[18].strip()[23:].split(" ")[0])
print(lines[9].strip()[23:].split(" and "))
print(lines[9].strip()[23:].split(" ")[0])

In [None]:
# Turn the 9-line report records in `lines` into LaTeX table rows.
# NOTE: `classifiers_names` must already be defined when this cell runs; in the
# cells visible here it is only defined further down the notebook.
for start in range(0,len(lines),9):
    one_line=lines[start:start+9]
    # Heading of the record with its first 23 characters dropped.
    first_line=one_line[0].strip()[23:]
    ngrams=first_line.split(" and ")
    words=first_line.split(" ")

    if len(ngrams)==2:
        # "<feature> 1-gram and 2-gram with <classifier>"
        clasifier=first_line.split("with")[1].strip()
        ngram=[ngrams[0][-6:],ngrams[1][:6]]
    elif len(ngrams)==3:
        # "<feature> 1-gram and 2-gram and 3-gram with <classifier>"
        clasifier=first_line.split("with")[1].strip()
        ngram=[ngrams[0][-6:],ngrams[1],ngrams[2][:6]]
    else:
        # "<feature> 1-gram with <classifier>"
        ngram=[words[1]]
        clasifier=words[3]

    feature=words[0]

    # Scores are picked by fixed position from the space-split 7th line of
    # the record; the 9th line holds the accuracy.
    summary=one_line[6].split(" ")
    precision=summary[9]
    recall=summary[15]
    f1score=summary[21]
    accuracy=one_line[8].strip()

    # Keep only the number of every "N-gram" label.
    ngram=[label.split("-")[0] for label in ngram]

    # Only character features whose range starts at 2-grams are emitted.
    if feature=="characters" and ngram[0]=="2":
        print("\\hline")
        print(" & ".join([classifiers_names[clasifier],",".join(ngram)+"-gram",precision,recall,f1score])+" \\\\")


In [None]:
\hline
Linear SVM (SVC) & 1-gram & 0.95 & 0.95 & 0.95 & 0.97 & 0.96 & 0.96 &  \\
\hline
Linear SVM (SVC) & 1,2-gram & 0.96 & 0.96 & 0.96 & 0.98 & 0.98 & 0.98 &  \\
\hline
Linear SVM (SVC) & 1,2,3-gram & 0.97 & 0.97 & 0.97 & 0.98 & 0.98 & 0.98 &  \\
\hline
Multinomial NB & 1-gram & 0.95 & 0.94 & 0.94 & 0.97 & 0.96 & 0.96 &  \\
\hline
Multinomial NB & 1,2-gram & 0.96 & 0.96 & 0.96 & 0.98 & 0.97 & 0.97 &  \\
\hline
Multinomial NB & 1,2,3-gram & 0.96 & 0.96 & 0.96 & 0.98 & 0.97 & 0.97 &  \\
\hline
Random Forest & 1-gram & 0.90 & 0.89 & 0.89 & 0.93 & 0.92 & 0.92 &  \\
\hline
Random Forest & 1,2-gram & 0.91 & 0.90 & 0.90 & 0.94 & 0.93 & 0.93 &  \\
\hline
Random Forest & 1,2,3-gram & 0.90 & 0.89 & 0.89 & 0.92 & 0.90 & 0.90 &  \\
\hline
AdaBoost & 1-gram &0.88 & 0.88 & 0.88 & 0.26 & 0.46 & 0.33 &  \\
\hline
AdaBoost & 1,2-gram &0.88 & 0.88 & 0.88 & 0.26 & 0.46 & 0.33 &  \\
\hline
AdaBoost & 1,2,3-gram &0.88 & 0.88 & 0.88 & 0.26 & 0.46 & 0.33 &  \\
\hline
Decision Tree & 1-gram &0.80 & 0.79 & 0.79 & 0.92 & 0.91 & 0.91 &  \\
\hline
Gaussian NB & 1-gram &0.89 & 0.88 & 0.88 &  0.98 & 0.97 & 0.97 &  \\
\hline
Multi-layer Perc. (MLP) & 1-gram &0.97 & 0.97 & 0.97 & 0.27 & 0.51 & 0.35 &  \\

In [None]:
# Scratch check: start offsets produced when stepping through 10 items in steps of 9.
list(range(0,10,9))

In [None]:
# NOTE(review): BBC_BBC_lines is not defined in any cell visible here —
# presumably the readlines() output of another results file; confirm before running.
one_line=BBC_BBC_lines[9:9+9]

In [None]:
# Scratch check: how the heading (first 23 characters dropped) splits on "and".
one_line[0].strip()[23:].split("and")

In [None]:
# Show how every line that mentions "Forest" splits on "and".
for line in lines:
    if "Forest" not in line:
        continue
    print (line.split("and"))

In [None]:
# Maps a scikit-learn classifier class name to the label used in the LaTeX table.
classifiers_names={
    "LinearSVC":"Linear SVM (SVC)",
    "MultinomialNB":"Multinomial NB",
    "RandomForestClassifier":"Random Forest",
    "AdaBoostClassifier":"AdaBoost",
    "DecisionTreeClassifier":"Decision Tree",
    "GaussianNB":"Gaussian NB",
    "MLPClassifier":"Multi-layer Perc. (MLP)",
}

In [None]:
from nltk.corpus import stopwords
