# **Exercise 0: Preprocessing Text Data on 20newsgroups Dataset**


1.   Remove punctuation, stop-words
2.   Bag-of-words feature representation
3.   TF-IDF feature representation
4.   Split the dataset randomly into train/validation/test splits 80%-10%-10%

In [None]:
import numpy as np
import pandas as pd
import os
import operator
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
import math
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.model_selection import GridSearchCV
np.random.seed(3116)

In [None]:
# Root directory of the extracted 20 Newsgroups corpus; the cells below read one sub-folder per newsgroup from it.
path20 = 'C:/Users/user/20_newsgroups/'

In [None]:
# List the newsgroup folders under the corpus root. Sorting keeps the
# index -> folder mapping stable, because os.listdir order is arbitrary.
# The root comes from path20 so the location is configured in one place only.
foldernames = sorted(os.listdir(path20))
categories = [13, 1]    # index of required folder names
foldernames = [foldernames[x] for x in categories]
print(foldernames)

['sci.med', 'comp.graphics']


In [None]:
data = {}           # dict: folder name -> list of raw document texts
for f in foldernames:
    folder_path = os.path.join(path20, f)
    data[f] = []
    # os.listdir returns entries in arbitrary order; sort them so the document
    # order (and therefore the seeded shuffle/split below) is reproducible.
    for docs in sorted(os.listdir(folder_path)):
        # latin-1 maps every byte to a character, so decoding never fails.
        with open(os.path.join(folder_path, docs), encoding='latin-1') as doc_open:
            data[f].append(doc_open.read())

# Report how many documents were loaded per selected folder.
for i, folder in enumerate(foldernames):
    print("length of data", i, len(data[folder]))

length of data 0 1000
length of data 1 1000


## *Remove punctuation, stop-words*

In [None]:
# Base stop-word list: NLTK's English stop words followed by every
# punctuation character from string.punctuation.
punctuation_list = [symbol for symbol in punctuation]
stop_words = stopwords.words('english')
stop_words.extend(punctuation_list)

# Additional stop words: header fields and common, uninformative words
# that appear across the documents of both newsgroups.
stop_words.extend([
    'newsgroups:', 'sci.med', 'computer', 'graphics', 'comp.graphics',
    'subject:', 'from:', 'lines:', 'path:', 'organization:',
    'date:', 'would', 'writes:', 'references:', 'message-id:',
    'article', 'sender:', 'nntp-posting-host:', 'people',
    'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu',
    'could', 'distribution:', 'first',
    'anyone', 'really', 'since', 'still',
    "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'",
])

## *Bag-of-words feature representation*

In [None]:
words_all = {}                 # Bag of Words: lower-cased word -> corpus-wide count
# Set lookup is O(1); scanning the stop_words list for every token is not.
stop_word_set = set(stop_words)
for texts in data.values():
    for text in texts:
        for w in text.split():   # whitespace-separated tokens of the document
            token = w.lower()    # lower-case once instead of on every use
            # A token counts as a word when it is purely alphabetic, has at
            # least 4 characters and is not a stop word.
            if len(token) >= 4 and w.isalpha() and token not in stop_word_set:
                words_all[token] = words_all.get(token, 0) + 1

print(len(words_all))

18595


In [None]:
# Rank (word, count) pairs by descending count; per the author's note, 7040 words appear only once.
words_all_sorted = sorted(words_all.items(), key=operator.itemgetter(1), reverse=True)
print(words_all_sorted[:5])

[('image', 1040), ('like', 886), ('also', 871), ('know', 821), ('many', 604)]


In [None]:
# Feature list: vocabulary words ordered from most to least frequent.
word_list = [
    word
    for word, _ in sorted(words_all.items(), key=lambda t: t[1], reverse=True)
]

print("Most common 5 words among documents are: ", word_list[:5])

Most common 5 words among documents are:  ['image', 'like', 'also', 'know', 'many']


## *TF-IDF feature representation*

In [None]:
# Combined corpus: every document of foldernames[0] followed by every document of foldernames[1].
data1 = data[foldernames[0]] + data[foldernames[1]]

In [None]:
# Sanity check of the concatenation offset: with 1000 documents in the first
# folder, document 1 of the second folder sits at index 1001 of data1.
data[foldernames[1]][1] == data1[1001]

True

In [None]:
# TF-IDF features restricted to the bag-of-words vocabulary, so column j of
# the result corresponds to word_list[j].
# NOTE(review): scikit-learn emits an "inconsistent stop_words" warning here.
# Since the vocabulary is fixed and already excludes the stop words, the
# stop_words argument is presumably redundant — confirm before removing it.
tf_idf_vector = TfidfVectorizer(stop_words=stop_words, vocabulary=word_list)
tf_idfX = tf_idf_vector.fit_transform(data1)
tf_idfX.shape

  'stop_words.' % sorted(inconsistent))


(2000, 18595)

In [None]:
# Dense documents-by-words table of the TF-IDF weights, with one column per vocabulary word.
tfidf_words = pd.DataFrame(tf_idfX.toarray(), columns=word_list)

In [None]:
# Display the TF-IDF table (notebook cell output).
tfidf_words

Unnamed: 0,image,like,also,know,many,file,jpeg,available,medical,information,...,skylane,polyon,martens,ralcgm,gplot,allegories,hide,ginsberg,zheng,decor
0,0.000000,0.000000,0.000000,0.034327,0.000000,0.000000,0.0,0.052356,0.0,0.045559,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.006198,0.000000,0.011959,0.000000,0.000000,0.0,0.063838,0.0,0.007936,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.046641,0.000000,0.014999,0.019674,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.115359,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1996,0.000000,0.000000,0.000000,0.029624,0.000000,0.095283,0.0,0.000000,0.0,0.000000,...,0.000000,0.105381,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1997,0.083751,0.000000,0.000000,0.058895,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1998,0.000000,0.000000,0.070188,0.030908,0.000000,0.000000,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.115889,0.317195,0.115889,0.115889,0.115889,0.109947,0.000000,0.000000


## *Split the dataset randomly into train/validation/test splits 80%-10%-10%*

In [None]:
# Scale the TF-IDF weights by 100. The values stay floats; the Naive Bayes
# scorer later truncates them with int(...) to use them as pseudo word counts,
# and the scaling keeps two decimal digits of the weight through that truncation.
# Multiplying into a new array (instead of `X *= 100`) leaves tfidf_words
# unchanged and also works when .values is a read-only view (pandas Copy-on-Write).
X = tfidf_words.values * 100
print(X)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.61980754  0.         ...  0.          0.
   0.        ]
 ...
 [ 8.37509113  0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          7.01884416 ... 10.99472469  0.
   0.        ]
 [ 0.          0.          0.         ...  0.         24.34977875
  24.34977875]]


In [None]:
# Target vector: one newsgroup name per document, folder by folder, in the
# same order in which the documents were concatenated into data1.
Y = np.array([
    label
    for label in foldernames[:len(data)]
    for _ in data[label]
])

In [None]:
# Report the shapes of the feature matrix and the label vector.
for caption, array in (("X dimension:", X), ("Y dimension:", Y)):
    print(caption, array.shape)

X dimension: (2000, 18595)
Y dimension: (2000,)


In [None]:
# Shuffle features and labels together so that rows stay aligned with their
# labels; the fixed random_state makes the permutation repeatable.
X, Y = shuffle(np.array(X), np.array(Y), random_state=3116)   #shuffle dataset

In [None]:
def train(data, fraction1, fraction2):
    """Return the training split of ``data``.

    The training split is the leading ``ceil(fraction1 * len(data))`` items.

    Args:
        data: Any sliceable sequence (list, string, NumPy array, ...).
        fraction1: Fraction of the data at which the training split ends
            (e.g. ``0.8`` for an 80% training split).
        fraction2: End of the validation split. Not needed here; accepted
            only so that ``train``, ``val`` and ``test`` share one signature.

    Returns:
        ``data[:ceil(fraction1 * len(data))]``.
    """
    cut = math.ceil(fraction1 * len(data))
    return data[:cut]

def val(data, fraction1, fraction2):
    """Return the validation split of ``data``.

    The split covers the items from index ``ceil(fraction1 * len(data))``
    up to (but excluding) index ``ceil(fraction2 * len(data))``.

    Args:
        data: Any sliceable sequence.
        fraction1: Fraction of the data at which the validation split starts.
        fraction2: Fraction of the data at which the validation split ends.

    Returns:
        The slice of ``data`` between the two boundaries.
    """
    size = len(data)
    start, stop = (math.ceil(frac * size) for frac in (fraction1, fraction2))
    return data[start:stop]

def test(data, fraction1, fraction2):
    """Return the test split of ``data``.

    The test split is everything from index ``ceil(fraction2 * len(data))``
    to the end.

    Args:
        data: Any sliceable sequence.
        fraction1: End of the training split. Not needed here; accepted only
            so that ``train``, ``val`` and ``test`` share one signature.
        fraction2: Fraction of the data at which the test split starts
            (e.g. ``0.9`` for a trailing 10% test split).

    Returns:
        ``data[ceil(fraction2 * len(data)):]``.
    """
    cut = math.ceil(fraction2 * len(data))
    return data[cut:]

In [None]:
# Carve the shuffled features and labels into 80% / 10% / 10% partitions;
# X and Y use the same boundaries, so rows and labels stay aligned.
split_fractions = (0.8, 0.9)
trainX, trainY = train(X, *split_fractions), train(Y, *split_fractions)
valX, valY = val(X, *split_fractions), val(Y, *split_fractions)
testX, testY = test(X, *split_fractions), test(Y, *split_fractions)

In [None]:
# Report the shape of every partition.
for split_name, part_x, part_y in (
    ("Train:", trainX, trainY),
    ("Validation:", valX, valY),
    ("Test:", testX, testY),
):
    print(split_name, "X:", part_x.shape, "Y:", part_y.shape)

Train: X: (1600, 18595) Y: (1600,)
Validation: X: (200, 18595) Y: (200,)
Test: X: (200, 18595) Y: (200,)


# **Exercise 1: Implementing Naive Bayes Classifier for Text Data to categorize news items**

In [None]:
def fit_nb(dataX, dataY, vocabulary=None):
    """Collect the per-class feature totals used by the Naive Bayes scorer.

    Args:
        dataX: 2-D array with one row per document and one column per
            vocabulary word (column ``j`` belongs to ``vocabulary[j]``).
        dataY: 1-D array with the class label of every row of ``dataX``.
        vocabulary: Sequence of feature names. Defaults to the module-level
            ``word_list`` so existing two-argument calls keep working.

    Returns:
        dict with the key ``"data_total"`` (number of documents) and one key
        per class label. Each class maps to a dict holding, for every
        vocabulary word, the summed feature value over the class's rows, plus
        ``"word_total"``, the sum of those values over the whole vocabulary.

    Raises:
        ValueError: if ``dataX`` has fewer columns than ``vocabulary`` has words.
    """
    if vocabulary is None:
        vocabulary = word_list
    dataX = np.asarray(dataX)
    dataY = np.asarray(dataY)
    n_words = len(vocabulary)

    output = {"data_total": len(dataY)}
    # np.unique yields the labels in sorted order, so the layout of the result
    # (and tie-breaking in later prediction) does not depend on string hashing.
    for label_curr in np.unique(dataY):
        class_rows = dataX[dataY == label_curr]
        # One vectorised column sum replaces two Python-level sums per word.
        word_sums = class_rows[:, :n_words].sum(axis=0)
        if len(word_sums) < n_words:
            raise ValueError(
                "dataX has %d columns but the vocabulary has %d words"
                % (len(word_sums), n_words)
            )
        class_stats = dict(zip(vocabulary, word_sums))
        class_stats["word_total"] = word_sums.sum()
        output[label_curr] = class_stats
    return output

In [None]:
def log_prob(x, dict_train, class_curr, vocabulary=None):
    """Return the unnormalised log-score of document ``x`` under ``class_curr``.

    The score is a class term plus, for every vocabulary word, ``int(x[i])``
    times the Laplace-smoothed log-probability of that word in the class:
    ``log(count + 1) - log(word_total + len(vocabulary))``.

    Args:
        x: Feature vector of one document; ``x[i]`` belongs to ``vocabulary[i]``
            and is truncated with ``int`` to act as a word count.
        dict_train: Statistics in the layout produced by ``fit_nb``.
        class_curr: Key of the class to score.
        vocabulary: Sequence of feature names. Defaults to the module-level
            ``word_list`` so existing three-argument calls keep working.

    Returns:
        The log-score as a float; higher means the class fits better.
    """
    if vocabulary is None:
        vocabulary = word_list
    class_stats = dict_train[class_curr]

    # NOTE(review): the class term uses the class's summed feature weight
    # ("word_total") over the number of documents, not the class's document
    # count — confirm whether a document-count prior was intended.
    output = np.log(class_stats["word_total"]) - np.log(dict_train["data_total"])

    # The smoothing denominator is the same for every word: compute it once.
    log_denominator = np.log(class_stats["word_total"] + len(vocabulary))
    for i, word in enumerate(vocabulary):
        repeats = int(x[i])
        if repeats <= 0:
            continue  # absent words add nothing, so skip their log computation
        output += repeats * (np.log(class_stats[word] + 1) - log_denominator)
    return output


def pred_doc(x, dict_train):
    """Return the class whose log-score for document ``x`` is highest.

    Every key of ``dict_train`` except ``"data_total"`` is treated as a class.
    On a tie the class that comes first in ``dict_train`` wins. If there is no
    class at all, ``-1`` is returned.
    """
    candidates = [label for label in dict_train if label != "data_total"]
    if not candidates:
        return -1

    scores = [log_prob(x, dict_train, label) for label in candidates]
    winner = 0
    for idx in range(1, len(scores)):
        # Strict comparison: an equal score never displaces an earlier class.
        if scores[idx] > scores[winner]:
            winner = idx
    return candidates[winner]


def pred_target(testX, dict_train):
    """Predict a class for every document in ``testX``.

    Args:
        testX: Iterable of document feature vectors.
        dict_train: Statistics in the layout produced by ``fit_nb``.

    Returns:
        list with one predicted class per document, in input order.
    """
    return [pred_doc(x, dict_train) for x in testX]


def score(pred_y, trueY):
    """Return the mean accuracy of ``pred_y`` against ``trueY``.

    Predictions are compared position by position with the true labels, and
    the number of matches is divided by ``len(pred_y)``. An empty ``pred_y``
    therefore raises ``ZeroDivisionError``.
    """
    hits = sum(1 for idx, guess in enumerate(pred_y) if guess == trueY[idx])
    return hits / len(pred_y)

In [None]:
# Fit on the training split. Despite its name, predictions_dict holds the
# per-class word totals returned by fit_nb, not predictions.
predictions_dict=fit_nb(trainX,trainY)

In [None]:
# Predict a newsgroup for every document of the test split.
predY=pred_target(testX,predictions_dict)

In [None]:
# Mean accuracy of the Naive Bayes predictions on the test split.
test_accuracy = score(predY, testY)
print("Bayesian test_accuracy=", test_accuracy)

Bayesian test_accuracy= 0.98


# **Exercise 2: Implementing SVM Classifier via Scikit-Learn by tuning the different SVM kernel**

##### The SVM kernel choices are 'rbf', 'poly', 'sigmoid' and 'linear', tried with the SVM classifiers SVC, NuSVC and LinearSVC

## SVC

In [None]:
#Hyperparameter selections
C_list = [1, 10, 20, 50]  #Regularization parameter
kernels_list = ['rbf','poly','sigmoid','linear']    #SVC kernels
gamma_list = ['scale', 'auto']
decisionfs_list = ['ovo', 'ovr']

In [None]:
# Empty result table: one row per kernel will hold that kernel's best hyperparameters.
hyperparameter_list = pd.DataFrame(columns=["C", "decision_function_shape", "gamma", "kernel"])
# Best cross-validation score per kernel, in the same order as the table rows.
val_scores = []

In [None]:
# Run one grid search per kernel so that the best setting of every kernel is
# recorded, not only the overall winner.
# NOTE(review): the search cross-validates (cv=3) on the validation split
# only; the training split is not used for tuning — confirm this is intended.
for k in kernels_list:
    svc_lib = SVC()
    parameters = {'C': C_list, 'kernel': [k], 'gamma': gamma_list, 'decision_function_shape': decisionfs_list}

    grid_search = GridSearchCV(svc_lib, parameters, n_jobs=-1, cv=3)
    grid_search.fit(valX, valY)
    print(grid_search.best_params_)
    print(grid_search.best_score_)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the row and keeps the table's existing column order.
    hyperparameter_list = pd.concat(
        [hyperparameter_list, pd.DataFrame([grid_search.best_params_])],
        ignore_index=True,
    )
    val_scores.append(grid_search.best_score_)

{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'rbf'}
0.94
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'poly'}
0.63
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'sigmoid'}
0.945
{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}
0.945


In [None]:
# Attach each kernel's best cross-validation score as a column and display the table.
hyperparameter_list["val_scores"] = val_scores
hyperparameter_list

Unnamed: 0,C,decision_function_shape,gamma,kernel,val_scores
0,10,ovo,auto,rbf,0.94
1,10,ovo,scale,poly,0.63
2,10,ovo,scale,sigmoid,0.945
3,1,ovo,scale,linear,0.945


In [None]:
# Remove the score column again so the table holds hyperparameters only.
hyperparameter_list = hyperparameter_list.drop(['val_scores'], axis=1)

In [None]:
print("Test Accuracy Scores given SVC kernels with their best hyperparamaters")
test_accuracy_scores = []   # test accuracy per kernel, in table row order

# Refit every kernel's best configuration on the training split and report
# its accuracy on the test split.
for _, best in hyperparameter_list.iterrows():
    # Read values by column label: positional access such as row[3] on a
    # label-indexed Series is deprecated and breaks if columns are reordered.
    print("For Kernel=", best["kernel"])    #SVM kernel choice
    clf = SVC(
        C=best["C"],
        decision_function_shape=best["decision_function_shape"],
        gamma=best["gamma"],
        kernel=best["kernel"],
    )
    clf.fit(trainX, trainY)
    acc = clf.score(testX, testY)      #report test accuracy
    test_accuracy_scores.append(acc)
    print("Test accuracy is=", acc)

Test Accuracy Scores given SVC kernels with their best hyperparamaters
For Kernel= rbf
Test accuracy is= 0.975
For Kernel= poly
Test accuracy is= 0.87
For Kernel= sigmoid
Test accuracy is= 0.975
For Kernel= linear
Test accuracy is= 0.98


In [None]:
#Looking at the best accuracy score in gridsearch: best hyperparameter option is as follows /Extra

In [None]:
# Re-attach the cross-validation scores so the best row can be selected by score, then display the table.
hyperparameter_list["val_scores"] = val_scores
hyperparameter_list

Unnamed: 0,C,decision_function_shape,gamma,kernel,val_scores
0,10,ovo,auto,rbf,0.94
1,10,ovo,scale,poly,0.63
2,10,ovo,scale,sigmoid,0.945
3,1,ovo,scale,linear,0.945


In [None]:
# np.argmax returns the position of the first maximum, so when several kernels
# share the top score the earliest row in the table is chosen.
params_best=hyperparameter_list.iloc[np.argmax(hyperparameter_list.val_scores),:] #find best hyperparameters according to highest accuracy
print("Best hyperparameters:\n", params_best)

Best hyperparameters:
 C                               10
decision_function_shape        ovo
gamma                        scale
kernel                     sigmoid
val_scores                   0.945
Name: 2, dtype: object


In [None]:
#Fitting these best hyperparameters to SVM classifier

In [None]:
# Fit an SVC with the selected hyperparameters. Values are read by label:
# positional access (params_best[0]) on a label-indexed Series is deprecated
# and would silently pick the wrong field if the column order changed.
clf = SVC(
    C=params_best["C"],
    decision_function_shape=params_best["decision_function_shape"],
    gamma=params_best["gamma"],
    kernel=params_best["kernel"],
)
clf.fit(trainX, trainY)   #train data to fit model
best_hyp_score = clf.score(testX, testY)
print("Test data accuracy using best hyperparameters found from validation is= ", best_hyp_score)

Test data accuracy using best hyperparameters found from validation is=  0.975


## NuSVC

In [None]:
# Hyperparameter candidates for NuSVC
nu_list = [0.1, 0.5, 0.9]  #Candidates for NuSVC's nu parameter
kernels_list = ['rbf','poly','sigmoid','linear']    #NuSVC kernels
gamma_list = ['scale', 'auto']  #Candidates for gamma (kernel coefficient) setting
decisionfs_list = ['ovo', 'ovr']  #Candidates for decision_function_shape

In [None]:
# Empty result table for NuSVC: one row per kernel with that kernel's best hyperparameters.
hyperparameter_list2 = pd.DataFrame(columns=["nu", "decision_function_shape", "gamma", "kernel"])
# Reset the score list so it holds the NuSVC cross-validation scores only.
val_scores = []

In [None]:
# Run one grid search per kernel so that the best setting of every kernel is
# recorded, not only the overall winner.
# NOTE(review): the search cross-validates (cv=3) on the validation split
# only; the training split is not used for tuning — confirm this is intended.
for k in kernels_list:
    svc_lib = NuSVC()
    parameters = {'nu': nu_list, 'kernel': [k], 'gamma': gamma_list, 'decision_function_shape': decisionfs_list}

    grid_search2 = GridSearchCV(svc_lib, parameters, n_jobs=-1, cv=3)
    grid_search2.fit(valX, valY)
    print(grid_search2.best_params_)
    print(grid_search2.best_score_)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the row and keeps the table's existing column order.
    hyperparameter_list2 = pd.concat(
        [hyperparameter_list2, pd.DataFrame([grid_search2.best_params_])],
        ignore_index=True,
    )
    val_scores.append(grid_search2.best_score_)

{'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.1}
0.94
{'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'poly', 'nu': 0.1}
0.63
{'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'sigmoid', 'nu': 0.1}
0.945
{'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear', 'nu': 0.1}
0.945


In [None]:
# Display the best NuSVC hyperparameters per kernel (notebook cell output).
hyperparameter_list2

Unnamed: 0,nu,decision_function_shape,gamma,kernel
0,0.1,ovo,auto,rbf
1,0.1,ovo,scale,poly
2,0.1,ovo,scale,sigmoid
3,0.1,ovo,scale,linear


In [None]:
print("Test Accuracy Scores given NuSVC kernels with their best hyperparamaters")
test_accuracy_scores = []   # test accuracy per kernel, in table row order

# Refit every kernel's best configuration on the training split and report
# its accuracy on the test split.
for _, best in hyperparameter_list2.iterrows():
    # Read values by column label: positional access such as row[3] on a
    # label-indexed Series is deprecated and breaks if columns are reordered.
    print("For Kernel=", best["kernel"])    #SVM kernel choice
    clf = NuSVC(
        nu=best["nu"],
        decision_function_shape=best["decision_function_shape"],
        gamma=best["gamma"],
        kernel=best["kernel"],
    )
    clf.fit(trainX, trainY)
    acc = clf.score(testX, testY)      #report test accuracy
    test_accuracy_scores.append(acc)
    print("Test accuracy is=", acc)

Test Accuracy Scores given NuSVC kernels with their best hyperparamaters
For Kernel= rbf
Test accuracy is= 0.975
For Kernel= poly
Test accuracy is= 0.87
For Kernel= sigmoid
Test accuracy is= 0.975
For Kernel= linear
Test accuracy is= 0.98


## LinearSVC

In [None]:
# Hyperparameter candidates for LinearSVC
penalty_list = ['l2']     #l1 couldnt add because it is not supported with hinge loss
# NOTE(review): kernels_list is not referenced by the LinearSVC cells below.
kernels_list = ['linear']    #Only Linear is available
loss_list = ['hinge','squared_hinge']  #Candidates for the loss function
C_list = [1, 10, 20, 50]  #Regularization parameter

In [None]:
# Empty result table for LinearSVC; it receives the single best hyperparameter combination.
hyperparameter_list3 = pd.DataFrame(columns=["penalty", "loss", "C"])
# Reset the score list so it holds the LinearSVC cross-validation score only.
val_scores = []

In [None]:
# A single grid search is enough here: LinearSVC has no kernel to loop over.
# NOTE(review): the search cross-validates (cv=3) on the validation split
# only; the training split is not used for tuning — confirm this is intended.
svc_lib = LinearSVC()
parameters = {'penalty': penalty_list, 'loss': loss_list, 'C': C_list}

grid_search3 = GridSearchCV(svc_lib, parameters, n_jobs=-1, cv=3)
grid_search3.fit(valX, valY)
print(grid_search3.best_params_)
print(grid_search3.best_score_)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to add the row and keeps the table's existing column order.
hyperparameter_list3 = pd.concat(
    [hyperparameter_list3, pd.DataFrame([grid_search3.best_params_])],
    ignore_index=True,
)
val_scores.append(grid_search3.best_score_)

{'C': 1, 'loss': 'hinge', 'penalty': 'l2'}
0.95


In [None]:
# Display the best LinearSVC hyperparameters (notebook cell output).
hyperparameter_list3

Unnamed: 0,penalty,loss,C
0,l2,hinge,1


In [None]:
print("Test Accuracy Scores given LinearSVC kernels with their best hyperparamaters")
test_accuracy_scores = []   # test accuracy per table row

# Refit the best configuration on the training split and report its accuracy
# on the test split.
for _, best in hyperparameter_list3.iterrows():
    print("For Kernel = Linear")    #LinearSVC only allows linear kernel
    # Read values by column label: positional access such as row[0] on a
    # label-indexed Series is deprecated and breaks if columns are reordered.
    clf = LinearSVC(penalty=best["penalty"], loss=best["loss"], C=best["C"])
    clf.fit(trainX, trainY)
    acc = clf.score(testX, testY)      #report test accuracy
    test_accuracy_scores.append(acc)
    print("Test accuracy is=", acc)

Test Accuracy Scores given LinearSVC kernels with their best hyperparamaters
For Kernel = Linear
Test accuracy is= 0.975




##### Explanation: The three SVM classifiers (SVC, NuSVC, LinearSVC) give similar results. The linear, sigmoid and rbf kernels performed well on both validation and test accuracy, while the polynomial kernel performed noticeably worse. Fitting the data was fastest with SVC. Note: LinearSVC only supports a linear kernel.

# References

###https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
###https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#training-a-classifier
###https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
###https://towardsdatascience.com/implementing-a-naive-bayes-classifier-for-text-categorization-in-five-steps-f9192cdd54c3
###https://github.com/jonhare/LloydsRegistryMachineLearningCourse/blob/master/Monday/ml101-tutorial/tutorial.md
###https://stackoverflow.com/questions/23289547/shuffle-two-list-at-once-with-same-order
###https://github.com/gokriznastic/20-newsgroups_text-classification/blob/master/Multinomial%20Naive%20Bayes-%20BOW%20with%20TF.ipynb
###https://scikit-learn.org/stable/modules/svm.html
###https://machinelearningmastery.com/scikit-optimize-for-hyperparameter-tuning-in-machine-learning/