## INF8245E (Fall 2021) : Machine Learning - Assignment 3
## Amine EL AMERI - Matricule: 2164634

In [1]:
# this notebook has only been tested on jupyter locally

In [2]:
import numpy as np
from numpy.linalg import inv, pinv
import scipy as sp
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [25, 6]
import pandas as pd
from scipy.stats import mode
import sys
import copy
import random
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB

## Medical Text Classification

In [3]:
myPath = "medical_dataset/"
from os import listdir
from os.path import isfile, join
# Keep only the regular files found directly inside the dataset folder; each
# entry is the folder prefix concatenated with the file name.
files = list(
    myPath + entry
    for entry in listdir(myPath)
    if isfile(join(myPath, entry))
)
files

['medical_dataset/test.csv',
 'medical_dataset/train.csv',
 'medical_dataset/valid.csv']

##### Q1

In [4]:
# Every split is read with header=None, turned into a list of rows, and the
# first row is discarded (presumably the CSV header line - confirm against the
# data files). Later cells use row[0] as the label and row[1] as the transcript.
train_raw = pd.read_csv('medical_dataset/train.csv', header=None).values.tolist()[1:]

valid_raw = pd.read_csv('medical_dataset/valid.csv', header=None).values.tolist()[1:]

test_raw = pd.read_csv('medical_dataset/test.csv', header=None).values.tolist()[1:]

###### bag-of-words representations

In [5]:
def bow_pre_processing(data_raw, is_training_set, vocab_size=10000):
    """Tokenize every transcript and count its words.

    Each row of ``data_raw`` is expected to be ``[label, transcript, ...]``.
    The transcript is split on punctuation / whitespace delimiters, each piece
    is lower-cased and stripped, and empty pieces are ignored.

    Parameters
    ----------
    data_raw : list of list
        Rows whose element 1 is the transcript text. The input is not mutated.
    is_training_set : bool
        When true, the vocabulary is also built from the corpus-wide counts.
    vocab_size : int, optional
        Maximum number of most frequent words kept in the vocabulary
        (default 10000, the value previously hard-coded).

    Returns
    -------
    tuple
        If ``is_training_set``:
        ``(data_processed, vocab_words, sorted_words, transcripts_words)``
        where ``sorted_words`` is a list of ``(word, count)`` pairs sorted by
        decreasing count and ``vocab_words`` is the first ``vocab_size`` words.
        Otherwise: ``(data_processed, transcripts_words)``.
        ``data_processed`` is a deep copy of ``data_raw`` whose element 1 is
        replaced by the list of distinct words of the transcript, and
        ``transcripts_words`` is a list of ``{word: count}`` dicts.
    """
    # Every delimiter is mapped to a single space so one split() call yields
    # the same segments as scanning the text character by character.
    delimiters = ",.':;?!-+*/=[](){} \t\n"
    to_space = str.maketrans({char: " " for char in delimiters})

    nbr_transcripts = len(data_raw)
    data_processed = copy.deepcopy(data_raw)
    all_words = {}
    transcripts_words = []

    for i in range(nbr_transcripts):

        print("\rTranscript number {}/{}.".format(i+1, nbr_transcripts), end="")
        sys.stdout.flush()

        transcript = data_raw[i][1]
        if not isinstance(transcript, str):
            # A missing transcript (e.g. NaN coming from pandas) has no words.
            transcript = ""

        words = {}
        # The trailing segment is included in full: the previous
        # `transcript[last_space+1:-1]` slice dropped the last character of
        # the final word whenever the text did not end with a delimiter.
        for piece in transcript.translate(to_space).split(" "):
            word = piece.lower().strip()
            if word != "":
                words[word] = words.get(word, 0) + 1
                all_words[word] = all_words.get(word, 0) + 1

        transcripts_words.append(words)
        data_processed[i][1] = list(words.keys())

    if is_training_set:
        # sorted() is stable, so equally frequent words keep first-seen order.
        sorted_words = sorted(all_words.items(), key=lambda kv: kv[1], reverse=True)
        vocab_words = [el[0] for el in sorted_words][0:vocab_size]
        return (data_processed, vocab_words, sorted_words, transcripts_words)
    else:
        return data_processed, transcripts_words

In [6]:
# pre-process training data
# With is_training_set=True the call also returns the vocabulary (vocab_words)
# and train_words_dict, which is a list of (word, count) pairs sorted by
# decreasing count (not a dict, despite its name).
train_processed, vocab_words, train_words_dict, transcripts_words_train = bow_pre_processing(train_raw, is_training_set=True)

Transcript number 4000/4000.

In [7]:
# pre-process validation data
# With is_training_set=False only the processed rows and the per-transcript
# word counts are returned; no vocabulary is built from this split.
valid_processed, transcripts_words_valid = bow_pre_processing(valid_raw, is_training_set=False)

Transcript number 499/499.

In [8]:
# pre-process testing data
# With is_training_set=False only the processed rows and the per-transcript
# word counts are returned; no vocabulary is built from this split.
test_processed, transcripts_words_test = bow_pre_processing(test_raw, is_training_set=False)

Transcript number 500/500.

###### saving vocab data

In [9]:
# save medical_text-vocab.txt
# One line per vocabulary word: word <TAB> rank (starting at 1) <TAB> count.
# train_words_dict is the list of (word, count) pairs sorted by decreasing
# count; slicing it to the vocabulary length (instead of indexing a fixed
# range(10000)) avoids an IndexError when the corpus has fewer distinct words.
data_tosave_vocab = [
    word + "\t" + str(rank) + "\t" + str(count) + "\n"
    for rank, (word, count) in enumerate(train_words_dict[:len(vocab_words)], start=1)
]
with open("medical_text-vocab.txt", "w") as output:
    output.writelines(data_tosave_vocab)

###### binary bag-of-words

In [10]:
def bbow(transcripts_words, vocab_words):
    """Build the binary bag-of-words encoding of every transcript.

    Parameters
    ----------
    transcripts_words : list of dict
        One ``{word: count}`` mapping per transcript.
    vocab_words : list of str
        Vocabulary; it fixes the keys (and their order) of every output dict.

    Returns
    -------
    list of dict
        For each transcript, a ``{vocab_word: 0 or 1}`` dict where 1 means the
        word occurs in the transcript.
    """
    total = len(transcripts_words)
    encoded = []

    for position, word_counts in enumerate(transcripts_words, start=1):

        print("\rTranscript number {}/{}.".format(position, total), end="")
        sys.stdout.flush()

        # Presence flag for every vocabulary entry, in vocabulary order.
        presence = {term: (1 if term in word_counts else 0) for term in vocab_words}
        encoded.append(presence)

    return encoded

In [11]:
# Binary bag-of-words dicts for the training transcripts, keyed by the training vocabulary.
transcripts_bbow_train = bbow(transcripts_words_train, vocab_words)

Transcript number 4000/4000.

In [12]:
# Binary bag-of-words dicts for the validation transcripts, keyed by the training vocabulary.
transcripts_bbow_valid = bbow(transcripts_words_valid, vocab_words)

Transcript number 499/499.

In [13]:
# Binary bag-of-words dicts for the test transcripts, keyed by the training vocabulary.
transcripts_bbow_test = bbow(transcripts_words_test, vocab_words)

Transcript number 500/500.

###### frequency bag-of-words

In [14]:
def fbow(transcripts_words, vocab_words):
    """Build the frequency bag-of-words encoding of every transcript.

    Parameters
    ----------
    transcripts_words : list of dict
        One ``{word: count}`` mapping per transcript.
    vocab_words : list of str
        Vocabulary; it fixes the keys (and their order) of every output dict.

    Returns
    -------
    list of dict
        For each transcript, a ``{vocab_word: frequency}`` dict where the
        frequency is the word count divided by the total count of vocabulary
        words in that transcript. A transcript that contains no vocabulary
        word is encoded as all zeros (previously a ZeroDivisionError).
    """
    transcripts_fbow = []
    nbr_transcripts = len(transcripts_words)

    for transcript_id in range(nbr_transcripts):

        print("\rTranscript number {}/{}.".format(transcript_id+1, nbr_transcripts), end="")
        sys.stdout.flush()

        dict_vocab = dict.fromkeys(vocab_words, 0)
        for word, count in transcripts_words[transcript_id].items():
            if word in dict_vocab:
                dict_vocab[word] += count

        sum_occ = sum(dict_vocab.values())
        if sum_occ > 0:
            for key in dict_vocab:
                dict_vocab[key] = dict_vocab[key]/sum_occ
        else:
            # No in-vocabulary word: keep a well-defined all-zero float vector
            # instead of dividing by zero.
            dict_vocab = dict.fromkeys(vocab_words, 0.0)
        transcripts_fbow.append(dict_vocab)

    return transcripts_fbow

In [15]:
# Frequency bag-of-words dicts for the training transcripts, keyed by the training vocabulary.
transcripts_fbow_train = fbow(transcripts_words_train, vocab_words)

Transcript number 4000/4000.

In [16]:
# Frequency bag-of-words dicts for the validation transcripts, keyed by the training vocabulary.
transcripts_fbow_valid = fbow(transcripts_words_valid, vocab_words)

Transcript number 499/499.

In [17]:
# Frequency bag-of-words dicts for the test transcripts, keyed by the training vocabulary.
transcripts_fbow_test = fbow(transcripts_words_test, vocab_words)

Transcript number 500/500.

###### dataset submission train, valid and test

In [18]:
def dataset_subm(data_processed, vocab_words, filename, first_id=1):
    """Write a split as lines of ``<word ids separated by spaces><TAB><label>``.

    Parameters
    ----------
    data_processed : list of list
        Rows ``[label, words, ...]`` where ``words`` is the list of distinct
        words of the transcript.
    vocab_words : list of str
        Vocabulary; a word's id is its position in this list plus ``first_id``.
    filename : str
        Path of the text file to (over)write.
    first_id : int, optional
        Id given to the first vocabulary word. Defaults to 1 so the ids match
        the 1-based ids written to ``medical_text-vocab.txt``; pass 0 to get
        the previous 0-based numbering.

    Notes
    -----
    Words that are not in the vocabulary are skipped. Exactly one line is
    written per transcript, even when it has no words at all (such rows used
    to be silently omitted, which lost their label and shifted the rows).
    """
    # Word -> id lookup built once; dict.fromkeys() de-duplicates while
    # keeping first-seen order, replacing a linear `.index()` scan per word.
    word_to_id = {
        word: word_id
        for word_id, word in enumerate(dict.fromkeys(vocab_words), start=first_id)
    }
    nbr_transcripts = len(data_processed)

    with open(filename, "w") as output:

        for transcript_id in range(nbr_transcripts):

            print("\rTranscript number {}/{}.".format(transcript_id+1, nbr_transcripts), end="")
            sys.stdout.flush()

            label = data_processed[transcript_id][0]
            words = data_processed[transcript_id][1]

            ids = [str(word_to_id[word]) for word in words if word in word_to_id]
            output.write(" ".join(ids) + "\t" + str(label) + "\n")

In [19]:
# Writes the training split to medical_text-train.txt: one line per transcript
# with the vocabulary ids of its words, a tab, and the label.
dataset_subm(train_processed, vocab_words, "medical_text-train.txt")

Transcript number 4000/4000.

In [20]:
# Writes the validation split to medical_text-valid.txt using the training vocabulary ids.
dataset_subm(valid_processed, vocab_words, "medical_text-valid.txt")

Transcript number 499/499.

In [21]:
# Writes the test split to medical_text-test.txt using the training vocabulary ids.
dataset_subm(test_processed, vocab_words, "medical_text-test.txt")

Transcript number 500/500.

##### Q2

###### Q2-a

In [22]:
def random_classifier(data_processed):
    """Predict a uniformly random label in {1, 2, 3, 4} for every sample.

    Only the number of samples in ``data_processed`` is used; the draws come
    from the global ``random`` module state.
    """
    return [random.randint(1, 4) for _ in range(len(data_processed))]

In [23]:
def majority_class_classifier(train_processed, data_processed):
    """Predict, for every sample, the most frequent label of the training set.

    Labels are read from element 0 of each training row and are expected to
    be the strings "1" to "4"; the returned predictions are the matching
    integers. Ties go to the smallest label (first maximum of ``np.argmax``).
    """
    train_labels = [row[0] for row in train_processed]

    label_counts = []
    for candidate in ("1", "2", "3", "4"):
        label_counts.append(train_labels.count(candidate))

    # argmax is a 0-based position in ("1", "2", "3", "4"); +1 gives the label.
    majority_class = np.argmax(label_counts) + 1

    return [majority_class] * len(data_processed)

In [24]:
# F1 score of the random classifier

# Seed the global RNG so this stochastic baseline yields the same score on
# every "Restart Kernel -> Run All"; without it the reported value changes
# from run to run.
random.seed(0)

y_pred = random_classifier(train_processed)
y_true = [int(row[0]) for row in train_processed]

f1_score_random_classifier = f1_score(y_true, y_pred, average='macro')
print(f"the F1 score of the random classifier is {f1_score_random_classifier}")

the F1 score of the random classifier is 0.2610138420043496


In [25]:
# F1 score of the majority class classifier

# Ground truth and predictions are both taken on the training split: every
# sample is assigned the most frequent training label.
y_true = [int(sample[0]) for sample in train_processed]
y_pred = majority_class_classifier(train_processed, train_processed)

f1_score_majority_class_classifier = f1_score(y_true, y_pred, average='macro')
print(f"the F1 score of the majority classifier is {f1_score_majority_class_classifier}")

the F1 score of the majority classifier is 0.120996778472617


###### Q2-b

In [26]:
# Turn each per-transcript {word: 0/1} dict into a row of a 2-D array; all
# dicts share the vocabulary key order, so columns line up across splits.
bbow_train_good_format = np.array([list(encoding.values()) for encoding in transcripts_bbow_train])
bbow_valid_good_format = np.array([list(encoding.values()) for encoding in transcripts_bbow_valid])
bbow_test_good_format = np.array([list(encoding.values()) for encoding in transcripts_bbow_test])

In [27]:
# Integer class labels, taken from element 0 of every processed row.
y_train_good_format = [int(row[0]) for row in train_processed]
y_valid_good_format = [int(row[0]) for row in valid_processed]
y_test_good_format = [int(row[0]) for row in test_processed]

In [28]:
# Stack the binary BoW train and validation rows (and their labels) so that
# GridSearchCV can be driven by a single fixed train/validation split.
trainAndValid = np.concatenate((bbow_train_good_format, bbow_valid_good_format), axis=0)
y_trainAndValid = np.concatenate((y_train_good_format, y_valid_good_format))
# test_fold is -1 for every training row and 0 for every validation row; per
# the scikit-learn PredefinedSplit documentation, -1 keeps a sample out of
# every test fold, so the only fold evaluated is the validation set.
split_index = np.concatenate((-np.ones(bbow_train_good_format.shape[0]), np.zeros(bbow_valid_good_format.shape[0])), axis=0)
predef_split = PredefinedSplit(test_fold = split_index)

###### Bernoulli Naive Bayes

In [29]:
# Grid-search the BernoulliNB smoothing parameter alpha (step 0.05) using
# macro-F1 on the predefined validation fold.
# NOTE(review): the grid starts at alpha=0; scikit-learn may warn about or
# clip a zero smoothing value - confirm whether 0 should stay in the grid.
smooth_params = {'alpha': np.arange(0, 1.05, 0.05)}

bnb_clf = GridSearchCV(estimator = BernoulliNB(), 
                       param_grid = smooth_params, 
                       scoring = 'f1_macro', 
                       cv = predef_split)

bnb_clf.fit(trainAndValid, y_trainAndValid)
print(f"The best BernoulliNB classifier is {bnb_clf.best_estimator_}")



The best BernoulliNB classifier is BernoulliNB(alpha=0.65)


In [30]:
# Refit BernoulliNB on the training split only and report macro-F1 on the
# three splits. alpha=0.65 is hard-coded from the grid-search output recorded
# in the previous cell; update it by hand if the search result changes.
# NOTE(review): binarize=None differs from the estimator default used in the
# grid search; presumably equivalent here because the features are already
# 0/1 - confirm.
bnb = BernoulliNB(alpha = 0.65, binarize = None, fit_prior = True)

bnb.fit(bbow_train_good_format, y_train_good_format)
y_pred_train = bnb.predict(bbow_train_good_format)
y_pred_valid = bnb.predict(bbow_valid_good_format)
y_pred_test = bnb.predict(bbow_test_good_format)

print(f"BernoulliNB train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"BernoulliNB valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"BernoulliNB test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

BernoulliNB train f1_score with best param: 0.537771413178238
BernoulliNB valid f1_score with best param: 0.46002458735798557
BernoulliNB test f1_score with best param: 0.4703643259118477


###### Decision Trees

In [31]:
# Grid-search the decision tree over split criterion, maximum depth and
# min_samples_split (given as a fraction of the samples), scored by macro-F1
# on the predefined validation fold.
params_to_test = {'criterion': ["gini", "entropy"], 'max_depth': np.arange(10, 80, 20), 'min_samples_split': np.arange(0.1, 1, 0.2)}

# random_state is fixed (as is already done for LinearSVC in this notebook) so
# that tie-breaking between equally good splits, and therefore the selected
# parameters, are reproducible from one run to the next.
tree_clf = GridSearchCV(estimator = DecisionTreeClassifier(random_state=0),
                        param_grid = params_to_test,
                        scoring = 'f1_macro',
                        cv = predef_split)

tree_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best decision tree classifier parameters are: {tree_clf.best_params_}")

the best decision tree classifier parameters are: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.1}


In [32]:
# Refit the decision tree on the binary BoW training split with the
# parameters reported by the grid search, then report macro-F1 per split.
# random_state is fixed (as is already done for LinearSVC in this notebook) so
# the fitted tree and the scores below are reproducible across runs.
tree = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_split=0.1, random_state=0)
tree.fit(bbow_train_good_format, y_train_good_format)

y_pred_train = tree.predict(bbow_train_good_format)
y_pred_valid = tree.predict(bbow_valid_good_format)
y_pred_test = tree.predict(bbow_test_good_format)

print(f"DecisionTreeClassifier train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"DecisionTreeClassifier valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"DecisionTreeClassifier test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

DecisionTreeClassifier train f1_score with best param: 0.7313573608654307
DecisionTreeClassifier valid f1_score with best param: 0.7097091502926711
DecisionTreeClassifier test f1_score with best param: 0.7118875760543775


###### Logistic regression

In [33]:
# Grid-search logistic regression (liblinear solver) over the penalty type
# and the inverse regularization strength C, scored by macro-F1 on the
# predefined validation fold.
params_to_test = {'penalty': ['l1','l2'], 'C': [0.01, 0.05, 0.1, 1, 5, 10, 50]}

log_reg_clf = GridSearchCV(estimator = LogisticRegression(solver='liblinear'), 
                           param_grid = params_to_test, 
                           scoring = 'f1_macro', 
                           cv = predef_split)

log_reg_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best logistic regression classifier parameters are: {log_reg_clf.best_params_}")

the best logistic regression classifier parameters are: {'C': 0.1, 'penalty': 'l1'}


In [34]:
# Refit logistic regression on the binary BoW training split only and report
# macro-F1 per split. penalty and C are hard-coded from the grid-search
# output recorded in the previous cell.
log_reg = LogisticRegression(solver='liblinear', penalty='l1', C=0.1)
log_reg.fit(bbow_train_good_format, y_train_good_format)

y_pred_train = log_reg.predict(bbow_train_good_format)
y_pred_valid = log_reg.predict(bbow_valid_good_format)
y_pred_test = log_reg.predict(bbow_test_good_format)

print(f"LogisticRegression train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"LogisticRegression valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"LogisticRegression test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

LogisticRegression train f1_score with best param: 0.8238065644344815
LogisticRegression valid f1_score with best param: 0.7993613135811319
LogisticRegression test f1_score with best param: 0.8172132253711201


###### Linear SVM

In [35]:
# Grid-search the LinearSVC regularization parameter C in steps of 0.001,
# scored by macro-F1 on the predefined validation fold.
# NOTE(review): the recorded best value (C=0.093) is close to the upper end
# of this grid, so larger values of C may be worth exploring.
params_to_test = {'C': np.arange(0.001, 0.1, 0.001)}

linear_SVM_clf = GridSearchCV(estimator = LinearSVC(random_state=0, max_iter=6000), 
                              param_grid = params_to_test, 
                              scoring = 'f1_macro', 
                              cv = predef_split)

linear_SVM_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best linear SVM classifier parameters are: {linear_SVM_clf.best_params_}")

the best linear SVM classifier parameters are: {'C': 0.093}


In [36]:
# Refit LinearSVC on the binary BoW training split only and report macro-F1
# per split. C is hard-coded from the grid-search output recorded in the
# previous cell.
linearSVM = LinearSVC(random_state=0, max_iter=6000, C = 0.093)
linearSVM.fit(bbow_train_good_format, y_train_good_format)

y_pred_train = linearSVM.predict(bbow_train_good_format)
y_pred_valid = linearSVM.predict(bbow_valid_good_format)
y_pred_test = linearSVM.predict(bbow_test_good_format)

print(f"LinearSVC train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"LinearSVC valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"LinearSVC test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

LinearSVC train f1_score with best param: 0.9075425680813174
LinearSVC valid f1_score with best param: 0.7383026509927384
LinearSVC test f1_score with best param: 0.7837180724102886


##### Q3

In [37]:
# Turn each per-transcript {word: frequency} dict into a row of a 2-D array;
# all dicts share the vocabulary key order, so columns line up across splits.
fbow_train_good_format = np.array([list(encoding.values()) for encoding in transcripts_fbow_train])
fbow_valid_good_format = np.array([list(encoding.values()) for encoding in transcripts_fbow_valid])
fbow_test_good_format = np.array([list(encoding.values()) for encoding in transcripts_fbow_test])

In [38]:
# Rebuild the stacked train+validation matrix and the fixed split, this time
# from the frequency BoW features. This rebinds trainAndValid, split_index
# and predef_split, so the cells below no longer see the binary BoW versions.
trainAndValid = np.concatenate((fbow_train_good_format, fbow_valid_good_format), axis=0)
y_trainAndValid = np.concatenate((y_train_good_format, y_valid_good_format))
# test_fold is -1 for every training row and 0 for every validation row, so
# the only fold evaluated by GridSearchCV is the validation set.
split_index = np.concatenate((-np.ones(fbow_train_good_format.shape[0]), np.zeros(fbow_valid_good_format.shape[0])), axis=0)
predef_split = PredefinedSplit(test_fold = split_index)

###### Gaussian Naive Bayes

In [39]:
# GaussianNB is fitted with its default settings (no hyper-parameter search)
# on the frequency BoW training split; macro-F1 is reported per split.
gnb = GaussianNB()

gnb.fit(fbow_train_good_format, y_train_good_format)
y_pred_train = gnb.predict(fbow_train_good_format)
y_pred_valid = gnb.predict(fbow_valid_good_format)
y_pred_test = gnb.predict(fbow_test_good_format)

print(f"GaussianNB train f1_score: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"GaussianNB valid f1_score: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"GaussianNB test f1_score: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

GaussianNB train f1_score: 0.6904713749154733
GaussianNB valid f1_score: 0.3638831094505836
GaussianNB test f1_score: 0.352960329522165


###### Decision Trees

In [40]:
# Grid-search the decision tree on the frequency BoW features over split
# criterion, maximum depth and min_samples_split (given as a fraction of the
# samples), scored by macro-F1 on the predefined validation fold.
params_to_test = {'criterion': ["gini", "entropy"], 'max_depth': np.arange(10, 80, 20), 'min_samples_split': np.arange(0.1, 1, 0.2)}

# random_state is fixed (as is already done for LinearSVC in this notebook) so
# that tie-breaking between equally good splits, and therefore the selected
# parameters, are reproducible from one run to the next.
tree_clf = GridSearchCV(estimator = DecisionTreeClassifier(random_state=0),
                        param_grid = params_to_test,
                        scoring = 'f1_macro',
                        cv = predef_split)

tree_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best decision tree classifier parameters are: {tree_clf.best_params_}")

the best decision tree classifier parameters are: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 0.1}


In [41]:
# Refit the decision tree on the frequency BoW training split with the
# parameters reported by the grid search, then report macro-F1 per split.
# random_state is fixed (as is already done for LinearSVC in this notebook) so
# the fitted tree and the scores below are reproducible across runs.
tree = DecisionTreeClassifier(criterion='gini', max_depth=30, min_samples_split=0.1, random_state=0)
tree.fit(fbow_train_good_format, y_train_good_format)

y_pred_train = tree.predict(fbow_train_good_format)
y_pred_valid = tree.predict(fbow_valid_good_format)
y_pred_test = tree.predict(fbow_test_good_format)

print(f"DecisionTreeClassifier train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"DecisionTreeClassifier valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"DecisionTreeClassifier test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

DecisionTreeClassifier train f1_score with best param: 0.7328245647524696
DecisionTreeClassifier valid f1_score with best param: 0.7216274221878163
DecisionTreeClassifier test f1_score with best param: 0.7090111767169525


###### Logistic regression

In [42]:
# Grid-search logistic regression (liblinear solver) on the frequency BoW
# features over the penalty type and C, scored by macro-F1 on the predefined
# validation fold.
# NOTE(review): the recorded best value (C=50) is the largest value in this
# grid, so the optimum may lie beyond it - consider extending the grid.
params_to_test = {'penalty': ['l1','l2'], 'C': [0.01, 0.05, 0.1, 1, 5, 10, 50]}

log_reg_clf = GridSearchCV(estimator = LogisticRegression(solver='liblinear'), 
                           param_grid = params_to_test, 
                           scoring = 'f1_macro', 
                           cv = predef_split)

log_reg_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best logistic regression classifier parameters are: {log_reg_clf.best_params_}")

the best logistic regression classifier parameters are: {'C': 50, 'penalty': 'l1'}


In [43]:
# Refit logistic regression on the frequency BoW training split only and
# report macro-F1 per split. penalty and C are hard-coded from the
# grid-search output recorded in the previous cell.
log_reg = LogisticRegression(solver='liblinear', penalty='l1', C=50)
log_reg.fit(fbow_train_good_format, y_train_good_format)

y_pred_train = log_reg.predict(fbow_train_good_format)
y_pred_valid = log_reg.predict(fbow_valid_good_format)
y_pred_test = log_reg.predict(fbow_test_good_format)

print(f"LogisticRegression train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"LogisticRegression valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"LogisticRegression test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

LogisticRegression train f1_score with best param: 0.8388700393491383
LogisticRegression valid f1_score with best param: 0.760287094912295
LogisticRegression test f1_score with best param: 0.7653840822497652


###### Linear SVM

In [44]:
# Grid-search the LinearSVC regularization parameter C in steps of 0.001 on
# the frequency BoW features, scored by macro-F1 on the validation fold.
# NOTE(review): the recorded best value (C=0.089) is near the upper end of
# this grid and the resulting macro-F1 is only about 0.32 on every split; the
# same grid as for the binary features may be too small for frequency-scaled
# inputs - consider searching much larger values of C.
params_to_test = {'C': np.arange(0.001, 0.1, 0.001)}

linear_SVM_clf = GridSearchCV(estimator = LinearSVC(random_state=0, max_iter=6000), 
                              param_grid = params_to_test, 
                              scoring = 'f1_macro', 
                              cv = predef_split)

linear_SVM_clf.fit(trainAndValid, y_trainAndValid)
print(f"the best linear SVM classifier parameters are: {linear_SVM_clf.best_params_}")

the best linear SVM classifier parameters are: {'C': 0.089}


In [45]:
# Refit LinearSVC on the frequency BoW training split only and report
# macro-F1 per split. C is hard-coded from the grid-search output recorded in
# the previous cell.
linearSVM = LinearSVC(random_state=0, max_iter=6000, C = 0.089)
linearSVM.fit(fbow_train_good_format, y_train_good_format)

y_pred_train = linearSVM.predict(fbow_train_good_format)
y_pred_valid = linearSVM.predict(fbow_valid_good_format)
y_pred_test = linearSVM.predict(fbow_test_good_format)

print(f"LinearSVC train f1_score with best param: {f1_score(y_train_good_format, y_pred_train, average='macro')}")
print(f"LinearSVC valid f1_score with best param: {f1_score(y_valid_good_format, y_pred_valid, average='macro')}")
print(f"LinearSVC test f1_score with best param: {f1_score(y_test_good_format, y_pred_test, average='macro')}")

LinearSVC train f1_score with best param: 0.31937742267309815
LinearSVC valid f1_score with best param: 0.3179102706124387
LinearSVC test f1_score with best param: 0.3129974362578737
