# Introduction to NLP

##### TP1 done by Litoux Pierre, Arsenec Charles-André, Deplagne Hugo

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from string import punctuation
import functools
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# The dataset

In [2]:
# `load_dataset` is only needed by this cell, which fetches the "imdb" dataset.
from datasets import load_dataset

dataset = load_dataset("imdb")
# Bare expression: the notebook displays the loaded object (its splits are discussed below).
dataset

### 1. How many splits does the dataset have?
The dataset has 3 splits (training, test, unsupervised).

### 2. How big are these splits?
train -> 25000 rows  
test -> 25000 rows  
unsupervised -> 50000 rows  
A row contains a text and a value 0 or 1 corresponding respectively to a negative sentiment or a positive one.

##### We will create dataframes to manipulate the data in each split:

In [3]:
# Cap displayed column width at 100 characters so long review texts stay readable.
pd.set_option('display.max_colwidth', 100)
# Wrap the 'train' split in a DataFrame for easier manipulation.
df_train = pd.DataFrame(dataset['train'])
df_train

In [4]:
# Wrap the 'test' split in a DataFrame, mirroring what is done for the train split.
df_test = pd.DataFrame(dataset['test'])
df_test

In [5]:
# Wrap the 'unsupervised' split in a DataFrame; it is only inspected here.
df_unsupervised = pd.DataFrame(dataset['unsupervised'])
df_unsupervised

#### 3. What is the proportion of each class on the supervised splits?

Here is a plot counting the number of each class into train and test splits :

In [6]:
# One panel per supervised split, each showing how many rows carry each label.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
named_splits = [('df_train', df_train), ('df_test', df_test)]
for axis, (split_name, split_df) in zip(ax, named_splits):
    split_labels = split_df['label']
    axis.set_xticks([0, 1])
    # Only two y ticks: zero and the size of the largest class.
    axis.set_yticks([0, split_labels.value_counts().max()])
    axis.hist(split_labels)
    axis.set(xlabel='label', ylabel='count', title=split_name)

plt.show()

In [7]:
# `unique()` returns values in order of appearance; sort them so the class order is
# deterministic and label k sits at position k, which the code indexing the
# per-class lists by label relies on.
labels = np.sort(df_train["label"].unique())
labels

The classes are either 0 or 1. Each class accounts for 12,500 rows in each supervised split (train and test).

## Naive Bayes classifier


#### 1. Preprocessing

We will lowercase the text so that the same word written with different capitalization is not counted as several distinct words.  
Some texts contain HTML line breaks such as `<br />`, or punctuation, neither of which is needed. <br> Let's remove them, except for '-', which may be useful to keep.

In [8]:
# Translation table built once: every punctuation character except '-' maps to a space.
_PUNCT_TO_SPACE = str.maketrans({char: " " for char in punctuation if char != "-"})


def preprocessingString(text: str) -> str:
    '''
        Normalize a raw review: lower-case it, turn "<br />" tags into a space and
        replace every punctuation character except '-' with a space.
        Input:
            text: string
        Output:
            text: cleaned string (spaces are not collapsed, so runs of spaces may remain)
    '''
    # The tag is removed before punctuation handling, otherwise "<br />" would leave a stray "br".
    text = text.lower().replace("<br />", " ")
    # Single pass over the text instead of one str.replace per punctuation character.
    return text.translate(_PUNCT_TO_SPACE)

In [9]:
# Clean every training review (lower-casing, "<br />" and punctuation removal).
df_train['text'] = df_train['text'].apply(preprocessingString)
df_train.head()

In [10]:
# Apply the same cleaning to the test reviews so both splits share one token format.
df_test['text'] = df_test['text'].apply(preprocessingString)
df_test.head()

#### 2. Naive Bayes from Scratch

In [11]:
def trainNaiveBayes(df: pd.DataFrame, classes: list) -> tuple:
    '''
        Train a multinomial Naive Bayes model with add-one (Laplace) smoothing.
        Input:
            df: dataframe with a 'text' column (whitespace-separated words) and a 'label' column
            classes: list of classes
        Output:
            log_prior: list of log prior of each class, in the same order as `classes`
            loglikehood: list of {word: log likelihood} dicts, in the same order as `classes`
            vocabulary: set of every word seen in df['text']
    '''
    # Tokenize review by review with split(): words of neighbouring reviews are never
    # glued together, and runs of spaces do not produce an empty-string "word".
    vocabulary = set()
    for text in df['text']:
        vocabulary.update(text.split())
    n_doc = df.shape[0]

    log_prior = []
    loglikehood = []
    for class_c in classes:
        class_texts = df.loc[df['label'] == class_c, 'text']
        log_prior.append(np.log(class_texts.shape[0] / n_doc))

        # Word counts over all reviews of this class.
        word_counts = defaultdict(int)
        for text in class_texts:
            for word in text.split():
                word_counts[word] += 1
        # Add-one smoothing: every vocabulary word gets one extra count.
        sumcount_v = sum(word_counts.values()) + len(vocabulary)

        loglikehood.append({
            word: np.log((word_counts.get(word, 0) + 1) / sumcount_v)
            for word in vocabulary
        })

    return log_prior, loglikehood, vocabulary

In [12]:
# Fit the from-scratch model on the preprocessed training split; the three results are
# the inputs testNaiveBayes expects alongside `labels`.
logprior, loglikehood, vocabulary = trainNaiveBayes(df_train, labels)

In [13]:
def testNaiveBayes(testdoc: str, logprior: list, loglikehood: list, classes: list, vocabulary: set) -> int:
    '''
        Input:
            testdoc: string
            log_prior: list of log prior of each class
            loglikehood: list of log likehood of each class
            classes: list of classes
            vocabulary: set of vocabulary
        Output:
            class: class of testdoc
    '''
    probabilty_class = {}
    for classes_c in classes:
        probabilty_class[classes_c] = logprior[classes_c]
        for word in testdoc.split():
            if (word in vocabulary):
                probabilty_class[classes_c] += loglikehood[classes_c][word]
    return np.argmax(list(probabilty_class.values()))

In [14]:
# Classify every test review with the from-scratch model and score it against the true labels.
y_true = df_test['label']
y_pred = df_test['text'].apply(testNaiveBayes, args=(logprior, loglikehood, labels, vocabulary))
accuracy = accuracy_score(y_true, y_pred)
accuracy

#### 3. Naive Bayes from Scikit-learn

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Word counts feeding a multinomial Naive Bayes classifier, fitted on the training split.
PipelineNB = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]).fit(df_train['text'], df_train['label'])

# Accuracy of the fitted pipeline on the test split.
y_true = df_test['label']
y_pred = PipelineNB.predict(df_test['text'])
accuracy = accuracy_score(y_true, y_pred)
accuracy

#### 4. Accuracy report

In [16]:
# Each model is described by a display name and a callable mapping a text Series to predictions.
predictors = [
    ("Own implementation",
     lambda texts: texts.apply(testNaiveBayes, args=(logprior, loglikehood, labels, vocabulary))),
    ("Scikit-learn implementation", PipelineNB.predict),
]
evaluated_splits = [("Training", df_train), ("Test", df_test)]

# Order matters: the last iteration (scikit-learn on the test split) leaves its
# y_true / y_pred behind as notebook globals, exactly as the unrolled version did.
for implementation, predict in predictors:
    for split_name, split_df in evaluated_splits:
        y_true = split_df['label']
        y_pred = predict(split_df['text'])
        print(f"{implementation} - {split_name} set accuracy: ", accuracy_score(y_true, y_pred))

In [17]:
# Compute the scikit-learn test-set predictions here so the figure does not depend on
# whichever y_true / y_pred an earlier cell happened to leave behind.
y_true = df_test['label']
y_pred = PipelineNB.predict(df_test['text'])

cm_counts = confusion_matrix(y_true, y_pred)
# normalize='true' divides each row by its total, i.e. rates per true class.
cm_rates = confusion_matrix(y_true, y_pred, normalize='true')

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Scikit-learn Naive Bayes - test set confusion matrix', fontsize=16)

panels = [(cm_counts, 'Counts'), (cm_rates, 'Normalized by true class')]
for axis, (matrix, panel_title) in zip(axes, panels):
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(ax=axis)
    axis.set_title(panel_title)

fig.tight_layout(pad=2)
plt.show()


#### 5. Most likely, the scikit-learn implementation will give better results. Looking at the documentation, explain why it could be the case.

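This could be explained by the fact that scikit-learn's multinomial Naive Bayes classifier is designed for classification with discrete features such as word counts. A likely additional cause is the feature extraction: by default `CountVectorizer` lowercases the text and only keeps tokens of two or more alphanumeric characters, whereas our implementation splits on spaces and keeps single-character tokens, so the two models are not trained on the same vocabulary.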

#### 6. Why is accuracy a sufficient measure of evaluation here?

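Because there are only 2 classes and they are perfectly balanced (12,500 reviews each in both supervised splits), each prediction is simply right or wrong and neither class dominates the score. The proportion of correct predictions therefore summarizes the performance well.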

#### 7. Using one of the implementations, take at least 2 wrongly classified examples from the test set and try explaining why the model failed.

We will use our own implementation and show two texts where the implementation failed to predict the right class :

In [18]:
# Predictions of the from-scratch model on the test split.
y_true = df_test['label']
y_pred = df_test['text'].apply(testNaiveBayes, args=(logprior, loglikehood, labels, vocabulary))

# Keep only the rows where the prediction disagrees with the true label.
is_misclassified = y_pred != y_true
df_wrong_pred = df_test[is_misclassified]

# Show full review texts, then the first and last misclassified rows.
pd.set_option('display.max_colwidth', None)
df_wrong_pred.iloc[[0, -1]]


Here are two examples where the program should have returned class 0 and class 1 respectively (negative, positive).  
  
-> The first one is ... 
  
-> The second text is ...

#### 8. [BONUS] What are the top 10 most important words (features) for each class?

In [19]:
import nltk
# Fetch NLTK's 'stopwords' corpus, which the stop-word filtering further down reads.
nltk.download('stopwords')

In [20]:
# Get the most important words for each class
def getImportantWords(loglikehood, classes, vocabulary, top_n=100):
    '''
        Rank, for every class, the words with the highest log likelihood.
        Input:
            loglikehood: list of {word: log likelihood} dicts, in the same order as `classes`
            classes: list of classes
            vocabulary: set of vocabulary (not used; kept so existing calls stay valid)
            top_n: maximum number of words kept per class (default 100)
        Output:
            important_words: dictionary mapping each class to its top words, most likely first
    '''
    important_words = {}
    # Pair each class with its table by position, so labels do not have to be 0..n-1.
    for class_index, classes_c in enumerate(classes):
        class_loglikehood = loglikehood[class_index]
        ranked_words = sorted(class_loglikehood, key=class_loglikehood.get, reverse=True)
        important_words[classes_c] = ranked_words[:top_n]
    return important_words

importantWords = getImportantWords(loglikehood, labels, vocabulary)
# The dictionary was filled in `labels` order, so iterating its items prints the classes in that order.
for class_label, top_words in importantWords.items():
    print("Class: ", class_label)
    print(top_words)
    print()

In [21]:
from nltk.corpus import stopwords

# English stop words stored as a set for fast `word not in stops` membership tests.
stops = set(stopwords.words('english'))

def removeStopWords(importantWords, stop_words=None):
    '''
        Drop stop words from the word list of every class.
        Input:
            importantWords: dictionary of important words for each class
            stop_words: collection of words to drop; defaults to the module-level `stops` set
        Output:
            a new dictionary with the same keys and the filtered word lists, original
            order kept; the input dictionary is left untouched
    '''
    if stop_words is None:
        stop_words = stops
    # Iterate over the dictionary's own keys rather than the global `labels`, so the
    # function works for whatever classes it is given.
    return {
        classes_c: [word for word in words if word not in stop_words]
        for classes_c, words in importantWords.items()
    }

importantWords = removeStopWords(importantWords)
# Print the remaining words class by class, following the order of `labels`.
for class_label in labels:
    remaining_words = importantWords[class_label]
    print("Class: ", class_label)
    print(remaining_words)
    print()

Here are the most important words for each class, i.e. the words that most push the decision towards that class. In class 1, for example, the words 'love' and 'great' favor this class because they do not appear among the top words of class 0.

# Stemming and Lemmatization

#### 1. Adding stemming to pretreatment

In [22]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it up under
# 'punkt_tab' instead of 'punkt', so fetch both to work with either version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [23]:
import re

# Built once: creating the stemmer and compiling the pattern for every review is wasted work.
_STEMMER = SnowballStemmer("english")
_WORD_ONLY = re.compile(r"^\w+$")


def stemmingPreProcessing(text):
    '''
        Lower-case a review, tokenize it, keep the purely alphanumeric tokens and stem them.
        Input:
            text: string
        Output:
            text: string after stemming (stems joined by single spaces)
    '''
    # Swap the HTML line break for a space (as preprocessingString does) so the words on
    # either side of it are not glued into one token that the word filter would then drop.
    cleaned = text.lower().replace("<br />", " ")
    stemmed = [_STEMMER.stem(word) for word in word_tokenize(cleaned) if _WORD_ONLY.match(word)]
    return " ".join(stemmed)

In [None]:
# Rebuild the train frame from the raw split, stem its reviews and preview the result.
df_train = pd.DataFrame(dataset['train'])
df_train['text'] = df_train['text'].apply(stemmingPreProcessing)
pd.set_option('display.max_colwidth', 100)
df_train.head()

In [None]:
# Rebuild the test frame from the raw split, stem its reviews and preview the result.
df_test = pd.DataFrame(dataset['test'])
df_test['text'] = df_test['text'].apply(stemmingPreProcessing)
df_test.head()

#### 2. Train and evaluate your model again with these pretreatment.

In [None]:
# Train the stemmed data
# df_train now holds the stemmed reviews, so this overwrites the model fitted on the unstemmed text.
logprior, loglikehood, vocabulary = trainNaiveBayes(df_train, labels)

In [None]:
# Score the model trained on stemmed text on both stemmed splits, training split first.
for split_name, split_df in [("Training", df_train), ("Test", df_test)]:
    y_true = split_df['label']
    y_pred = split_df['text'].apply(testNaiveBayes, args=(logprior, loglikehood, labels, vocabulary))
    print(f"Own implementation (Stemming) - {split_name} set accuracy: ", accuracy_score(y_true, y_pred))


#### 3. Are the results better or worse? Try explaining why the accuracy changed.  
  
The results show that the model is less accurate with stemming.  
The accuracy changed because the stemming pretreatment shortens words, so that the several forms a word can take, like 'enjoy', 'enjoys', 'enjoyed', 'enjoying', are no longer counted as separate words. They are all grouped and written as 'enjoy'.

In [None]:
importantWords = getImportantWords(loglikehood, labels, vocabulary)
importantWords = removeStopWords(importantWords)

# The vocabulary is stemmed at this point, so stop words whose stem differs from their
# surface form (e.g. "very" -> "veri") slip past the unstemmed list; drop those stems too.
stop_word_stemmer = SnowballStemmer("english")
stemmed_stops = {stop_word_stemmer.stem(word) for word in stops}
importantWords = {
    classes_c: [word for word in words if word not in stemmed_stops]
    for classes_c, words in importantWords.items()
}

for classes_c in labels:
    print("Class: ", classes_c)
    print(importantWords[classes_c])
    print()