In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from typing import List, Tuple, Union, Dict, Generic, TypeVar

In [3]:
dataTrain = pd.read_csv('data/HAI817_Projet_train.csv')
dataTest = pd.read_csv('data/HAI817_Projet_test.csv')
dataTrain

Unnamed: 0,public_id,text,title,our rating
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",false
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,mixture
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,mixture
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,false
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,false
...,...,...,...,...
1259,47423bb6,More than four million calls to the taxman are...,Taxman fails to answer four million calls a ye...,true
1260,097c142a,More under-18s are being taken to court for se...,Police catch 11‑year‑olds being used to sell d...,true
1261,08bc59f4,The Government’s much vaunted Help to Buy Isa ...,"Help to Buy Isa scandal: 500,000 first-time bu...",false
1262,af3393ce,The late Robin Williams once called cocaine “G...,A coke-snorting generation of hypocrites,true


In [4]:
# nltk download
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

def remove_stop_words(text):
    # Tokenize the text
    words = word_tokenize(text)
    
    # convert to lowercase
    words = [word.lower() for word in words if word.isalpha()]
    
    # remove stop words
    words = [word for word in words if word not in stop_words]
    
    # remove punctuation
    words = [word for word in words if word not in punctuations]

    return ' '.join(words)

XpreprocessTrain = dataTrain['text'].apply(remove_stop_words)
XpreprocessTest = dataTest['text'].apply(remove_stop_words)
ytxtTrain = dataTrain['our rating']
ytxtTest = dataTest['our rating']
XpreprocessTrain[0]


'distracted driving causes deaths canada impaired driving every province territory laws driving operating cell phone tell passengers stay phones driving measures necessary distracted driving claimed lives impaired driving provinces like british columbia ontario quebec alberta nova scotia manitoba newfoundland labrador mobile phones even held passenger dangerous distraction driver starting next week distracted screen held passenger attracts penalty three demerit points drivers screens mix matter holding device using facetime taking selfies driver showing driver funny cat video provinces mobile phone categorised visual display unit meaning considered akin television screen important practice safe driving sake fellow drivers canada cracking distracted driving problem rollout stricter laws impose harsher penalties heftier fines guilty offenders taking effect next week adds serious penalties convicted distracted driving'

In [15]:
# Tokenization
class Tokenizer:
    def __init__(self, dtype: type = np.int64):
        self.txt2token = {}
        self.token2txt = {}
        self.tokens = set()
        self.dtype = dtype

    def fit(self, text: np.ndarray):
        count = 0
        for txt in text:
            for word in txt.split():
                if word not in self.txt2token:
                    self.txt2token[word] = count
                    self.token2txt[count] = word
                    count += 1
                self.tokens.add(self.txt2token[word])

    def histogram(self, text):
        tokenized_text = np.zeros((len(text), len(self.tokens)), dtype=self.dtype)
        for i, txt in enumerate(text):
            for word in txt.split():
                tokenized_text[i][self.txt2token[word]] += 1

        return tokenized_text
    
    def transform(self, text) -> list[np.ndarray]:
        tokenized_text = []
        for txt in text:
            tokenized_text.append(np.array([self.txt2token[word] for word in txt.split() if word in self.txt2token], dtype=self.dtype))
        return tokenized_text


Xtokenizer = Tokenizer(dtype=np.int64)
Xtokenizer.fit(np.concatenate((np.array(XpreprocessTrain), np.array(XpreprocessTest))))
XtokenTrain = Xtokenizer.transform(XpreprocessTrain)
XhistTrain = Xtokenizer.histogram(XpreprocessTrain)
XtokenTest = Xtokenizer.transform(XpreprocessTest)
XhistTest = Xtokenizer.histogram(XpreprocessTest)

Ytokenizer = Tokenizer(dtype=np.uint8)
# ytxtTrain.drop("mixture", inplace=True)
# ytxtTest.drop(['mixture', 'other'])
Ytokenizer.fit(np.concatenate((np.array(ytxtTrain), np.array(ytxtTest))))
YtokenTrain = Ytokenizer.transform(ytxtTrain)
YtokenTest = Ytokenizer.transform(ytxtTest)

XtokenTrain[0], YtokenTrain[0], Ytokenizer.txt2token.keys()
            

KeyError: "['mixture', 'other'] not found in axis"

In [8]:
# oversampling and undersampling
def balance_data(X, Xhist, Y):
    # Convert X and Y to numpy arrays for easier manipulation
    Y_np = np.array([y[0] for y in Y])
    
    unique_classes, counts = np.unique(Y_np, return_counts=True)
    max_count = max(counts)
    
    X_balanced = []
    Xhist_balanced = []
    Y_balanced = []
    
    for cls in unique_classes:
        cls_indices = np.where(Y_np == cls)[0]
        cls_samples = [X[i] for i in cls_indices]
        cls_hist_samples = Xhist[cls_indices]
        cls_labels = [Y[i] for i in cls_indices]
        
        if counts[cls] < max_count:
            # Oversampling
            num_to_add = max_count - counts[cls]
            additional_indices = np.random.choice(cls_indices, num_to_add, replace=True)
            additional_samples = [X[i] for i in additional_indices]
            additional_hist_samples = Xhist[additional_indices]
            additional_labels = [Y[i] for i in additional_indices]
            
            X_balanced.extend(cls_samples + additional_samples)
            Xhist_balanced.extend(np.vstack((cls_hist_samples, additional_hist_samples)))
            Y_balanced.extend(cls_labels + additional_labels)
        else:
            # Undersampling
            indices_to_keep = np.random.choice(cls_indices, max_count, replace=False)
            X_balanced.extend([X[i] for i in indices_to_keep])
            Xhist_balanced.extend(Xhist[indices_to_keep])
            Y_balanced.extend([Y[i] for i in indices_to_keep])
    
    # Shuffle the dataset
    perm = np.random.permutation(len(Y_balanced))
    X_balanced = [X_balanced[i] for i in perm]
    Xhist_balanced = np.array(Xhist_balanced)[perm]
    Y_balanced = [Y_balanced[i] for i in perm]
    
    return X_balanced, Xhist_balanced, Y_balanced

Xtrain_balanced, Xhist_train_balanced, Ytrain_balanced = balance_data(XtokenTrain, XhistTrain, YtokenTrain)
Xtest_balanced, Xhist_test_balanced, Ytest_balanced = balance_data(XtokenTest, XhistTest, YtokenTest)
len(Xtrain_balanced), len(Xtest_balanced)

In [13]:
unique_classes, counts = np.unique(np.array([y[0] for y in Ytrain_balanced]), return_counts=True)
unique_classes, counts

(array([0, 1, 2, 3], dtype=uint8), array([578, 578, 578, 578], dtype=int64))

In [10]:
# naive bayes
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    def __init__(self):
        self.classes = None
        self.class_prior = {}
        self.word_likelihood = {}
        self.vocab = set()
        self.word_count = {}

    def fit(self, X, y):
        self.classes = np.unique(np.concatenate(y))
        total_samples = len(y)
        
        self.word_count = {c: Counter() for c in self.classes}
        class_count = Counter()

        for i in range(total_samples):
            label = y[i][0]
            class_count[label] += 1
            for word in X[i]:
                self.word_count[label][word] += 1
                self.vocab.add(word)
        
        self.class_prior = {c: count / total_samples for c, count in class_count.items()}
        
        self.word_likelihood = {c: {} for c in self.classes}
        vocab_size = len(self.vocab)
        for c in self.classes:
            total_words = sum(self.word_count[c].values())
            for word in self.vocab:
                self.word_likelihood[c][word] = (self.word_count[c][word] + 1) / (total_words + vocab_size)

    def predict(self, X):
        predictions = []
        for sample in X:
            log_probs = {}
            for c in self.classes:
                log_probs[c] = np.log(self.class_prior[c])
                for word in sample:
                    if word in self.vocab:
                        log_probs[c] += np.log(self.word_likelihood[c][word])
                    else:
                        log_probs[c] += np.log(1 / (sum(self.word_count[c].values()) + len(self.vocab)))
            
            predicted_class = max(log_probs, key=log_probs.get)
            predictions.append(predicted_class)
        
        return predictions
    
    
# Train the model
model = NaiveBayesClassifier()
model.fit(Xtrain_balanced, Ytrain_balanced)

# Predict the test set
Ypred = model.predict(Xtest_balanced)

confusMatrix = np.zeros((len(Ytokenizer.tokens), len(Ytokenizer.tokens)), dtype=np.int64)

accuracySum = 0
for i in range(len(Ypred)):
    accuracySum += Ypred[i] == Ytest_balanced[i]
    confusMatrix[Ytest_balanced[i][0]][Ypred[i]] += 1
accuracy = accuracySum / len(Ypred)

print('Accuracy:', accuracy)
    

Accuracy: [0.32777778]


In [11]:
def printConfusionMatrix(confusionMatrix):
    print('Confusion matrix:')
    # format nicely with column and row headers
    print('\t' + ''.join(["\t" + s + " (real)" for s in Ytokenizer.txt2token.keys()]))
    for i in range(len(Ytokenizer.txt2token.keys())):
        print(list(Ytokenizer.txt2token.keys())[i] + ' (pred)', end='\t')
        print('\t\t'.join(str(confusionMatrix[i][j]) for j in range(len(Ytokenizer.tokens))))
        
    print()

    # compute recall and precision
    recall = np.zeros(len(Ytokenizer.tokens))
    precision = np.zeros(len(Ytokenizer.tokens))
    for i in range(len(Ytokenizer.tokens)):
        recall[i] = confusionMatrix[i][i] / np.sum(confusionMatrix[i])
        precision[i] = confusionMatrix[i][i] / np.sum(confusionMatrix[:, i])
        
    for i in range(len(Ytokenizer.tokens)):
        print('Recall (' +  list(Ytokenizer.txt2token.keys())[i] + ') :', round(recall[i], 3))
        print('Precision (' +  list(Ytokenizer.txt2token.keys())[i] + ') :', round(precision[i], 3))
        print()
        
printConfusionMatrix(confusMatrix)

Confusion matrix:
		false (real)	mixture (real)	other (real)	true (real)
false (pred)	285		13		7		10
mixture (pred)	199		77		13		26
other (pred)	252		54		0		9
true (pred)	218		42		4		51

Recall (false) : 0.905
Precision (false) : 0.299

Recall (mixture) : 0.244
Precision (mixture) : 0.414

Recall (other) : 0.0
Precision (other) : 0.0

Recall (true) : 0.162
Precision (true) : 0.531



In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out
    
def train_nn(X, y, model, criterion, optimizer, num_epochs=5):
    train_dataset = TextDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            texts = texts.float()
            labels = labels.long()
            
            # Forward pass
            outputs = model(texts)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if i == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item()}')
                
    print('Finished Training')
    
def test_nn(X: torch.Tensor, y: torch.Tensor, model):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        confusMatrix = np.zeros((num_classes, num_classes), dtype=np.int64)
        for texts, labels in zip(X, y):
            texts = texts.float().unsqueeze(0)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += 1
            correct += (predicted == labels).sum().item()
            predicted = int(predicted.item())
            labels = int(labels.item())
            
            confusMatrix[predicted][labels] += 1

        print(f'Accuracy: {100 * correct / total}%')
        printConfusionMatrix(confusMatrix)
        
        
# Hyperparameters
input_size = len(Xtokenizer.tokens)
hidden_size = 1000
num_classes = len(Ytokenizer.tokens)
num_epochs = 30
learning_rate = 0.001

# Initialize the model
model = NeuralNetwork(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Prepare data
Xtrain = torch.tensor(Xhist_train_balanced).float()
Xtest = torch.tensor(Xhist_test_balanced).float()
YtokenTrain1D = torch.tensor([y[0] for y in Ytrain_balanced]).long()
YtokenTest1D = torch.tensor([y[0] for y in Ytest_balanced]).long()

if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
    
    device = torch.device('cuda:0')
    Xtrain = Xtrain.to(device)
    Xtest = Xtest.to(device)
    YtokenTrain1D = YtokenTrain1D.to(device)
    YtokenTest1D = YtokenTest1D.to(device)
    
train_nn(Xtrain, YtokenTrain1D, model, criterion, optimizer, num_epochs)

# Test the model
test_nn(Xtest, YtokenTest1D, model)

Epoch [1/30], Step [1/73], Loss: 1.4006608724594116
Epoch [2/30], Step [1/73], Loss: 0.10689215362071991
Epoch [3/30], Step [1/73], Loss: 0.010510491207242012
Epoch [4/30], Step [1/73], Loss: 0.0036225696094334126
Epoch [5/30], Step [1/73], Loss: 0.0004917188198305666
Epoch [6/30], Step [1/73], Loss: 0.010139763355255127
Epoch [7/30], Step [1/73], Loss: 0.0029680593870580196
Epoch [8/30], Step [1/73], Loss: 0.0037949339020997286
Epoch [9/30], Step [1/73], Loss: 0.007726357318460941
Epoch [10/30], Step [1/73], Loss: 0.05107056722044945
Epoch [11/30], Step [1/73], Loss: 0.08436957001686096
Epoch [12/30], Step [1/73], Loss: 0.00899070966988802
Epoch [13/30], Step [1/73], Loss: 0.0001836079463828355
Epoch [14/30], Step [1/73], Loss: 0.00010499460768187419
Epoch [15/30], Step [1/73], Loss: 0.018175577744841576
Epoch [16/30], Step [1/73], Loss: 0.0002201511524617672
Epoch [17/30], Step [1/73], Loss: 0.007911665365099907
Epoch [18/30], Step [1/73], Loss: 0.027058914303779602
Epoch [19/30], St