In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [30]:
dataTrain = pd.read_csv('data/HAI817_Projet_train.csv')
dataTest = pd.read_csv('data/HAI817_Projet_test.csv')
dataTrain

Unnamed: 0,public_id,text,title,our rating
0,5a228e0e,Distracted driving causes more deaths in Canad...,"You Can Be Fined $1,500 If Your Passenger Is U...",false
1,30c605a1,Missouri politicians have made statements afte...,Missouri lawmakers condemn Las Vegas shooting,mixture
2,c3dea290,Home Alone 2: Lost in New York is full of viol...,CBC Cuts Donald Trump's 'Home Alone 2' Cameo O...,mixture
3,f14e8eb6,But things took a turn for the worse when riot...,Obama’s Daughters Caught on Camera Burning US ...,false
4,faf024d6,It’s no secret that Epstein and Schiff share a...,Leaked Visitor Logs Reveal Schiff’s 78 Visits ...,false
...,...,...,...,...
1259,47423bb6,More than four million calls to the taxman are...,Taxman fails to answer four million calls a ye...,true
1260,097c142a,More under-18s are being taken to court for se...,Police catch 11‑year‑olds being used to sell d...,true
1261,08bc59f4,The Government’s much vaunted Help to Buy Isa ...,"Help to Buy Isa scandal: 500,000 first-time bu...",false
1262,af3393ce,The late Robin Williams once called cocaine “G...,A coke-snorting generation of hypocrites,true


In [11]:
# nltk download
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Luna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luna\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [31]:
# text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

def remove_stop_words(text):
    # Tokenize the text
    words = word_tokenize(text)
    
    # convert to lowercase
    words = [word.lower() for word in words if word.isalpha()]
    
    # remove stop words
    words = [word for word in words if word not in stop_words]
    
    # remove punctuation
    words = [word for word in words if word not in punctuations]

    return ' '.join(words)

XpreprocessTrain = dataTrain['text'].apply(remove_stop_words)
XpreprocessTest = dataTest['text'].apply(remove_stop_words)
ytxtTrain = dataTrain['our rating']
ytxtTest = dataTest['our rating']
XpreprocessTrain[0]


'distracted driving causes deaths canada impaired driving every province territory laws driving operating cell phone tell passengers stay phones driving measures necessary distracted driving claimed lives impaired driving provinces like british columbia ontario quebec alberta nova scotia manitoba newfoundland labrador mobile phones even held passenger dangerous distraction driver starting next week distracted screen held passenger attracts penalty three demerit points drivers screens mix matter holding device using facetime taking selfies driver showing driver funny cat video provinces mobile phone categorised visual display unit meaning considered akin television screen important practice safe driving sake fellow drivers canada cracking distracted driving problem rollout stricter laws impose harsher penalties heftier fines guilty offenders taking effect next week adds serious penalties convicted distracted driving'

In [41]:
# Tokenization
class Tokenizer:
    def __init__(self):
        self.txt2token = {}
        self.token2txt = {}
        self.tokens = set()

    def fit(self, text: np.ndarray):
        count = 0
        for txt in text:
            for word in txt.split():
                if word not in self.txt2token:
                    self.txt2token[word] = count
                    self.token2txt[count] = word
                    count += 1
                self.tokens.add(self.txt2token[word])

    def histogram(self, text):
        tokenized_text = np.zeros((len(text), len(self.tokens)), dtype=np.uint64)
        for i, txt in enumerate(text):
            for word in txt.split():
                tokenized_text[i][self.txt2token[word]] += 1

        return tokenized_text
    
    def transform(self, text):
        tokenized_text = []
        for txt in text:
            tokenized_text.append([self.txt2token[word] for word in txt.split() if word in self.txt2token])
        return tokenized_text


Xtokenizer = Tokenizer()
Xtokenizer.fit(np.concatenate((np.array(XpreprocessTrain), np.array(XpreprocessTest))))
XtokenTrain = Xtokenizer.transform(XpreprocessTrain)
XtokenTest = Xtokenizer.transform(XpreprocessTest).astype(np.uint8)

Ytokenizer = Tokenizer()
Ytokenizer.fit(np.concatenate((np.array(ytxtTrain), np.array(ytxtTest))))
YtokenTrain = Ytokenizer.transform(ytxtTrain)
YtokenTest = Ytokenizer.transform(ytxtTest)

XtokenTrain.shape, XtokenTest.shape, YtokenTrain.shape, YtokenTest.shape
            

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]], dtype=uint8)

In [26]:
# Y
y = np.zeros((len(ytxt), 1), dtype=np.uint8)
ytokens = set()
y2token = {}
token2y = {}
count = 0
for i, rating in enumerate(ytxt):
    if rating not in y2token:
        y2token[rating] = count
        token2y[count] = rating
        count += 1
    y[i] = y2token[rating]
    ytokens.add(y2token[rating])

y


array([[0],
       [1],
       [1],
       ...,
       [0],
       [3],
       [3]], dtype=uint8)

In [27]:
# classification
# naive bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train = X
y_train = y

# Create a CountVectorizer to convert text into numerical features
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_vectorized)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6561264822134387
