In [1]:
import numpy as np
import pandas as pd

import string
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the pickled classifier that the later cells call predict / predict_proba on.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
with open('models/document_classifier_model.pkl', 'rb') as file:
    load_model = pickle.load(file)

In [3]:
# Read the text to classify. Each line of the file becomes one list element
# (line terminators are kept, exactly as when iterating over the file object).
with open('new-test-data.txt', 'r', encoding='utf8') as file_reader:
    file_content = file_reader.readlines()

file_content

['The president of the Socialist group listed the concessions obtained, and said he wanted to give “every chance to negotiation”. He warned, however, that “a vote of no confidence is possible at any time”. The vote on the motion of no confidence tabled by the “insoumis” will take place between 5:38pm and 5:58pm.']

In [4]:
# Wrap the raw lines in a one-column frame named 'content'.
df = pd.DataFrame({'content': file_content})
df

Unnamed: 0,content
0,The president of the Socialist group listed th...


In [5]:
###Define a function that removes punctuation characters
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    any other character (letters, digits, whitespace, typographic quotes, ...)
    is kept unchanged.
    """
    # A translation table that maps each punctuation character to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [6]:
###Define a function that tokenizes our data
def tokenizeText(text):
    """Lower-case ``text`` and return its word tokens joined by single spaces.

    A token is a maximal run of word characters (regex ``\\w+``). Text that
    starts or ends with a non-word character no longer produces empty tokens,
    so the result has no leading or trailing space.

    Args:
        text: The string to tokenize.

    Returns:
        The lower-cased tokens separated by one space; ``''`` when ``text``
        contains no word characters.
    """
    # Raw-string pattern: the previous non-raw '\W+' relied on an invalid
    # escape sequence, which Python warns about.
    return ' '.join(re.findall(r'\w+', text.lower()))

In [7]:
# Stopword lists from the NLTK corpus, fetched in the order French then English.
french_stopwords, english_stopwords = (
    stopwords.words(language) for language in ('french', 'english')
)

In [8]:
###Define a function that removes stopwords (English and French)
def applyStopwords(text):
    """Remove English and French stopwords from a whitespace-separated string.

    Args:
        text: The string to filter; it is split on whitespace.

    Returns:
        The remaining words joined by single spaces. Matching is exact, so a
        word is only removed when it equals a stopword character for character.
    """
    # Merge both lists into one set so each word costs a single hash lookup
    # instead of two linear scans over the stopword lists.
    excluded = set(english_stopwords).union(french_stopwords)
    return ' '.join(word for word in text.split() if word not in excluded)

In [9]:
###Apply lemmatization to our data
def applyLemmatization(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are joined back with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = [lemmatize(token) for token in text.split()]
    return ' '.join(lemmas)

In [10]:
# Load the pickled vectorizer. This notebook only ever calls .transform() on it,
# so it is presumably already fitted on the training corpus — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('models/count_vectorizer.pkl', 'rb') as file:
    countVectorizer = pickle.load(file)

In [11]:
def input_file_processing(_dsf: pd.DataFrame):
    """Clean the ``content`` column of ``_dsf`` and vectorize it.

    Steps, in order: drop duplicate rows, strip surrounding whitespace and
    drop rows left empty, remove punctuation, tokenize, remove stopwords,
    lemmatize, then transform the cleaned text with ``countVectorizer``.

    Args:
        _dsf: Frame with a string ``content`` column. It is modified in
            place: rows may be removed and a ``cleaned_content`` column is
            added.

    Returns:
        The dense array produced by
        ``countVectorizer.transform(...).toarray()``, one row per remaining
        row of ``_dsf``.
    """
    # Fix: the body previously ignored its argument and operated on the
    # notebook-global `df`; every step now works on the frame passed in.

    ###Remove duplicates
    _dsf.drop_duplicates(inplace=True)

    ###Strip surrounding whitespace (' ', \n, ...) and drop rows left empty
    _dsf['content'] = _dsf['content'].str.strip().replace('', np.nan)
    _dsf.dropna(inplace=True)

    ###Remove punctuation
    _dsf['cleaned_content'] = _dsf['content'].apply(remove_punctuation)

    ###Apply tokenization
    _dsf['cleaned_content'] = _dsf['cleaned_content'].apply(tokenizeText)

    ###Remove stopwords
    _dsf['cleaned_content'] = _dsf['cleaned_content'].apply(applyStopwords)

    ###Apply lemmatization
    _dsf['cleaned_content'] = _dsf['cleaned_content'].apply(applyLemmatization)

    ###Turn the cleaned text into vectors (transform only, never fit, on new data)
    cleaned_content_vectorized = countVectorizer.transform(_dsf['cleaned_content'])

    return cleaned_content_vectorized.toarray()

In [12]:
# Run the cleaning + vectorization pipeline on the input frame and check the
# resulting matrix shape.
X = input_file_processing(df)
X.shape

(1, 30546)

In [13]:
# Predict a label for the vectorized rows in X with the loaded model.
y_predict = load_model.predict(X)
y_predict

array(['politics'], dtype=object)

In [19]:
def find_most_frequent_item(arr):
    """Return the value that occurs most often in ``arr``.

    When several values share the highest count, the one that comes first in
    the sorted order of unique values is returned.
    """
    values, frequencies = np.unique(arr, return_counts=True)
    winner_index = frequencies.argmax()
    return values[winner_index]

In [20]:
# Most frequent label among all predictions in y_predict.
find_most_frequent_item(y_predict)

'politics'

In [21]:
# Class probabilities for the rows in X; the column order presumably follows
# load_model.classes_ — TODO confirm.
predicted_probs = load_model.predict_proba(X)
predicted_probs

array([[0.11773509, 0.02342258, 0.85159163, 0.00287646, 0.00437424]])

In [22]:
# Largest probability anywhere in the array (no axis is given), so this is a
# global maximum, not a per-row one.
predicted_probs.max()

0.8515916275682073