In [18]:
import numpy as np
import pandas as pd

import string
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# Deserialize the trained document classifier from disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load model files that come from a trusted source.
with open('models/document_classifier_model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [20]:
# Read the document to classify: one list entry per line of the file, with the
# trailing newline of each line kept.
with open('new-test-data.txt', 'r', encoding='utf8') as file_reader:
    file_content = file_reader.readlines()

file_content

["Despite qualifying for the quarter-finals of the King's Cup after beating Celta in extra time, the joy was short-lived for Real Madrid, who this morning received further bad news due to injury.\n",
 '\n',
 'On Friday January 17, the capital club announced that Eduardo Camavinga, who came off the bench during the cup match, is suffering from muscle damage to the biceps femoris in his left leg. \n',
 '\n',
 "After undergoing various tests at the club's medical department, the Frenchman has been ruled out of the next few matches. The club says that his return to the squad will depend on the progress of his recovery, as is usually the case. \n",
 '\n',
 "However, initial reports suggest that he will be unavailable for around three weeks, which means that Carlo Ancelotti, on paper, will not be able to count on him for the matches against Las Palmas, Salzburg, Real Valladolid, Brest and Espanyol, casting doubt on his presence for the decisive derby against Atlético de Madrid, which takes p

In [21]:
# One row per line read from the input file, held in a single 'content' column.
df = pd.DataFrame(file_content, columns=['content'])
df

Unnamed: 0,content
0,Despite qualifying for the quarter-finals of t...
1,\n
2,"On Friday January 17, the capital club announc..."
3,\n
4,After undergoing various tests at the club's m...
5,\n
6,"However, initial reports suggest that he will ..."
7,\n
8,Translated with DeepL.com (free version)


In [22]:
### Helper that strips punctuation characters from a text.
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [23]:
### Helper that tokenizes a text.
def tokenizeText(text):
    """Lower-case ``text`` and return its word tokens joined by single spaces.

    A token is a maximal run of ``\\w`` characters. Everything else acts as a
    separator, so text that starts or ends with a separator does not produce
    empty tokens (and therefore no stray leading/trailing space in the result).

    Args:
        text: The string to tokenize.

    Returns:
        The lower-cased tokens joined with ``' '``; ``''`` when there are none.
    """
    # Raw string: '\W' / '\w' in a plain literal is an invalid escape sequence.
    return ' '.join(re.findall(r'\w+', text.lower()))

In [24]:
# NLTK stop-word lists for the two languages handled by applyStopwords
# (French is loaded first, then English).
french_stopwords, english_stopwords = (
    stopwords.words(language) for language in ('french', 'english')
)

In [25]:
### Helper that removes stop words (English and French).
def applyStopwords(text):
    """Return ``text`` without the words listed in the English or French stop words.

    ``text`` is split on whitespace; the remaining words are joined back with
    single spaces. Matching is exact, against ``english_stopwords`` and
    ``french_stopwords``.
    """
    kept_words = []
    for word in text.split():
        if word in english_stopwords or word in french_stopwords:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

In [26]:
### Helper that lemmatizes a text.
def applyLemmatization(text):
    """Lemmatize each whitespace-separated word of ``text`` with WordNet.

    Each word is passed on its own to ``WordNetLemmatizer.lemmatize`` (no
    part-of-speech argument is given); the results are joined with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, text.split()))

In [27]:
# Deserialize the vectorizer that input_file_processing uses to encode text.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load files that come from a trusted source.
with open('models/count_vectorizer.pkl', 'rb') as vectorizer_file:
    countVectorizer = pickle.load(vectorizer_file)

In [28]:
def input_file_processing(_dsf: pd.DataFrame) -> np.ndarray:
    """Clean the ``content`` column of ``_dsf`` and vectorize it.

    Steps, in order:
      1. drop duplicate rows;
      2. strip surrounding whitespace from ``content`` and drop rows left empty
         (or containing missing values);
      3. remove punctuation, tokenize, remove stop words and lemmatize each text;
      4. encode the cleaned texts with the already-fitted ``countVectorizer``
         (``transform`` only, never ``fit``).

    The frame passed in is not modified: every step works on a new object, so
    the function can be re-run and gives the same result each time.

    Args:
        _dsf: Frame holding the raw text in a ``content`` column.

    Returns:
        The dense array produced by ``countVectorizer.transform(...).toarray()``,
        one row per text that survived the cleaning steps.

    Raises:
        KeyError: If ``_dsf`` has no ``content`` column.
    """
    # Use the argument rather than the notebook-level ``df``; the non-inplace
    # calls below each return a new frame, leaving the caller's data untouched.
    frame = _dsf.drop_duplicates()

    # Whitespace-only entries (' ', '\n', ...) become NaN and are dropped.
    frame = frame.assign(content=frame['content'].str.strip().replace('', np.nan))
    frame = frame.dropna()

    # Text normalization pipeline; the order of the steps matters.
    cleaned_content = frame['content']
    for step in (remove_punctuation, tokenizeText, applyStopwords, applyLemmatization):
        cleaned_content = cleaned_content.apply(step)

    # Turn the cleaned texts into count vectors using the fitted vocabulary.
    return countVectorizer.transform(cleaned_content).toarray()

In [29]:
# Build the feature matrix for the new document and check its dimensions.
X = input_file_processing(df)
X.shape

(5, 30546)

In [30]:
# Predict one class label per row of X with the loaded classifier.
y_predict = load_model.predict(X)
y_predict

array(['sport', 'sport', 'sport', 'sport', 'tech'], dtype=object)

In [31]:
def find_most_frequent_item(arr):
    """Return the value that occurs most often in ``arr``.

    ``np.unique`` returns the distinct values in sorted order and ``argmax``
    picks the first maximum, so on a tie the smallest of the tied values wins.
    """
    values, occurrences = np.unique(arr, return_counts=True)
    winner_index = occurrences.argmax()
    return values[winner_index]

In [32]:
# Majority vote: the label predicted most often across the rows of X.
find_most_frequent_item(y_predict)

'sport'

In [33]:
# Per-row class probabilities from the classifier.
# Column order presumably follows load_model.classes_ — TODO confirm.
predicted_probs = load_model.predict_proba(X)
predicted_probs

array([[1.01184967e-03, 9.82702233e-04, 3.78131732e-04, 9.95443769e-01,
        2.18354743e-03],
       [9.00719585e-04, 7.98508178e-04, 3.40100369e-04, 9.96320184e-01,
        1.64048818e-03],
       [1.44184178e-03, 1.18641039e-03, 5.57441767e-04, 9.95445558e-01,
        1.36874760e-03],
       [3.15619943e-03, 3.82414258e-03, 1.27704289e-03, 9.83699245e-01,
        8.04337046e-03],
       [3.40176836e-02, 1.75985646e-01, 3.44162389e-02, 2.18118851e-02,
        7.33768546e-01]])

In [34]:
# Highest single probability anywhere in the matrix (no axis is given).
# NOTE(review): this is a global maximum over all rows and classes, not a
# per-row confidence nor the confidence of the majority label — confirm intent.
predicted_probs.max()

0.9963201836864433

In [3]:
# from transformers import pipeline

# def generer_titre_intelligent(contenu):
#     # Load a pre-trained summarization model (used here to generate the title)
#     generer_titre = pipeline("summarization", model="facebook/bart-large-cnn")
#     # Generate a summary of the content, which will serve as the title
#     resume = generer_titre(contenu, max_length=60, min_length=10, do_sample=False)
#     return resume[0]['summary_text']

# # Read the file content (replace the path with the path to your own file)
# with open('new-test-data.txt', 'r', encoding='utf-8') as f:
#     contenu = f.read()

# titre = generer_titre_intelligent(contenu)
# print(f"Titre généré : {titre}")