## Importation des librairies nécessaires

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pickle

## Importation des données

In [9]:
data = pd.read_csv('le_matin_articles.csv')
X, y = data.content, data.category

## Text Preprocessing

In [10]:
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

documents = []

for sen in range(0, len(X)):
  document = re.sub(r'\W', ' ', str(X[sen]))
  document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
  document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
  document = re.sub(r'\s+', ' ', document, flags=re.I)
  document = re.sub(r'^b\s+', '', document)
  document = document.lower()
  document = document.split()
  document = [stemmer.lemmatize(word) for word in document]
  document = ' '.join(document)
  documents.append(document)

## Bag Of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('french'))
X = vectorizer.fit_transform(documents).toarray()

## TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

## Training Model

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [14]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [15]:
y_pred = classifier.predict(X_test)

## Model Evaluation

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))

                   precision    recall  f1-score   support

activites-royales       0.32      0.39      0.35       192
          culture       0.84      0.41      0.55       230
         economie       0.43      0.54      0.48       295
           emploi       0.91      0.19      0.32        52
            monde       0.51      0.43      0.47       253
           nation       0.40      0.49      0.44       284
          regions       0.38      0.25      0.30       228
          societe       0.34      0.46      0.39       266

         accuracy                           0.42      1800
        macro avg       0.51      0.39      0.41      1800
     weighted avg       0.47      0.42      0.43      1800



## Saving Model

In [17]:
with open('text_classifier', 'wb') as picklefile:
  pickle.dump(classifier,picklefile)

## Loading Model

In [18]:
with open('text_classifier', 'rb') as training_model:
  model = pickle.load(training_model)

In [19]:
y_pred2 = model.predict(X_test)
print(classification_report(y_test, y_pred2))

                   precision    recall  f1-score   support

activites-royales       0.32      0.39      0.35       192
          culture       0.84      0.41      0.55       230
         economie       0.43      0.54      0.48       295
           emploi       0.91      0.19      0.32        52
            monde       0.51      0.43      0.47       253
           nation       0.40      0.49      0.44       284
          regions       0.38      0.25      0.30       228
          societe       0.34      0.46      0.39       266

         accuracy                           0.42      1800
        macro avg       0.51      0.39      0.41      1800
     weighted avg       0.47      0.42      0.43      1800

