## Importation des bibliothèques nécessaires

In [None]:
# Core data / text-processing dependencies used throughout the notebook.
import pandas as pd
import numpy as np
import re
import nltk
# Download the NLTK resources this notebook relies on:
#   - 'stopwords': English stop-word list passed to CountVectorizer below
#   - 'wordnet':   lexical database behind WordNetLemmatizer
#   - 'omw-1.4':   presumably required alongside 'wordnet' by recent NLTK
#                  releases -- TODO confirm for the installed version
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# pickle is used at the end of the notebook to save / reload the trained model.
import pickle

## Importation des données

In [54]:
# Read the article dump and separate the raw text (features) from the
# category label (target).
data = pd.read_csv('morocco_world_news_articles.csv')
X = data['content']
y = data['category']

## Text Preprocessing

In [55]:
from nltk.stem import WordNetLemmatizer

# NOTE: despite its name, `stemmer` is a WordNet *lemmatizer*; the name is kept
# so that any other cell referring to it keeps working.
stemmer = WordNetLemmatizer()


def _clean_document(text):
    """Normalise one raw article into a space-separated string of lemmas.

    Steps, in order:
      1. replace every non-word character with a space;
      2. drop single letters surrounded by whitespace;
      3. drop a single letter at the very start of the text;
      4. collapse runs of whitespace into one space;
      5. lower-case, split on whitespace and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw article content.

    Returns
    -------
    str
        The cleaned, lemmatized document.
    """
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # The anchor must be a bare `^`. The previous pattern `\^` looked for a
    # literal caret, which step 1 has already replaced by a space, so the rule
    # could never match. This rule also covers a stray leading "b " prefix,
    # which is why no separate `^b\s+` substitution is needed any more.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.lower().split()
    return ' '.join(stemmer.lemmatize(token) for token in tokens)


# Iterate over the values rather than indexing with X[i]: label-based lookup
# on a Series fails as soon as the index is not exactly 0..n-1 (e.g. after
# rows were filtered out). Missing content becomes an empty document instead
# of the literal token "nan", and one entry is still produced per row so that
# `documents` stays aligned with `y`.
documents = [_clean_document('' if pd.isna(raw) else str(raw)) for raw in X]

## Bag Of Words

In [56]:
# `stopwords` was used below without ever being imported, which raises a
# NameError on a fresh kernel; import it here next to its only use.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts restricted to the 1500 most frequent terms that appear
# in at least 5 documents and in at most 70% of them, ignoring English
# stop words.
vectorizer = CountVectorizer(
    max_features=1500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
X = vectorizer.fit_transform(documents).toarray()

## TF-IDF

In [57]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the term counts with TF-IDF.
# Caution: this cell overwrites X with a value derived from X, so re-running
# it on its own (without re-running the bag-of-words cell first) applies the
# transformation a second time.
tfidfconverter = TfidfTransformer()
tfidf_matrix = tfidfconverter.fit_transform(X)
X = tfidf_matrix.toarray()

## Training Model

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 1000 trees are expensive to grow one after another; n_jobs=-1 builds them in
# parallel on all available cores. The fixed random_state keeps the fitted
# model reproducible.
classifier = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

In [60]:
# Predict a category for every held-out sample; compared with y_test below.
y_pred = classifier.predict(X_test)

## Model Evaluation

In [61]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-category precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    covid-19       0.88      0.92      0.90       277
     economy       0.75      0.79      0.77       130
   education       0.80      0.88      0.84       279
maghreb-news       0.83      0.88      0.86       294
     opinion       0.84      0.92      0.88       280
    politics       0.89      0.37      0.52        84
     society       1.00      0.12      0.22        48

    accuracy                           0.83      1392
   macro avg       0.85      0.70      0.71      1392
weighted avg       0.84      0.83      0.82      1392



## Saving Model

In [62]:
# Serialise the fitted classifier to disk. Only the classifier is written
# here; the fitted `vectorizer` and `tfidfconverter` are not part of this file.
with open('text_classifier', mode='wb') as model_file:
    pickle.dump(classifier, model_file)

## Loading Model

In [63]:
# Restore the classifier from disk.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load a 'text_classifier' file that this notebook (or a trusted source) wrote.
with open('text_classifier', mode='rb') as model_file:
    model = pickle.load(model_file)

In [64]:
# Sanity check: score the reloaded model on the same held-out split so its
# report can be compared with the one printed for `classifier`.
y_pred2 = model.predict(X_test)
reloaded_report = classification_report(y_test, y_pred2)
print(reloaded_report)

              precision    recall  f1-score   support

    covid-19       0.88      0.92      0.90       277
     economy       0.75      0.79      0.77       130
   education       0.80      0.88      0.84       279
maghreb-news       0.83      0.88      0.86       294
     opinion       0.84      0.92      0.88       280
    politics       0.89      0.37      0.52        84
     society       1.00      0.12      0.22        48

    accuracy                           0.83      1392
   macro avg       0.85      0.70      0.71      1392
weighted avg       0.84      0.83      0.82      1392

