In [30]:
# Import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# nltk for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, r2_score
from sklearn.ensemble import RandomForestClassifier

from sqlalchemy import create_engine
from config import username, password, db_name
engine = create_engine(f'postgresql://{username}:{password}@localhost:5432/{db_name}')
connection = engine.connect()

In [None]:
# Load the dataset
dataset = pd.read_sql("SELECT * FROM news", connection)
dataset.head()

In [35]:
# Vectorise word data
tfidf = TfidfVectorizer(lowercase=False, stop_words='english', max_features=10000)

In [36]:
text = dataset['text']
X = tfidf.fit_transform(text)
y = dataset['label']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)

rf_model = rf.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

In [45]:
rf_f1 = round(f1_score(y_test, y_pred, average= 'weighted'), 3)
rf_accuracy = round((accuracy_score(y_test, y_pred) * 100), 2)

print("classification report :")
print(classification_report(y_test, y_pred))
print("------------------------------------------------------")
print("confusion matrix : ")
print(confusion_matrix(y_test, y_pred))
print("------------------------------------------------------")
print("Accuracy : " , rf_accuracy , "%")
print("f1_score : " , rf_f1)

classification report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6412
           1       1.00      1.00      1.00      6859

    accuracy                           1.00     13271
   macro avg       1.00      1.00      1.00     13271
weighted avg       1.00      1.00      1.00     13271

------------------------------------------------------
confusion matrix : 
[[6401   11]
 [  21 6838]]
------------------------------------------------------
Accuracy :  99.76 %
f1_score :  0.998


In [43]:
import joblib
# Save the model
joblib.dump(rf_model , 'rf_model.pkl')

# Save the vectorizer
joblib.dump(tfidf, open("rf_model.pkl", "wb"))