# Import libraries

In [1]:
import os
import pickle
import re
import string
from pathlib import Path

import contractions
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset

In [2]:
# Read the modelling dataset from a CSV in the working directory; later cells use its "title_text" and "label" columns
data_model = pd.read_csv('data_model.csv')

In [3]:
# Display (rows, columns) as a quick check that the file loaded
data_model.shape

(18108, 5001)

# Train Test Split

In [4]:
# Feature text and target label come straight from the loaded frame
X, y = data_model["title_text"], data_model["label"]

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF vocabulary and weights are learned from the training text only;
# the test text is transformed with that already-fitted vectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Handle imbalanced dataset

In [5]:
# Apply SMOTE + ENN to balance the training dataset.
# Only the vectorized training split is resampled (seeded with 42); xv_test / y_test
# are not touched here, so the evaluation cells score against the un-resampled hold-out split.
smote_enn = SMOTEENN(random_state=42)
xv_train_resampled, y_train_resampled = smote_enn.fit_resample(xv_train, y_train)

# Logistic regression

In [6]:
# Fit a default-configured logistic regression on the resampled training matrix
# (fit returns the estimator itself, so construction and fitting are chained)
lr = LogisticRegression().fit(xv_train_resampled, y_train_resampled)

# Score the untouched test split
predictions_lr = lr.predict(xv_test)

# Per-class precision / recall / F1 on the hold-out data
print("Logistic Regression Performance:")
print(classification_report(y_test, predictions_lr))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1367
           1       0.98      0.86      0.92      2255

    accuracy                           0.91      3622
   macro avg       0.90      0.92      0.90      3622
weighted avg       0.92      0.91      0.91      3622



In [7]:
"""
The Logistic Regression model performs well, achieving an overall accuracy of 91%. For class 1 (Not A Fake News) it shows
high precision (98%) but lower recall (86%), meaning that it misclassifies about 14% of true news as fake. For class 0
(Fake News) recall is very high (98%) while precision is lower (81%): the model catches almost all fake news, but a
noticeable share of the articles it flags as fake are actually true, which leaves room for improvement in handling true news.
"""

'\nThe Logistic Regression model performs well, achieving an overall accuracy of 91%. For class 1 (Not A Fake News) it shows\nhigh precision (98%) but lower recall (86%), meaning that it misclassifies about 14% of true news as fake. For class 0\n(Fake News) recall is very high (98%) while precision is lower (81%): the model catches almost all fake news, but a\nnoticeable share of the articles it flags as fake are actually true, which leaves room for improvement in handling true news.\n'

# Random forest

In [8]:
# Seeded random forest trained on the resampled training matrix
# (fit returns the estimator itself, so construction and fitting are chained)
rf = RandomForestClassifier(random_state=42).fit(xv_train_resampled, y_train_resampled)

# Score the untouched test split
predictions_rf = rf.predict(xv_test)

# Per-class precision / recall / F1 on the hold-out data
print("Random Forest Classifier Performance:")
print(classification_report(y_test, predictions_rf))

Random Forest Classifier Performance:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1367
           1       0.98      0.92      0.95      2255

    accuracy                           0.94      3622
   macro avg       0.93      0.94      0.94      3622
weighted avg       0.94      0.94      0.94      3622



In [9]:
"""
The Random Forest model shows excellent performance with an accuracy of 94%. It balances precision and recall well across both 
classes, with precision and recall for class 1 (Not A Fake News) at 98% and 92%, respectively. It performs particularly well 
in minimizing false negatives, making it very robust for this problem. This model edges out the Logistic Regression model due 
to better performance on both classes.
"""

'\nThe Random Forest model shows excellent performance with an accuracy of 94%. It balances precision and recall well across both \nclasses, with precision and recall for class 1 (Not A Fake News) at 98% and 92%, respectively. It performs particularly well \nin minimizing false negatives, making it very robust for this problem. This model edges out the Logistic Regression model due \nto better performance on both classes.\n'

# Support vector machine

In [None]:
# Support vector classifier with a linear kernel, trained on the resampled training matrix
# (fit returns the estimator itself, so construction and fitting are chained)
svm = SVC(kernel='linear').fit(xv_train_resampled, y_train_resampled)

# Score the untouched test split
predictions_svm = svm.predict(xv_test)

# Per-class precision / recall / F1 on the hold-out data
print("Support Vector Machine Performance:")
print(classification_report(y_test, predictions_svm))

In [None]:
"""
The SVM model also delivers high performance, matching the Random Forest model in overall accuracy (94%). It demonstrates a 
strong balance between precision and recall, achieving 98% precision and 92% recall for class 1 (Not A Fake News), similar to 
the Random Forest. Its recall for class 0 (Fake News) is slightly higher, making it highly reliable in identifying fake news 
while maintaining accuracy in classifying true news.
"""

# Decision tree

In [None]:
# Seed the tree so repeated runs build the same model and report the same scores;
# 42 matches the seed already used for the split, SMOTEENN and the random forest.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xv_train_resampled, y_train_resampled)

# Score the untouched test split
predictions_dt = dt.predict(xv_test)

# Per-class precision / recall / F1 on the hold-out data
print("Decision Tree Classifier Performance:")
print(classification_report(y_test, predictions_dt))

In [None]:
"""
The Decision Tree model shows strong results, with an accuracy of 91%. It has a slight drop in precision for class 0 
(Fake News) compared to other models (84%) but maintains a good balance overall, with 95% precision for class 1 
(Not A Fake News). Although its performance is not as high as Random Forest or SVM, it is still a solid option, particularly 
when interpretability is a key concern due to the nature of decision trees.
"""

# MODEL TESTING

In [None]:
# Define text cleaning function
def text_cleaner(text):
    """Normalise a raw news string into space-separated, lemmatized tokens.

    The pattern-based removals (bracketed spans, URLs, HTML tags) and the
    contraction expansion run BEFORE non-word characters are blanked out.
    Previously the ``\\W`` substitution ran first, which stripped the ``://``,
    ``.``, ``<``, ``>`` and ``'`` characters those steps rely on, so they
    never matched anything.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        A single string of the remaining tokens joined by one space.

    NOTE(review): the fitted vectorizer only sees consistent input if the
    training corpus was cleaned with the same step order - confirm against the
    notebook that produced the training data and re-run it if it used the old order.
    """
    # Convert text to lowercase (also makes the URL pattern effectively case-insensitive)
    text = text.lower()
    # Remove square brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)
    # Remove URLs or website addresses while "://" and "." are still present
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags while "<" and ">" are still present
    text = re.sub(r'<.*?>+', '', text)
    # Expand contractions (e.g., "it's" to "it is") while apostrophes are still present;
    # lower-case again in case an expansion introduces capital letters
    text = contractions.fix(text).lower()
    # Replace non-word characters with a space (this also turns newlines into
    # spaces, so no separate newline-removal step is needed)
    text = re.sub(r"\W", " ", text)
    # Remove punctuation marks that count as word characters (i.e. the underscore)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Remove alphanumeric words containing digits
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove stopwords and perform lemmatization
    stopwords_set = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords_set]
    return " ".join(tokens)

# Function to output the label based on prediction
def output_label(prediction):
    """Translate a predicted class value into its display label.

    A value equal to 1 maps to "Not A Fake News"; every other value maps to
    "Fake News".
    """
    if prediction == 1:
        return "Not A Fake News"
    return "Fake News"

# MODEL TESTING
# Manual testing with rf classifier
def rf_manual_testing(news, vectorization, rf):
    """Classify one piece of text with the random forest and print the verdict.

    Args:
        news: Raw article text to classify.
        vectorization: Fitted vectorizer; its ``transform`` is applied to the cleaned text.
        rf: Fitted classifier; its ``predict`` is called on the vectorized text.

    Prints the label produced by ``output_label`` and returns nothing.
    """
    # Wrap the single article in a one-row "text" column and clean it with text_cleaner
    cleaned_text = pd.DataFrame({"text": [news]})["text"].apply(text_cleaner)

    # Project the cleaned text into the vectorizer's feature space
    features = vectorization.transform(cleaned_text)

    # Only one row was submitted, so the first prediction is the answer
    verdict = output_label(rf.predict(features)[0])

    print("\nRandom Forest Prediction: {}".format(verdict))

# Input for testing.
# input() already returns a str, so no extra conversion is needed; surrounding
# whitespace is dropped so a blank entry can be detected.
news = input("Enter the news article text for testing: ").strip()

if news:
    rf_manual_testing(news, vectorization, rf)
else:
    # An empty article would be vectorized to all zeros and still receive a
    # label, which would be meaningless - skip the prediction instead.
    print("No text entered; skipping the manual test.")

In [None]:
# Define the path.
# The FAKE_NEWS_MODEL_DIR environment variable overrides the output folder so the
# notebook is not tied to one machine; the original location stays as the default.
model_path = os.environ.get(
    "FAKE_NEWS_MODEL_DIR",
    r'C:\Users\elora\Desktop\IRONHACK\PROJECTS\FINAL PROJECT FAKE NEWS\DATA AND NOTEBOOKS\streamlit_app\Models',
)
model_dir = Path(model_path)

# Create the folder (and any missing parents) so opening the files for writing
# cannot fail on a missing directory
model_dir.mkdir(parents=True, exist_ok=True)

# Save the trained rf model to a pickle file
# (Path's "/" operator joins with the platform's separator instead of a hard-coded backslash)
with open(model_dir / 'random_forest.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

# Save the TF-IDF vectorizer to a pickle file
with open(model_dir / 'tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorization, vectorizer_file)

print("Random Forest model and vectorizer saved successfully.")