In [1]:
import numpy as np
import pandas as pd
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the full labelled dataset; later cells read its 'text' and 'label' columns.
data = pd.read_csv(filepath_or_buffer="full_dataset.csv")

In [3]:
def preprocess_text(text):
    """Normalize raw article text before it is vectorized.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not an ASCII letter, digit or whitespace, then
    collapse whitespace runs into single spaces and trim both ends.

    Parameters
    ----------
    text : str, None, float or other
        Raw text. ``None`` and NaN (e.g. a missing value in a pandas
        column) are treated as empty text; any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        # Without this guard a single missing value raises AttributeError on
        # .lower() and aborts the whole Series.apply().
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
# Replace the raw article text with its cleaned form.
data['text'] = data['text'].apply(preprocess_text)

x = data['text']
y = data['label']

# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and therefore the reported accuracy and the saved model, are reproducible
# when the notebook is re-run from a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
vectorizer = TfidfVectorizer()
x_train_vectorized = vectorizer.fit_transform(raw_documents=x_train)
x_test_vectorized = vectorizer.transform(raw_documents=x_test)

# Persist the fitted vectorizer to disk.
joblib.dump(value=vectorizer, filename='vectorizer.pkl')

['vectorizer.pkl']

In [6]:
# Train a logistic-regression classifier on the TF-IDF training features
# (fit() returns the fitted estimator itself).
LR = LogisticRegression().fit(x_train_vectorized, y_train)

# Class predictions for the held-out split.
pred_lr = LR.predict(x_test_vectorized)

# Mean accuracy on the held-out split.
lr_test_accuracy = LR.score(x_test_vectorized, y_test)
print(lr_test_accuracy)

0.9566732412886259


In [7]:
# DT = DecisionTreeClassifier()
# DT.fit(x_train_vectorized, y_train)

# pred_dt = DT.predict(x_test_vectorized)

# print(DT.score(x_test_vectorized, y_test))

In [8]:
# GB = GradientBoostingClassifier(random_state = 0)
# GB.fit(x_train_vectorized, y_train)

# pred_gb = GB.predict(x_test_vectorized)

# print(GB.score(x_test_vectorized, y_test))

In [9]:
# RF = RandomForestClassifier(random_state = 0)
# RF.fit(x_train_vectorized, y_train)

# pred_rf = RF.predict(x_test_vectorized)

# print(RF.score(x_test_vectorized, y_test))

In [10]:
# Persist the trained logistic-regression model to disk.
joblib.dump(value=LR, filename='lr_model.pkl')

['lr_model.pkl']

In [11]:
def output_lable(n):
    """Map a numeric class label to its display string.

    Returns "Not a Fake News" for a value equal to 0, "Fake News" for a
    value equal to 1, and None for anything else.
    """
    label_names = (
        (0, "Not a Fake News"),
        (1, "Fake News"),
    )
    for code, name in label_names:
        if n == code:
            return name
    return None
    
def manual_testing(news):
    """Classify one piece of news text and print the verdict.

    The text is cleaned with ``preprocess_text``, vectorized with the
    notebook-level ``vectorizer`` and scored by the notebook-level ``LR``
    model. Nothing is returned; the predicted label and the highest class
    probability (as a percentage) are printed.
    """
    cleaned_news = preprocess_text(news)
    features = vectorizer.transform([cleaned_news])

    predicted = LR.predict(features)
    class_probabilities = LR.predict_proba(features)

    # Confidence is the probability of the most likely class.
    confidence = class_probabilities[0].max()
    verdict = output_lable(predicted[0])

    print(
        f"""
        LR prediction: {verdict}
        Confidence: {confidence * 100:.2f}%
        """
    )


In [12]:
# Read both sample articles first, then classify them.
with open("test_true.txt", "r", encoding="utf-8") as true_file:
    text_true = true_file.read()

with open("test_fake.txt", "r", encoding="utf-8") as fake_file:
    text_fake = fake_file.read()

# Classify the sample from test_true.txt, then the one from test_fake.txt.
manual_testing(text_true)
manual_testing(text_fake)


        LR prediction: Not a Fake News
        Confidence: 55.90%
        

        LR prediction: Fake News
        Confidence: 85.07%
        
