In [1]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
datafake = pd.read_csv('Fake.csv')
datatrue = pd.read_csv('True.csv')

# Assign class labels
datafake["class"] = 0
datatrue["class"] = 1

# Remove last 10 rows from datafake and datatrue for manual testing
datafake_manual_testing = datafake.tail(10)
datafake.drop(datafake.index[-10:], inplace=True)

datatrue_manual_testing = datatrue.tail(10)
datatrue.drop(datatrue.index[-10:], inplace=True)

datafake_manual_testing['class'] = 0
datatrue_manual_testing['class'] = 1

# Merge the datasets
data_merge = pd.concat([datafake, datatrue], axis=0)
data_merge.head(5)
data_merge.columns

# Drop unnecessary columns and reset index
data = data_merge.drop(['title', 'subject', 'date'], axis=1)
data.reset_index(inplace=True)
data.drop(['index'], axis=1, inplace=True)

# Preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub('\[.*?]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

data['text'] = data['text'].apply(preprocess_text)

# Vectorize the text using TfidfVectorizer
vectorization = TfidfVectorizer()
x = vectorization.fit_transform(data['text'])

# Split the data
y = data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Train the Logistic Regression model
LR = LogisticRegression()
LR.fit(x_train, y_train)
pred_lr = LR.predict(x_test)

# Preprocess true news for similarity checking using the same vectorizer
datatrue['text'] = datatrue['text'].apply(preprocess_text)
true_news_vectorized = vectorization.transform(datatrue['text'])

def output_label(n):
    return "This news is Fake" if n == 0 else "This is not a fake news"

def find_related_true_news(fake_news_vector):
    similarities = cosine_similarity(fake_news_vector, true_news_vectorized)
    most_similar_idx = similarities.argmax()
    return datatrue.iloc[most_similar_idx]['text']

def manual_testing(news):
    news_preprocessed = preprocess_text(news)
    news_vectorized = vectorization.transform([news_preprocessed])
    pred_LR = LR.predict(news_vectorized)
    
    if pred_LR[0] == 0:
        related_true_news = find_related_true_news(news_vectorized)
        result = "This news is Fake"
        related_news = f"\nRelated true news:\n{related_true_news}"
    else:
        result = "This is not a fake news"
        related_news = ""
    
    return print(f"\n\nLR Prediction: {result}{related_news}")

# Test with an input news


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datafake_manual_testing['class'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datatrue_manual_testing['class'] = 1


In [4]:
news = str(input("Enter news text: "))
manual_testing(news)


Enter news text:  A U.S. appeals court on Friday said President Donald Trump's hotly contested travel ban targeting people from six Muslim-majority countries should not be applied to people with strong U.S. ties




LR Prediction: This is not a fake news
