Import libraries

In [1]:
import pandas as pd
import numpy as np
import joblib

Load Data

In [2]:
Liar_data =  pd.read_csv(r'C:\Users\Hp\Documents\Fake News Detection\LIAR plus dataset\LIAR_Cleaned_data.csv')
news_data =  pd.read_csv(r'C:\Users\Hp\Documents\Fake News Detection\LIAR plus dataset\News_Cleaned_data.csv')

Concat Datasets

In [3]:
# Combine datasets into one frame
df = pd.concat([Liar_data, news_data], ignore_index=True)

# Print the first few rows to verify
print(df.head())

                                                text  target
0  say the annies list political group support th...       0
1  when did the decline of coal start it started ...       1
2  hillary clinton agrees with john mccain by vot...       1
3  health care reform legislation is likely to ma...       0
4  the economic turnaround started at the end of ...       1


In [4]:
# Save df to a CSV file
df.to_csv('Combined_Cleaned_data.csv', index=False)

Get shape of data

In [5]:
df.shape

(57478, 2)

Check Missing Values

In [6]:
df.isnull().sum()

text      0
target    0
dtype: int64

In [7]:
X = df['text'].values
y = df['target'].values

In [8]:
print(X)

['say the annies list political group support thirdtrimester abortion on demand'
 'when did the decline of coal start it started when natural gas took off that started to begin in president george w bush administration'
 'hillary clinton agrees with john mccain by voting to give george bush the benefit of the doubt on iran'
 ...
 "['washington', 'reuter', 'presid', 'donald', 'trump', 'told', 'chief', 'execut', 'major', 'us', 'compani', 'thursday', 'plan', 'bring', 'million', 'job', 'back', 'unit', 'state', 'offer', 'specif', 'plan', 'revers', 'decadeslong', 'declin', 'factori', 'job', 'first', 'month', 'offic', 'trump', 'pressur', 'number', 'us', 'compani', 'hire', 'unit', 'state', 'yet', 'publicli', 'propos', 'legisl', 'tackl', 'big', 'econom', 'issu', 'campaign', '2016', 'includ', 'jobboost', 'tax', 'infrastructur', 'program', 'address', 'joint', 'session', 'congress', 'feb', '28', 'meet', 'two', 'dozen', 'ceo', 'white', 'hous', 'trump', 'said', 'unit', 'state', 'lost', 'onethird', '

 Feature Extraction Using TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)
X = tfidf_vectorizer.transform(X)

In [10]:
print(X)

  (0, 14475)	0.33016759132341994
  (0, 21704)	0.5437185742014611
  (0, 54981)	0.18967207907990202
  (0, 79294)	0.13309510708420363
  (0, 108629)	0.19189251263682183
  (0, 130064)	0.2112883974104716
  (0, 143683)	0.382046233914163
  (0, 162434)	0.09303800233528253
  (0, 176970)	0.1183412133512721
  (0, 181218)	0.12981053346936736
  (0, 182592)	0.5241501056050897
  (1, 16169)	0.2154471787802618
  (1, 29506)	0.12733055690817524
  (1, 36503)	0.14112368771493783
  (1, 44704)	0.1856266637980225
  (1, 54283)	0.3016875382466613
  (1, 56654)	0.20674466731180244
  (1, 75341)	0.2370886463837897
  (1, 76036)	0.2349603072300624
  (1, 92735)	0.10779310746830024
  (1, 96956)	0.11685153320687905
  (1, 122799)	0.2804279136657308
  (1, 128916)	0.11256845892985262
  (1, 128936)	0.21726896124456221
  (1, 146107)	0.17339003945636364
  :	:
  (57477, 165403)	0.040382986073456556
  (57477, 166207)	0.14146979531567594
  (57477, 166803)	0.05350291864987622
  (57477, 168357)	0.03580883761948761
  (57477, 168652)

Logistic Model

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train Logistic Regression Model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

train_y_pred =  model.predict(X_train)
print("Train Accuracy: ", accuracy_score(train_y_pred, y_train))


test_y_pred =  model.predict(X_test)
print("Test Accuracy: ", accuracy_score(test_y_pred, y_test))

print("\nClassification Report:")
print(classification_report(y_test, test_y_pred))

Train Accuracy:  0.9295593928058805
Test Accuracy:  0.9020528879610299

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      5812
           1       0.90      0.90      0.90      5684

    accuracy                           0.90     11496
   macro avg       0.90      0.90      0.90     11496
weighted avg       0.90      0.90      0.90     11496



In [12]:
X_train.shape

(45982, 205145)

In [13]:
X_test.shape

(11496, 205145)

In [14]:
# Save the model and vectorizer
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [15]:
# Prediction Function
def predict_fake_news(input_text):
    """Predict if the given text is fake or real news."""
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    loaded_model = joblib.load('fake_news_model.pkl')
    
    # Transform the input text
    input_vectorized = vectorizer.transform([input_text])
    prediction = loaded_model.predict(input_vectorized)
    
    return "Fake News" if prediction[0] == 1 else "Real News"