In [2]:
import pandas as pd
fake_df = pd.read_csv("Fake.csv")
true_df = pd.read_csv("True.csv")


In [3]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
fake_df.shape

(23481, 4)

In [5]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
true_df.shape

(21417, 4)

In [7]:
fake_df["label"] = 1   # fake
true_df["label"] = 0   # true

In [8]:
data = pd.concat([fake_df, true_df], axis=0)
data.head()



Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [9]:
# Combine relevant columns
data['Document'] = data['subject'] + " " + data['title'] + " " + data['text']

In [10]:
data = data.drop(columns=['date'])

In [11]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only once)
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove punctuation / special characters
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. contractions 
    text = contractions.fix(text)   # <-- handle contractions here can’t to cannot
    
    # 4. Remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    
    # 5. Remove Short Words
    words = [word for word in words if len(word) >= 3]  # remove short words

    # 4. Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back to string
    return ' '.join(words)

# Apply to your column
data['Document_Cleaned'] = data['Document'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to C:\Users\Faisal
[nltk_data]     Zamir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Faisal
[nltk_data]     Zamir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Suppose you have cleaned text in df['Cleaned_Text']
tfidf = TfidfVectorizer(max_features=5000)  # you can change features
X_tfidf = tfidf.fit_transform(data['Document_Cleaned'])

# Get feature names (vocabulary)
words = tfidf.get_feature_names_out()

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (44898, 5000)


In [13]:
import pandas as pd

# Get feature names (words)
feature_names = tfidf.get_feature_names_out()

# Convert to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

# See top rows
tfidf_df.tail()


Unnamed: 0,abandon,abandoned,abbas,abc,abdullah,abe,abedin,ability,able,abortion,...,younger,youth,youtube,ypg,zealand,zero,zika,zimbabwe,zone,zuma
44893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44895,0.0,0.102708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
len(tfidf.get_feature_names_out()) # Check TF-IDF vocabulary size
X_tfidf.nnz # It will show Check non-zero elements
first_doc = X_tfidf[0]
# print(first_doc)


In [15]:
# data = data.drop(columns=['Document', 'subject','text','title'])
data.columns


Index(['title', 'text', 'subject', 'label', 'Document', 'Document_Cleaned'], dtype='object')

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# X = TF-IDF matrix (features)
X = X_tfidf

# y = labels (target)
y = data['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [17]:
# Train a model (e.g., Logistic Regression)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [18]:
# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9910913140311804
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4247
           1       0.99      0.99      0.99      4733

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall    = recall_score(y_test, y_pred)
f1        = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

import json

metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1
}

with open("fake_news_metrics.json", "w") as f:
    json.dump(metrics, f)


Accuracy: 0.9910913140311804
Precision: 0.9915487006127192
Recall: 0.9915487006127192
F1 Score: 0.9915487006127192


In [21]:
import joblib
# Save all necessary components
joblib.dump(model, "fake_news_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [22]:
model = joblib.load("fake_news_model.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")


In [23]:
import joblib
import re
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize preprocessing components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Preprocessing function identical to training"""
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove punctuation / special characters
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Handle contractions
    text = contractions.fix(text)
    
    # 4. Remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    
    # 5. Remove Short Words
    words = [word for word in words if len(word) >= 3]
    
    # 6. Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back to string
    return ' '.join(words)

def predict_single_news(text):
    """
    Predict if a single news text is fake or real
    
    Parameters:
    text (str): The news text to classify
    
    Returns:
    dict: Prediction results
    """
    
    # Preprocess the text
    cleaned_text = preprocess_text(text)
    
    # Transform using the loaded TF-IDF vectorizer
    text_tfidf = tfidf.transform([cleaned_text])
    
    # Make prediction
    prediction = model.predict(text_tfidf)[0]
    
    # Get probability scores
    try:
        prediction_proba = model.predict_proba(text_tfidf)[0]
        fake_prob = prediction_proba[1]  # Probability of being fake
        true_prob = prediction_proba[0]  # Probability of being true
        confidence = max(prediction_proba)
    except:
        fake_prob = "N/A"
        true_prob = "N/A"
        confidence = "N/A"
    
    # Determine result
    if prediction == 1:
        result = "FAKE NEWS"
        is_fake = True
    else:
        result = "REAL NEWS" 
        is_fake = False
    
    return {
        'prediction': int(prediction),
        'prediction_label': result,
        'is_fake': is_fake,
        'fake_probability': fake_prob,
        'true_probability': true_prob,
        'confidence': confidence,
    }

# Test with a single news text
test_news = "Technology company Apple Inc. announced its quarterly earnings today, reporting a 12% increase in revenue compared to the same period last year. The growth was driven by strong sales of their latest smartphone model."

# Make prediction
result = predict_single_news(test_news)

# Display results
print("🔍 FAKE NEWS DETECTION RESULT")
print("=" * 50)
print(f"Input Text: {test_news}")
print(f"Prediction: {result['prediction']} ({result['prediction_label']})")
print(f"Is Fake: {result['is_fake']}")
if result['confidence'] != "N/A":
    print(f"Confidence: {result['confidence']:.4f}")
    print(f"Fake Probability: {result['fake_probability']:.4f}")
    print(f"Real Probability: {result['true_probability']:.4f}")


# Simple boolean check
if result['is_fake']:
    print("🚨 WARNING: This appears to be FAKE NEWS!")
else:
    print("✅ This appears to be REAL NEWS!")

🔍 FAKE NEWS DETECTION RESULT
Input Text: Technology company Apple Inc. announced its quarterly earnings today, reporting a 12% increase in revenue compared to the same period last year. The growth was driven by strong sales of their latest smartphone model.
Prediction: 0 (REAL NEWS)
Is Fake: False
Confidence: 0.5976
Fake Probability: 0.4024
Real Probability: 0.5976
✅ This appears to be REAL NEWS!


In [None]:
# User will give only news and then it will predict everything
