In [15]:
#Install Dependencies
!pip install nltk



In [16]:
#CELL 2 â€” Import Libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity


In [17]:
#CELL 3 â€” Download NLTK Resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\AAQUIB
[nltk_data]     AFTAB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\AAQUIB
[nltk_data]     AFTAB\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\AAQUIB
[nltk_data]     AFTAB\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
#CELL 4 â€” Load Dataset
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

true_df['label'] = 1
fake_df['label'] = 0

df = pd.concat([true_df, fake_df], axis=0).reset_index(drop=True)
df.head()


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [19]:
#CELL 5 â€” Remove Duplicates & Null Rows
df.drop_duplicates(subset="text", inplace=True)
df.dropna(subset=["text"], inplace=True)

df.shape


(38646, 5)

In [20]:
#CELL 6 â€” Cleaning Function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)        # Remove URLs
    text = re.sub(r'\d+', '', text)                          # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return " ".join(words)


In [22]:
#CELL 7 â€” Apply Cleaning
df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,WASHINGTON (Reuters) - The head of a conservat...,washington reuters head conservative republica...
1,WASHINGTON (Reuters) - Transgender people will...,washington reuters transgender people allowed ...
2,WASHINGTON (Reuters) - The special counsel inv...,washington reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,washington reuters trump campaign adviser geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,seattlewashington reuters president donald tru...


In [23]:
#CELL 8 â€” Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)


In [24]:
#CELL 9 â€” TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=50000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [25]:
#CELL 10 â€” Train Logistic Regression Model
model = LogisticRegression(max_iter=2000)
model.fit(X_train_tfidf, y_train)


In [30]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions
y_pred = model.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# F1 Score (binary classification)
f1 = f1_score(y_test, y_pred)

print("ðŸ“Š Model Evaluation Metrics")
print("------------------------------")
print("âœ” Accuracy Score :", accuracy)
print("âœ” F1 Score       :", f1)


ðŸ“Š Model Evaluation Metrics
------------------------------
âœ” Accuracy Score : 0.9855109961190168
âœ” F1 Score       : 0.986882173811197


In [31]:
# CELL 11 â€” Evaluate Model
pred = model.predict(X_test_tfidf)
print("Model Accuracy:", accuracy_score(y_test, pred))


Model Accuracy: 0.9855109961190168


In [27]:
#CELL 12 â€” Prediction Function
def predict_news(user_news):
    cleaned = clean_text(user_news)
    vec = tfidf.transform([cleaned])
    prediction = model.predict(vec)[0]
    
    if prediction == 1:
        print("\nðŸŸ¢ The news is MOST LIKELY TRUE.\n")
    else:
        print("\nðŸ”´ The news is MOST LIKELY FAKE.\n")
    
    # Show similar true news
    print("Similar TRUE news articles:\n")
    
    true_texts = df[df.label == 1]['clean_text'].tolist()
    true_vecs = tfidf.transform(true_texts)
    
    scores = cosine_similarity(vec, true_vecs)[0]
    top_idx = scores.argsort()[-5:][::-1]
    
    for i, idx in enumerate(top_idx):
        print(f"{i+1}. {true_texts[idx][:300]}...")
        print("-" * 80)


In [29]:
#CELL 13 â€” User Input
news = input("Enter a news article:\n\n")
predict_news(news)


Enter a news article:

 NASA confirms that an asteroid will hit Earth next week causing mass destruction.



ðŸ”´ The news is MOST LIKELY FAKE.

Similar TRUE news articles:

1. washington reuters u president barack obama highlighting pledge send people mar tuesday announced publicprivate effort build habitat could help human live longterm far earth â€œwe working commercial partner build new habitat sustain transport astronaut longduration mission deep space mission teach u h...
--------------------------------------------------------------------------------
2. reuters trio u japanese russian astronaut arrived international space station tuesday nasa tv broadcast showed commander anton shkaplerov roscosmos flight engineer norishige kanai japan aerospace exploration agency scott tingle nasa docked soyuz spacecraft mile km earth gmt docking completes twoday ...
--------------------------------------------------------------------------------
3. nairobi reuters deadlock allocation leadership post forced kenya parliament unscheduled threeweek recess lawmaker said wednesday indicating protracted po