In [1]:
!pip install nltk scikit-learn pandas
import nltk
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
from google.colab import files

uploaded = files.upload()


Saving True.csv to True.csv
Saving Fake.csv to Fake.csv


In [3]:
import os
os.listdir()


['.config', 'Fake.csv', 'True.csv', 'sample_data']

In [4]:
import pandas as pd

fake = pd.read_csv('/content/Fake.csv')[['title', 'text']]
fake['label'] = 0

real = pd.read_csv('/content/True.csv')[['title', 'text']]
real['label'] = 1

df = pd.concat([fake, real], ignore_index=True)
df.dropna(inplace=True)

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', str(text).lower())
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.98      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [6]:
import pickle

with open('/content/news_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('/content/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [9]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_input(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Sample input
sample_text = "The Reserve Bank of India has announced a 0.25% cut in the repo rate."

# Preprocess
cleaned = clean_input(sample_text)

# Vectorize
vec = vectorizer.transform([cleaned])

# Predict
prediction = model.predict(vec)[0]

# Output result
print("Prediction (1=Real, 0=Fake):", prediction)


Prediction (1=Real, 0=Fake): 0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from google.colab import files
files.download('/content/news_model.pkl')
files.download('/content/vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load data
df_fake = pd.read_csv("Fake.csv")
df_real = pd.read_csv("True.csv")

df_fake['label'] = 0
df_real['label'] = 1

df = pd.concat([df_fake[['title', 'text', 'label']], df_real[['title', 'text', 'label']]])
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle

# Clean text
def clean_text(text):
    text = str(text)
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)

df['text'] = df['title'] + " " + df['text']  # Combine title and text
df['cleaned_text'] = df['text'].apply(clean_text)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000)  # increased features
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save model and vectorizer
pickle.dump(model, open("news_model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9874164810690423
F1 Score: 0.9868190831680859
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4704
           1       0.98      0.99      0.99      4276

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [12]:
print("Original:", sample)
print("Cleaned :", cleaned)


Original: The Reserve Bank of India has announced a 0.25% cut in the repo rate.
Cleaned : reserve bank india announced cut repo rate


In [13]:
print("Vector shape:", vec.shape)
print("Non-zero entries in vector:", vec.nnz)


Vector shape: (1, 10000)
Non-zero entries in vector: 6


In [14]:
sample = "RBI Announcement: The Reserve Bank of India has announced a 0.25% cut in the repo rate."


In [15]:
# Try with actual real data from training
real_sample = df[df['label'] == 1].iloc[0]['text']
cleaned = clean_input(real_sample)
vec = vectorizer.transform([cleaned])
print("Prediction (1=Real, 0=Fake):", model.predict(vec)[0])


Prediction (1=Real, 0=Fake): 1


In [16]:
import pickle

pickle.dump(model, open("news_model.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))


In [19]:
from google.colab import files

files.download("news_model.pkl")
files.download("vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>