In [4]:
import pandas as pd

# Load both source files.
# NOTE(review): on_bad_lines="skip" silently discards every row the parser
# cannot read, so the loaded frames may be smaller than the files on disk —
# confirm that this loss is acceptable.
fake = pd.read_csv("Fake.csv", engine="python", encoding="utf-8", on_bad_lines="skip")
true = pd.read_csv("True.csv", engine="python", encoding="utf-8", on_bad_lines="skip")

# Add labels
fake["label"] = 0   # Fake news
true["label"] = 1   # Real news

# Combine datasets and keep only the columns that are written to news.csv
data = pd.concat([fake, true], axis=0, ignore_index=True)[["text", "label"]]

# Drop rows with no text: a missing value is written as an empty CSV field and
# read back as NaN, which is not a string.
data = data.dropna(subset=["text"])

# Shuffle with a fixed seed so that re-running this cell produces the same
# news.csv (and therefore the same downstream split and metrics).
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save final dataset
data.to_csv("news.csv", index=False)

print("Final dataset size:", data.shape)


Final dataset size: (14473, 2)


In [5]:
import nltk
# Download the NLTK "stopwords" corpus package; the stop-word list loaded
# later in this notebook is read from it.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import pandas as pd

# Read the combined dataset back from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="news.csv")
print(data.head(n=5))


                                                text  label
0  Donald Trump has a reputation in the media for...      0
1  BEDMINSTER, N.J. (Reuters) - U.S. President Do...      1
2  WASHINGTON (Reuters) - The Trump administratio...      1
3  NEW YORK (Reuters) - More Republicans now thin...      1
4  As the presidential race heats up and the coff...      0


In [8]:
import re
from nltk.corpus import stopwords

# English stop words from NLTK, stored as a set because clean_text tests every
# token for membership in it.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise raw article text into space-separated lowercase tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as "[video]";
      3. delete every character that is not a-z or whitespace;
      4. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None``, or the NaN pandas produces for an empty CSV field) is
            treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives cleaning or when ``text`` is not a string.
    """
    # Missing values reach this function as float NaN / None; calling
    # .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^a-z\s]', '', text)

    # split() with no arguments already collapses runs of whitespace, so no
    # separate whitespace-normalising substitution is needed.
    return " ".join(word for word in text.split() if word not in stop_words)


In [9]:
# Replace every article with its cleaned form, then preview the result.
data["text"] = data["text"].map(clean_text)
print(data.head(n=5))


                                                text  label
0  donald trump reputation media open accessible ...      0
1  bedminster nj reuters us president donald trum...      1
2  washington reuters trump administration top re...      1
3  new york reuters republicans think democrat hi...      1
4  presidential race heats coffers start pour gen...      0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features restricted to a 5000-term vocabulary.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so its vocabulary and IDF weights are computed with the
# test rows included — consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)

# Keep the sparse matrix that fit_transform returns. Converting it with
# .toarray() would allocate a dense rows-by-5000 array that is almost entirely
# zeros, and train_test_split, MultinomialNB and LogisticRegression all accept
# sparse input directly. Call X.toarray() on a slice if a dense view is needed.
X = vectorizer.fit_transform(data["text"])
y = data["label"]


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)


In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Score the Naive Bayes model on the held-out split.
y_pred_nb = nb_model.predict(X=X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_nb))
print(classification_report(y_true=y_test, y_pred=y_pred_nb))


Naive Bayes Accuracy: 0.9630397236614853
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1305
           1       0.96      0.97      0.97      1590

    accuracy                           0.96      2895
   macro avg       0.96      0.96      0.96      2895
weighted avg       0.96      0.96      0.96      2895



In [14]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the training split, allowing the
# solver up to 1000 iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=X_train, y=y_train)


In [15]:
# Score the logistic regression model on the held-out split.
y_pred_lr = lr_model.predict(X=X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print(classification_report(y_true=y_test, y_pred=y_pred_lr))


Logistic Regression Accuracy: 0.9951640759930915
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1305
           1       0.99      1.00      1.00      1590

    accuracy                           1.00      2895
   macro avg       1.00      0.99      1.00      2895
weighted avg       1.00      1.00      1.00      2895



In [16]:
import pickle

# Pickle the fitted logistic regression model and the fitted TF-IDF
# vectorizer, each to its own file.
for artifact, filename in (
    (lr_model, "fake_news_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)


In [17]:
def predict_news(news_text):
    """Classify one piece of news text as "Real News" or "Fake News".

    The text is cleaned with ``clean_text``, converted to features by the
    module-level ``vectorizer`` and scored by ``lr_model``. A predicted
    label of 1 maps to "Real News"; any other label maps to "Fake News".
    """
    features = vectorizer.transform([clean_text(news_text)])
    label = lr_model.predict(features)[0]
    if label == 1:
        return "Real News"
    return "Fake News"


# Smoke-test the predictor on a single made-up headline
sample_news = "Government announces new AI policy for education sector"
print(predict_news(news_text=sample_news))


Real News
