In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Download the NLTK stop-word corpus so that stopwords.words() has data to read.
import nltk
nltk.download('stopwords')



In [None]:
# Show the English stop-word list provided by NLTK.
print(stopwords.words('english'))

In [None]:
from google.colab import files
import pandas as pd

# Upload the files through the Colab upload helper; the result is kept in `uploaded`.
# NOTE(review): google.colab is only importable inside Google Colab. Presumably
# Fake.csv and True.csv (read further down) are the files expected here - confirm.
uploaded = files.upload()

In [None]:
# Second upload prompt; this rebinds `uploaded`, replacing the previous cell's result.
# NOTE(review): presumably used to upload the second CSV - confirm it is still needed.
uploaded = files.upload()

Data Preprocessing

In [None]:
# Load the two source CSVs; the relative paths mean both files must be in the
# current working directory.
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')


In [None]:
# Tag every row of each frame with its class label.
fake["label"] = 0   # 0 = Fake
true["label"] = 1   # 1 = Real


In [None]:
# Stack the fake and real articles into one frame with a fresh 0..n-1 index
df = pd.concat([fake, true], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Number of rows per class (0 = fake, 1 = real).
df['label'].value_counts()

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Column names of the combined frame.
df.columns

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Build one text field per article from its title and body.
# Missing titles/bodies are replaced by empty strings first: NaN + str is NaN,
# so a single missing field would otherwise wipe out the whole row's text.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

In [None]:
# Separate the target column from the remaining columns
y = df['label']
X = df.drop(columns='label')

In [None]:
# Display the feature frame (every column except the label).
X

In [None]:
# Display the label column.
y

Stemming: the process of reducing a word to its root form (for example, "running" becomes "run").

In [None]:
# Porter stemmer instance.
# NOTE(review): the following cell assigns `port_stem` again, so this cell looks redundant.
port_stem = PorterStemmer()

In [None]:
# Pattern matching any character that is not an ASCII letter (compiled once, reused per row)
letters_only = re.compile('[^a-zA-Z]')

# English stop words held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Stemmer shared by every call to the cleaning function
port_stem = PorterStemmer()

def stemming_fast_vectorized(text):
    """Clean one document and return its stemmed tokens joined by spaces.

    Despite the name, this processes a single string per call. It relies on
    the module-level `letters_only` pattern, `stop_words` collection and
    `port_stem` stemmer.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop tokens found in `stop_words`, stem the rest.

    Parameters
    ----------
    text : str, None or float NaN
        Raw document text. None and NaN are treated as an empty document.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces ('' if nothing remains).
    """
    # Missing values (None, or the float NaN pandas uses for empty cells) contain
    # no words; returning '' avoids a TypeError from the regex on non-string input.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # 1. Keep only letters and lowercase
    text = letters_only.sub(' ', text).lower()

    # 2. Split, remove stopwords, stem
    words = [port_stem.stem(word) for word in text.split() if word not in stop_words]

    return ' '.join(words)

In [None]:
# Replace each article's text with its cleaned, stemmed form (one function call per row)
df['content'] = df['content'].map(stemming_fast_vectorized)

In [None]:
# Word cloud built from the cleaned text of every article labelled fake (label == 0)
fake_text = " ".join(df.loc[df['label'] == 0, 'content'])
fake_wc = WordCloud(background_color='white', width=800, height=400).generate(fake_text)

fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(fake_wc, interpolation='bilinear')
ax.axis('off')
ax.set_title("Fake News Word Cloud")
plt.show()



In [None]:

# Word cloud built from the cleaned text of every article labelled real (label == 1)
real_text = " ".join(df.loc[df['label'] == 1, 'content'])
real_wc = WordCloud(background_color='white', width=800, height=400).generate(real_text)

fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(real_wc, interpolation='bilinear')
ax.axis('off')
ax.set_title("Real News Word Cloud")
plt.show()

In [None]:
# Inspect the cleaned text column.
print(df['content'])

In [None]:
# Rebind X and y to the model inputs: cleaned text as features, label as target.
X = df['content'].values
y = df['label'].values

In [None]:
# Converting the textual data to numerical data (TF-IDF features).
#
# The vocabulary and IDF weights are learned from the training rows only, so
# no statistics from the held-out test documents leak into the features.
# The row indices are split with the same parameters as the train_test_split
# call that follows (test_size=0.2, stratify=y, random_state=2). The chosen rows
# depend only on y and the seed, so these are exactly the rows that end up in
# X_train. Keep the two calls in sync if the split parameters ever change.
train_idx, _test_idx = train_test_split(
    np.arange(len(y)), test_size=0.2, stratify=y, random_state=2
)

vectorizer = TfidfVectorizer()
vectorizer.fit(X[train_idx])

# Transform every document (train and test rows) with the train-fitted vectorizer
X = vectorizer.transform(X)

In [None]:
# Inspect the TF-IDF feature matrix.
print(X)

In [None]:
# Splitting the data: 20% is held out for testing, stratify=y keeps the class
# proportions equal in both parts, and random_state=2 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

Logistic Regression

In [None]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train , y_train)

Evaluation


In [None]:
# Accuracy on the training data.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is symmetric,
# but keeping the documented order prevents mistakes if an asymmetric metric
# is substituted later.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)

In [None]:
# Report the fraction of training rows the model classifies correctly.
print("accuracy score of training data : ", training_data_accuracy )

In [None]:
# Accuracy on the held-out test data.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is symmetric,
# but keeping the documented order prevents mistakes if an asymmetric metric
# is substituted later.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [None]:
# Report the fraction of test rows the model classifies correctly.
print("accuracy score of test data : ", test_data_accuracy )


In [None]:




import pprint

# Define models with fixed/default parameters
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, C=1, solver='liblinear'),
    "Multinomial NB": MultinomialNB(alpha=1.0)
}

# Test-set metrics per model, keyed by the display names used in `models`
results = {}

# A single axes so that every model's ROC curve lands on the same plot
fig, ax = plt.subplots()

# The loop variable is deliberately not called `model`: that name already holds
# the classifier fitted in the earlier cells, and rebinding it here would make a
# re-run of those evaluation cells silently score a different estimator.
for name, clf in models.items():
    print(f"Training {name}...")
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # ROC-AUC is computed from the predicted probability of class 1
    # (the second column returned by predict_proba)
    y_prob = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results[name] = {"Accuracy": acc, "AUC": auc}

    # Plot ROC curve; the thresholds are not needed. A named throwaway is used
    # instead of `_` so IPython's last-output variable is left alone.
    fpr, tpr, _thresholds = roc_curve(y_test, y_prob)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.2f})")

# ROC curve styling; the dashed diagonal is the chance-level reference line
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
plt.show()

pprint.pprint(results)
