In [None]:
# ---------- Backend (Model Training and Evaluation) ----------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

nltk.download('stopwords')

# Load dataset
df = pd.read_csv(r"D:\Chinmay\ML PROJECTS\Fake News Detection\Shuffled_SAMPLE.csv")  # Update with your dataset path
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle dataset

# Text preprocessing
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Tokenize
    text = [ps.stem(word) for word in text if word not in stop_words]  # Stemming and remove stopwords
    return ' '.join(text)

df['cleaned_text'] = df['text'].apply(clean_text)

# Split dataset
X = df['cleaned_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Create pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('clf', LogisticRegression())
])

# Hyperparameter tuning
parameters = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['liblinear', 'saga'],
    'clf__max_iter': [500, 1000]
}

grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.2f}")

# Save model and vectorizer
joblib.dump(best_model, 'fake_news_model.pkl')

# Evaluation
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Real', 'Fake'], 
            yticklabels=['Real', 'Fake'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig('static/confusion_matrix.png')
plt.close()

# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('static/roc_curve.png')
plt.close()

# Metrics Visualization
report = classification_report(y_test, y_pred, output_dict=True)
metrics = ['precision', 'recall', 'f1-score']
classes = ['Real', 'Fake']

plt.figure(figsize=(10,6))
for i, cls in enumerate(classes):
    plt.bar(np.arange(len(metrics)) + i*0.2, 
            [report[cls][metric] for metric in metrics],
            width=0.2, label=cls)

plt.xticks(np.arange(len(metrics)) + 0.2, metrics)
plt.ylabel('Score')
plt.ylim(0.8, 1.0)
plt.title('Classification Metrics')
plt.legend()
plt.savefig('static/metrics.png')
plt.close()

# ---------- Frontend (Flask Application) ----------
from flask import Flask, render_template, request

app = Flask(__name__)

# Load trained model
model = joblib.load('fake_news_model.pkl')

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    if request.method == 'POST':
        text = request.form['news_text']
        cleaned_text = clean_text(text)
        prediction = model.predict([cleaned_text])[0]
        result = 'Fake' if prediction == 1 else 'Real'
        return render_template('result.html', 
                             prediction=result,
                             confusion_matrix='confusion_matrix.png',
                             roc_curve='roc_curve.png',
                             metrics='metrics.png')

if __name__ == '__main__':
    app.run(debug=True)

In [69]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [71]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CHINMAY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [73]:
df = pd.read_csv(r"D:\Chinmay\ML PROJECTS\Fake News Detection\Shuffled_SAMPLE.csv")  # Update with your dataset path
df = df.sample(frac=1).reset_index(drop=True) 

In [75]:
print("Number of NaN values in labels:", df['Label'].isna().sum())

Number of NaN values in labels: 0


In [77]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [79]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Tokenize
    text = [ps.stem(word) for word in text if word not in stop_words]  # Stemming and remove stopwords
    return ' '.join(text)

df['cleaned_text'] = df['Headline'].apply(clean_text)

In [81]:
# Split dataset
X = df['cleaned_text']
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [83]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('clf', LogisticRegression())
])

In [85]:
parameters = {
    'tfidf__ngram_range': [(1,1), (1,2)],
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['liblinear', 'saga'],
    'clf__max_iter': [500, 1000]
}

In [87]:
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.2f}")

Best Parameters: {'clf__C': 10, 'clf__max_iter': 500, 'clf__solver': 'liblinear', 'tfidf__ngram_range': (1, 2)}
Best Accuracy: 0.89


In [88]:
joblib.dump(best_model, 'fake_news_model.pkl')


['fake_news_model.pkl']

In [91]:
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:,1]

In [93]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

        FAKE       0.90      0.86      0.88        21
        REAL       0.89      0.92      0.91        26

    accuracy                           0.89        47
   macro avg       0.89      0.89      0.89        47
weighted avg       0.89      0.89      0.89        47



In [95]:
df['Label'] = df['Label'].fillna('REAL')  # First replace NaN with 'REAL'
df['Label'] = df['Label'].map({'REAL': 0, 'FAKE': 1})  # Then convert to binary


In [97]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Real', 'Fake'], 
            yticklabels=['Real', 'Fake'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
# plt.savefig('static/confusion_matrix.png')
plt.close()

In [99]:
# Convert string labels to numerical (0/1)
df['Label'] = df['Label'].map({'REAL': 0, 'FAKE': 1})  # Or whatever your labels are

# Then proceed with your existing code
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [101]:
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.

In [49]:
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('static/roc_curve.png')
plt.close()

NameError: name 'fpr' is not defined

<Figure size 640x480 with 0 Axes>

In [None]:
report = classification_report(y_test, y_pred, output_dict=True)
metrics = ['precision', 'recall', 'f1-score']
classes = ['Real', 'Fake']

In [None]:
plt.figure(figsize=(10,6))
for i, cls in enumerate(classes):
    plt.bar(np.arange(len(metrics)) + i*0.2, 
            [report[cls][metric] for metric in metrics],
            width=0.2, label=cls)

In [None]:

plt.xticks(np.arange(len(metrics)) + 0.2, metrics)
plt.ylabel('Score')
plt.ylim(0.8, 1.0)
plt.title('Classification Metrics')
plt.legend()
plt.savefig('static/metrics.png')
plt.close()

In [None]:
# from flask import Flask, render_template, request

# app = Flask(__name__)


In [None]:
model = joblib.load('fake_news_model.pkl')


In [1]:
import pandas as pd
import numpy as np
import string
import re


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [2]:
df = pd.read_csv(r"D:\Chinmay\ML PROJECTS\Fake News Detection\News_Dataset\SAMPLE.csv")  # replace with your file name
df = df[['Headline', 'Label']]  # Adjust if your column names differ
df.dropna(inplace=True)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd4 in position 7963: invalid continuation byte

In [None]:
df['Label'] = df['Label'].map({'FAKE': 0, 'REAL': 1})


In [None]:
print(df['Label'].value_counts())


In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    df['text'] = df['headline_text'].apply(clean_text)



In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['Label'], test_size=0.2, random_state=42, stratify=df['Label'])

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

In [None]:
y_pred = model.predict(X_test_vec)


In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
def predict_news(news_text):
    news_tfidf = vectorizer.transform([news_text])
    prediction = model.predict(news_tfidf)
    return "TRUE" if prediction[0] == 1 else "FALSE"



In [None]:
print(predict_news("India Launches 5G Services Nationwide with Focus on Rural Connectivity"))

print(predict_news("India acheives zero Poverty"))
