In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import os
from joblib import Parallel, delayed

In [19]:
# Ensure NLTK data is downloaded
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)


True

In [20]:
# Load the true and false datasets
true_data = pd.read_csv('C:\\Users\\HP\\OneDrive\\Desktop\\Mchine Learning Projects\\Fake_News_Detection\\data\\True.csv')
false_data = pd.read_csv('C:\\Users\\HP\\OneDrive\\Desktop\\Mchine Learning Projects\\Fake_News_Detection\\data\\Fake.csv')


In [21]:
# Add labels: 1 for true, 0 for false
true_data['label'] = 1
false_data['label'] = 0


In [22]:
# Combine the datasets
data = pd.concat([true_data, false_data], ignore_index=True)


In [23]:
# Combine title and text for preprocessing
data['text'] = data['title'] + " " + data['text']


In [24]:
# Preprocess text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Use parallel processing to speed up preprocessing
data['clean_text'] = Parallel(n_jobs=-1)(delayed(preprocess)(text) for text in data['text'])


In [26]:
# Feature extraction using TF-IDF with bigrams
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(data['clean_text'])
y = data['label']


In [27]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Hyperparameter tuning using Grid Search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # 'liblinear' supports both 'l1' and 'l2' penalties
}

log_reg = LogisticRegression()

grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [29]:
# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [30]:
# Evaluate the best model
print("Best Model Accuracy on Training Data:", best_model.score(X_train, y_train) * 100)
print("Best Model Accuracy on Test Data:", best_model.score(X_test, y_test) * 100)

y_pred = best_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred) * 100)
print(classification_report(y_test, y_pred))



Best Model Accuracy on Training Data: 100.0
Best Model Accuracy on Test Data: 99.6881959910913
Accuracy: 99.6881959910913
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4650
           1       1.00      1.00      1.00      4330

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [38]:
# y_pred = best_model.predict(input())