In [None]:
import pickle
import string
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

Load the Dataset

In [None]:
# Machine-specific location of the WELFake CSV; change this when running elsewhere.
DATA_PATH = "C:\\Users\\HP\\Downloads\\Datasets\\WELFake_Dataset.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Peek at the article body stored under index label 0.
data['text'][0]

In [None]:
# 'Unnamed: 0' is dropped because it is not used by anything below.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError as the in-place drop did.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Summarize the columns: dtypes and non-null counts.
data.info()

In [None]:
# Count missing values per column before any filling is done.
data.isnull().sum()

Handle Null Values and Feature Separation

In [None]:
# Replace every missing cell with a single space so the text columns are always strings.
data = data.fillna(' ')

# Separate the two text inputs from the target column.
features = data.loc[:, ['title', 'text']]
labels = data.loc[:, 'label']

In [None]:
# Re-check missing values per column now that nulls have been filled.
data.isnull().sum()

In [None]:
# Class balance: how many articles carry each label value.
label_Status = data['label'].value_counts()
label_values = label_Status.index    # the distinct label values
label_counts = label_Status.values   # number of articles per label value

# Donut chart of the label distribution. The title previously read
# "Spam & Ham Status", a leftover from a different notebook; this data is
# fake vs. real news.
figure = px.pie(data,
             values=label_counts,
             names=label_values, hole=.60,
             title="Fake vs. Real News Distribution")
figure.show()

Create Word Clouds

In [None]:
def plot_wordcloud(text, title):
    """Draw a word cloud of ``text`` on a new 10x5 figure titled ``title``.

    The figure is created and populated but not shown; the caller is expected
    to call ``plt.show()``.
    """
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')  # hide the axes around the image
    plt.title(title)

# This notebook treats label 0 as fake and label 1 as real.
is_fake = data['label'] == 0
is_real = data['label'] == 1

# Concatenate every article of a class into one space-separated string.
fake_text = ' '.join(data.loc[is_fake, 'text'])
real_text = ' '.join(data.loc[is_real, 'text'])

In [None]:
# Word cloud for the articles labelled fake.
plot_wordcloud(text=fake_text, title='Fake News')
plt.show()

In [None]:
# Word cloud for the articles labelled real.
plot_wordcloud(text=real_text, title='Real News')
plt.show()

Text Preprocessing

In [None]:
# Names of the input-text column and the target column used by the cells below.
text_column, label_column = 'text', 'label'

In [None]:
# `nltk` is imported with the other dependencies in the first cell.
# Download the stopword corpus if it is not already available locally.
nltk.download('stopwords')

# A set gives constant-time membership checks when filtering tokens.
stopword = set(stopwords.words('english'))
stopword

In [None]:
# Translation table that deletes every character in string.punctuation. It is
# built once so it is not recomputed for each document passed through .apply().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stop_words=None):
    """Strip punctuation and stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw document.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to the notebook-level
        ``stopword`` set when omitted.

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in order and with their
        original casing. Punctuation is deleted rather than replaced by a
        space, so "well-known" becomes "wellknown".
    """
    if stop_words is None:
        stop_words = stopword

    # Remove punctuation in one pass instead of a per-character Python loop.
    clean_words = text.translate(_PUNCTUATION_TABLE)

    # Stop-word comparison is case-insensitive; kept tokens are not lower-cased.
    return [word for word in clean_words.split() if word.lower() not in stop_words]

In [None]:
# Replace each raw article with its list of punctuation-free, non-stopword tokens.
# This overwrites the column in place, so the cell is not safe to run twice
# without reloading the data first.
data[text_column] = data[text_column].apply(preprocess_text)

In [None]:
# Inspect the text column after punctuation and stopword removal (token lists).
data[text_column]

Lemmatization

In [None]:
# Fetch the WordNet data that WordNetLemmatizer looks words up in.
nltk.download('wordnet')
# One shared lemmatizer instance, reused by lemmatize_text for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every token and join the results into one string.

    Parameters
    ----------
    text : list of str or str
        Tokens as produced by ``preprocess_text``. A plain string is also
        accepted and is split on whitespace first.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces.
    """
    # Iterating a str yields single characters, which would turn "cats dogs"
    # into "c a t s ..." (for example when this step is re-run on a column that
    # has already been joined). Split strings into words first.
    tokens = text.split() if isinstance(text, str) else text
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# Turn each token list back into a single lemmatized string.
# This overwrites the column in place, so the cell is not safe to run twice
# without redoing the preprocessing step first.
data[text_column] = data[text_column].apply(lemmatize_text)

In [None]:
# Inspect the text column after lemmatization (one string per article).
data[text_column]

TF-IDF Vectorization

In [None]:
# TF-IDF vectorizer with the library's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the text column and encode it.
# NOTE(review): this fit sees every row, including those later placed in the
# held-out test split.
x = vectorizer.fit_transform(data[text_column])
# Target values, row-aligned with x.
y = data[label_column]

In [None]:
# Show the repr of the TF-IDF matrix produced by fit_transform.
x

Split dataset

In [None]:
# Split the documents first (70% train / 30% test, fixed seed), then fit TF-IDF
# on the training rows only. Fitting the vectorizer on the whole corpus before
# splitting lets the vocabulary and IDF weights see the test articles, which
# leaks test information into the features. The same row count and
# random_state mean the same rows land in each split as before.
# `y` (defined next to the vectorizer) is used instead of the earlier `labels`
# so that features and targets come from the same place.
text_train, text_test, ytrain, ytest = train_test_split(
    data[text_column], y, test_size=0.3, random_state=42
)

# Refit the existing vectorizer so later `vectorizer.transform(...)` calls use
# exactly the vocabulary the models are trained on.
xtrain = vectorizer.fit_transform(text_train)
xtest = vectorizer.transform(text_test)

Evaluate Naive Bayes

In [None]:
def evaluate_model(model, xtest, ytest):
    """Score a fitted binary classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    xtest
        Feature matrix of the evaluation rows.
    ytest
        True labels for ``xtest``.

    Returns
    -------
    dict
        ``'Accuracy'`` and ``'Confusion Matrix'`` as before, plus
        ``'ROC AUC'`` and ``'PR AUC'``, which were previously computed and
        then discarded.
    """
    y_pred = model.predict(xtest)

    # Second column of predict_proba: probability of the second class in
    # model.classes_ (label 1 when the labels are 0/1).
    prob = model.predict_proba(xtest)[:, 1]

    # Area under the precision-recall curve, integrated over recall.
    precision, recall, _ = precision_recall_curve(ytest, prob)

    return {
        'Accuracy': accuracy_score(ytest, y_pred),
        'Confusion Matrix': confusion_matrix(ytest, y_pred),
        'ROC AUC': roc_auc_score(ytest, prob),
        'PR AUC': auc(recall, precision),
    }

Build Naive Bayes Models

In [None]:
# Multinomial Naive Bayes with smoothing alpha=1.0; class priors are learned
# from the training labels (fit_prior=True, no explicit class_prior).
mnb_model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
# Train on the TF-IDF features of the training split.
mnb_model.fit(xtrain, ytrain)

In [None]:
from sklearn.metrics import precision_recall_curve, auc

In [None]:
# Score the Naive Bayes model on the held-out split and display the metrics dict.
nb_results = evaluate_model(mnb_model, xtest, ytest)
nb_results

Apply Logistic Regression

In [None]:
# Logistic regression; max_iter=1000 sets the solver's iteration cap,
# presumably so it converges on the TF-IDF features.
lr_model = LogisticRegression(max_iter=1000)

In [None]:
# Train the logistic-regression model on the training split.
lr_model.fit(xtrain,ytrain)

Evaluate Logistic Regression

In [None]:
# Score the logistic-regression model on the held-out split and display the metrics dict.
lr_results = evaluate_model(lr_model,xtest,ytest)
lr_results

Visualization

In [None]:
def plot_histogram(metric_values, model_names, metric_name):
    """Show a bar chart of one metric across models, with each bar's value on top.

    ``metric_values`` and ``model_names`` are parallel sequences;
    ``metric_name`` labels the y-axis and appears in the title.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(model_names, metric_values)
    ax.set_xlabel('Models')
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name} for Different Models')

    # Write the value, rounded to three decimals, centred above each bar.
    for rect in bars:
        height = rect.get_height()
        x_center = rect.get_x() + rect.get_width()/2
        ax.text(x_center, height, round(height, 3), ha='center', va='bottom', color='black', fontweight='bold')

    plt.show()

# Compare the two classifiers on accuracy; both lists are in the same order.
model_names = ['Multinomial Naive Bayes', 'Logistic Regression']
accuracy_values = [results['Accuracy'] for results in (nb_results, lr_results)]
plot_histogram(accuracy_values, model_names, 'Accuracy')

Input Random Text to Make a Prediction

In [None]:
# The trained classifiers to query in the prediction loop below.
models = [mnb_model, lr_model]

In [None]:
# Read one piece of text from the user (blocks until input is entered).
random_text = input()

# Apply the same pipeline that was used on the training text.
preprocessed_text = preprocess_text(random_text) # strip punctuation and stopwords -> list of tokens
lemmatized_text = lemmatize_text(preprocessed_text) # lemmatize the tokens and join them into one string
# transform (not fit_transform): reuse the already-fitted vocabulary and IDF weights.
text_vector = vectorizer.transform([lemmatized_text])

In [None]:
# Tokens left after punctuation and stopword removal.
preprocessed_text

In [None]:
# The lemmatized tokens joined back into a single string.
lemmatized_text

In [None]:
# Repr of the TF-IDF encoding of the entered text.
text_vector

In [None]:
# Dense view of the same vector: one row, one column per vocabulary term.
text_vector.toarray()

In [None]:
# Ask every trained classifier for its label on the entered text.
for clf in models:
    predicted_label = clf.predict(text_vector)
    print(f"Model: {type(clf).__name__}")
    print("Prediction:", predicted_label)
    print('\n')

Saving the Model

In [None]:
# `pickle` is imported with the other dependencies in the first cell.
# Machine-specific output locations; change these when running elsewhere.
model_file_path = "C:\\Users\\HP\\Desktop\\Fake_News_detection_NLP\\FND.pkl"
# The classifier only accepts TF-IDF vectors, so the fitted vectorizer is saved
# next to it. Without it the reloaded model cannot score raw text.
vectorizer_file_path = "C:\\Users\\HP\\Desktop\\Fake_News_detection_NLP\\FND_vectorizer.pkl"

# Save the model to the file
with open(model_file_path, 'wb') as model_file:
    pickle.dump(lr_model, model_file)

# Save the fitted vectorizer alongside the model
with open(vectorizer_file_path, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

Load Model

In [None]:
# Read the pickled logistic-regression model back from model_file_path.
# pickle.load can execute code embedded in the file, so only load pickles
# that you created yourself.
with open(model_file_path, 'rb') as model_file:
    loaded_LR = pickle.load(model_file)