<a href="https://colab.research.google.com/github/Feven-Belay/Fake-News-Prediction-ML/blob/main/Fake_News_Prediction_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
import re #used to search text in a document
import nltk # natural language toolkit
from nltk.corpus import stopwords #corpus is the content of the text, stopwords are words that adds no value to the context of the data
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer # convert text to numbers(features)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline



In [5]:
# Downloading NLTK resources
# 'stopwords' backs the nltk.corpus.stopwords reader used for token filtering;
# 'wordnet' is fetched for WordNetLemmatizer (presumably its backing corpus — confirm).
# The return value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Read the training CSV into a dataframe
# NOTE(review): the path is absolute and Colab-specific; adjust when running elsewhere.
news_dataset = pd.read_csv(filepath_or_buffer='/content/train.csv')

In [17]:
# Dimensions of the dataframe as a (rows, columns) tuple.
# NOTE(review): this cell's execution count is later than the cell that adds the
# `content` column, so the column count may differ on a fresh top-to-bottom run — confirm.
news_dataset.shape

(20800, 6)

In [18]:
# Preview the top five records of the dataframe
news_dataset.head(n=5)

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,darrell lucus house dem aide even see comey le...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,daniel j flynn flynn hillary clinton big woman...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,consortiumnews com truth might get fired
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,jessica purkiss civilian killed single u airst...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,howard portnoy iranian woman jailed fictional ...


In [7]:
# Replace every missing value (in all columns) with an empty string so the
# string concatenation performed later never encounters NaN.
news_dataset = news_dataset.fillna(value='')

In [8]:
# Build the `content` feature: "<author> <title>", joined by a single space
news_dataset['content'] = (
    news_dataset['author']
    .add(' ')
    .add(news_dataset['title'])
)


In [9]:
# Function for text processing

# Compiled once: matches every character that is not an ASCII letter.
_NON_ALPHA_PATTERN = re.compile('[^a-zA-Z]')

# Lazily-built resources shared by every preprocess_text call.
_ENGLISH_STOPWORDS = None
_LEMMATIZER = None


def _text_resources():
    """Return the cached (stopword set, lemmatizer) pair, building it on first use.

    Building is deferred to the first call so that merely defining the function
    does not require the NLTK corpora to be present yet.
    """
    global _ENGLISH_STOPWORDS, _LEMMATIZER
    if _ENGLISH_STOPWORDS is None or _LEMMATIZER is None:
        # Build both before publishing either, so a failure leaves the cache empty
        # and the next call simply retries.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _LEMMATIZER, _ENGLISH_STOPWORDS = lemmatizer, stop_words
    return _ENGLISH_STOPWORDS, _LEMMATIZER


def preprocess_text(content):
    """Normalize a piece of text into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lowercased and split on whitespace;
      3. tokens found in the NLTK English stopword list are dropped;
      4. each remaining token is lemmatized with WordNetLemmatizer.

    The stopword check happens before lemmatization, so a token is filtered on
    its surface form, not on its lemma.

    Args:
        content: The raw text (a ``str``).

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives.
    """
    # The stopword list and lemmatizer are loop-invariant; the original re-read the
    # stopword list for every token and tested membership against a list.
    stop_words, lemmatizer = _text_resources()

    tokens = _NON_ALPHA_PATTERN.sub(' ', content).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


In [10]:
# Run preprocess_text over every entry of the `content` column, element by element,
# and store the cleaned text back in the same column.
news_dataset['content'] = news_dataset['content'].map(preprocess_text)

In [11]:

# Pull the features (cleaned text) and the targets (labels) out of the dataframe
X = news_dataset.loc[:, 'content'].values
Y = news_dataset.loc[:, 'label'].values


In [12]:
# Hold out 20% of the samples for testing; the split is stratified on the labels
# and seeded so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [13]:
# Two-stage model: TF-IDF features over unigrams and bigrams feeding a
# multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("classifier", MultinomialNB()),
    ]
)


In [14]:

# Fit the vectorizer and the classifier on the training partition
pipeline.fit(X=X_train, y=Y_train)

In [15]:
# Predict on both partitions with the fitted pipeline, then report accuracy for each.
# A large gap between the two figures would indicate overfitting.
train_preds, test_preds = map(pipeline.predict, (X_train, X_test))

print("Training Accuracy: ", accuracy_score(y_true=Y_train, y_pred=train_preds))
print("Test Accuracy: ", accuracy_score(y_true=Y_test, y_pred=test_preds))

Training Accuracy:  0.9958533653846153
Test Accuracy:  0.95625


In [16]:
# Per-class precision, recall and F1 on the held-out test partition
print(
    "\nClassification Report:\n",
    classification_report(y_true=Y_test, y_pred=test_preds),
)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      2077
           1       1.00      0.91      0.95      2083

    accuracy                           0.96      4160
   macro avg       0.96      0.96      0.96      4160
weighted avg       0.96      0.96      0.96      4160

