Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords in English
# The corpus file id is lowercase 'english' (the same id stemming() uses); the
# capitalised spelling only resolves on case-insensitive filesystems.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-processing

In [4]:
# Load the dataset into a pandas DataFrame.
# The path is relative, so the CSV is looked up in the current working directory.
news_dataset = pd.read_csv('fake_news_dataset.csv')

In [5]:
# (rows, columns) of the loaded frame
news_dataset.shape

(20000, 7)

In [8]:
# Preview the first five rows (head() defaults to n=5).
news_dataset.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [6]:
# Per-column count of missing values (isna is the canonical name; isnull is its alias).
news_dataset.isna().sum()

title          0
text           0
date           0
source      1000
author      1000
category       0
label          0
dtype: int64

In [7]:
# Replace every null value, in all columns, with an empty string so the
# string concatenation further down never has to deal with NaN.
news_dataset = news_dataset.fillna('')

In [8]:
# No missing values remain now that they have been replaced with empty strings.
news_dataset.isna().sum()

title       0
text        0
date        0
source      0
author      0
category    0
label       0
dtype: int64

In [10]:
# Encode the string labels as integers: 'real' -> 0, 'fake' -> 1.
news_dataset['label'] = news_dataset['label'].replace({'real': 0, 'fake': 1}).astype(int)

In [15]:
# Preview the first five rows again, now with the integer-encoded label column.
news_dataset.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,0
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,1
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,1
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,1
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,1


In [11]:
# Build the model input text: "<author> <title>", joined with a single space.
news_dataset['content'] = news_dataset['author'].str.cat(news_dataset['title'], sep=' ')

In [17]:
# Preview the combined author + title strings.
print(news_dataset['content'])

0                     Paula George Foreign Democrat final.
1          Joseph Hill To offer down resource great point.
2              Julia Robinson Himself church myself carry.
3                Mr. David Foster DDS You unit its should.
4        Austin Walker Billion believe employee summer ...
                               ...                        
19995                         Gary Miles House party born.
19996    Maria Mcbride Though nation people maybe price...
19997     Kristen Franklin Yet exist with experience unit.
19998                  David Wise School wide itself item.
19999        James Peterson Offer chair cover senior born.
Name: content, Length: 20000, dtype: object


In [12]:
# Split the frame into features (every column except 'label') and the target.
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [13]:
# Show the feature frame followed by the label column.
print(X)
print(Y)

                                       title  \
0                    Foreign Democrat final.   
1        To offer down resource great point.   
2               Himself church myself carry.   
3                       You unit its should.   
4       Billion believe employee summer how.   
...                                      ...   
19995                      House party born.   
19996  Though nation people maybe price box.   
19997        Yet exist with experience unit.   
19998               School wide itself item.   
19999         Offer chair cover senior born.   

                                                    text        date  \
0      more tax development both store agreement lawy...  2023-03-10   
1      probably guess western behind likely next inve...  2022-05-25   
2      them identify forward present success risk sev...  2022-09-01   
3      phone which item yard Republican safe where po...  2023-02-07   
4      wonder myself fact difficult course forget exa...  2023-

Stemming:

Stemming is the process of reducing a word to its root form (its stem).

Example: acting, acted, acts --> act

In [14]:
# One shared Porter stemmer instance; stemming() below calls port_stem.stem().
port_stem = PorterStemmer()

In [15]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# per-token filter rebuilds the word list for every token and then does a linear
# list search; a frozenset gives constant-time membership tests instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated Porter stems.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped, and
    each remaining token is stemmed with the notebook-level ``port_stem``.

    Args:
        content: The raw text to normalise (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [16]:
# Replace each raw content string with its stemmed, stop-word-free form.
news_dataset['content'] = news_dataset['content'].map(stemming)

In [18]:
# Preview the content column after stemming.
print(news_dataset['content'])

0                      paula georg foreign democrat final
1                   joseph hill offer resourc great point
2                             julia robinson church carri
3                                 mr david foster dd unit
4             austin walker billion believ employe summer
                               ...                       
19995                           gari mile hous parti born
19996    maria mcbride though nation peopl mayb price box
19997              kristen franklin yet exist experi unit
19998                         david wise school wide item
19999         jame peterson offer chair cover senior born
Name: content, Length: 20000, dtype: object


In [19]:
# Pull the stemmed text and the labels out of the frame as NumPy arrays.
Y = news_dataset['label'].to_numpy()
X = news_dataset['content'].to_numpy()

In [20]:
# Inspect the array of stemmed content strings.
print(X)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'kristen franklin yet exist experi unit' 'david wise school wide item'
 'jame peterson offer chair cover senior born']


In [21]:
# Inspect the label array (0 = real, 1 = fake, per the encoding applied earlier).
print(Y)

[0 1 1 ... 0 1 1]


In [22]:
# Shape of the label array.
Y.shape

(20000,)

In [23]:
# converting the textual data to numerical data
# The matrix is derived from the DataFrame column rather than from X itself, so
# this cell can be re-run: the previous `X = vectorizer.transform(X)` replaced
# the text array with a sparse matrix and failed on a second execution.
# NOTE(review): the IDF weights are fitted on the full corpus before the
# train/test split, so test documents influence the features; consider
# fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['content'].values)

In [24]:
# Inspect the TF-IDF matrix returned by the vectorizer.
print(X)

  (0, 553)	0.39378500362567437
  (0, 729)	0.38563113779991653
  (0, 760)	0.4303325215423282
  (0, 820)	0.4702667357326091
  (0, 1607)	0.5384098017728571
  (1, 855)	0.4075140492253222
  (1, 945)	0.4349807003583625
  (1, 1106)	0.39395595700897607
  (1, 1544)	0.4059231913258755
  (1, 1650)	0.40644929622403225
  (1, 1761)	0.3994347584771229
  (2, 340)	0.4619493340010572
  (2, 402)	0.46849498889312446
  (2, 1116)	0.5803950203356232
  (2, 1802)	0.4798508920150325
  (3, 529)	0.3893432059572742
  (3, 535)	0.46879786583185773
  (3, 765)	0.5331103685464007
  (3, 1474)	0.36176526198760245
  (3, 2168)	0.4621251552704844
  (4, 128)	0.4431492340687908
  (4, 182)	0.39333180511748317
  (4, 209)	0.39858312187055117
  (4, 655)	0.41121899358509134
  (4, 2054)	0.3943487167650092
  :	:
  (19996, 1316)	0.38290410625708515
  (19996, 1345)	0.3396980161532583
  (19996, 1349)	0.44657233844084904
  (19996, 1494)	0.3011404108931225
  (19996, 1617)	0.338246234612171
  (19996, 1677)	0.31677886528565996
  (19996, 21

Splitting the dataset into training & test data

In [25]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Training the Model: Logistic Regression

In [26]:
# Logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()

# Fit on the training split; the test split is kept for evaluation.
model.fit(X_train, Y_train)

Evaluation: accuracy score

In [34]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first,
# matching the argument order used by the tree-model evaluation in this notebook.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [35]:
# Report the training accuracy.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.6490625


In [36]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first,
# matching the argument order used by the tree-model evaluation in this notebook.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [37]:
# Report the held-out accuracy.
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.4945


Making a Predictive System

In [38]:
# Classify a single held-out sample (row 3 of the test matrix).
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

# Label 0 means real; anything else is reported as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[0]
The news is Real


In [39]:
# Ground-truth label for the same test row, for comparison with the prediction.
print(Y_test[3])

0


In [34]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import time

In [37]:
# Re-vectorize the stemmed text, capping the vocabulary at 5000 terms to
# reduce memory usage; labels are re-read from the frame alongside.
Y = news_dataset['label'].to_numpy()
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(news_dataset['content'].to_numpy())

In [38]:
# Check label distribution for imbalance: counts of each encoded label value.
print("Label Distribution:")
print(news_dataset['label'].value_counts())

Label Distribution:
label
1    10056
0     9944
Name: count, dtype: int64


In [39]:
# Use subset for faster testing (20% of data).
# The subset is the leading rows of X / Y, not a random sample.
subset_size = int(0.2 * len(news_dataset))
X_subset, Y_subset = X[:subset_size], Y[:subset_size]
X_train, X_test, Y_train, Y_test = train_test_split(
    X_subset,
    Y_subset,
    test_size=0.2,
    stratify=Y_subset,
    random_state=2,
)

In [40]:
# Candidate tree ensembles, keyed by the display name used in the report loop.
models = dict(
    RandomForest=RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        n_jobs=-1,
        class_weight='balanced',
        random_state=2,
    ),
    GradientBoosting=GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=2,
    ),
)

In [42]:
# Train and evaluate each model
# NOTE: the loop variable rebinds the notebook-level name `model`, so after this
# cell `model` refers to the last classifier in `models` rather than the
# logistic regression fitted earlier.
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()

    # Train the model
    model.fit(X_train, Y_train)

    # Predictions on both splits
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Stop the clock here so only fit + predict are timed, not the printing.
    elapsed = time.perf_counter() - start_time

    train_accuracy = accuracy_score(Y_train, train_pred)
    test_accuracy = accuracy_score(Y_test, test_pred)

    # Print results
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Time Taken: {elapsed:.2f} seconds")
    print(f"Classification Report:\n{classification_report(Y_test, test_pred)}")


Training RandomForest...
Training Accuracy: 0.8419
Test Accuracy: 0.5112
Time Taken: 0.28 seconds
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.55      0.53       400
           1       0.51      0.47      0.49       400

    accuracy                           0.51       800
   macro avg       0.51      0.51      0.51       800
weighted avg       0.51      0.51      0.51       800


Training GradientBoosting...
Training Accuracy: 0.7031
Test Accuracy: 0.4850
Time Taken: 1.17 seconds
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.73      0.59       400
           1       0.47      0.24      0.32       400

    accuracy                           0.48       800
   macro avg       0.48      0.48      0.45       800
weighted avg       0.48      0.48      0.45       800

