In [1]:
%pip install numpy pandas scikit-learn nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import issparse

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Abhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-Processing

In [5]:
import sys
import csv
csv.field_size_limit(sys.maxsize)

# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/Users/Abhi/fake-news-ml/articles_dataset.csv', engine='python', on_bad_lines='skip')

In [6]:
news_dataset.shape

(7000, 4)

In [7]:
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [8]:
# counting missing values in dataset
news_dataset.isnull().sum()

Unnamed: 0     0
title         49
text           6
label          0
dtype: int64

In [9]:
# replacing the null values with empty string
nuws_dataset = news_dataset.fillna('')

In [10]:
# merging the title and text
news_dataset['content'] = news_dataset['title']+' '+news_dataset['text']

In [11]:
print (news_dataset['content'])

0       LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1                                                     NaN
2       UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3       Bobby Jindal, raised Hindu, uses story of Chri...
4       SATAN 2: Russia unvelis an image of its terrif...
                              ...                        
6995    Abbas says Jerusalem is eternal Palestinian ca...
6996    In a Defiant, Angry Speech, Donald Trump Defen...
6997                                                  NaN
6998    WATCH TUCKER CARLSON Face Off With New York Ti...
6999    US Will Never Separate its Fighters from Isla...
Name: content, Length: 7000, dtype: object


In [12]:
# removing the label from the dataset
x = news_dataset.drop(columns='label', axis=1)
y = news_dataset['label']

In [13]:
print(x)

      Unnamed: 0                                              title  \
0              0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1              1                                                NaN   
2              2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3              3  Bobby Jindal, raised Hindu, uses story of Chri...   
4              4  SATAN 2: Russia unvelis an image of its terrif...   
...          ...                                                ...   
6995        6995  Abbas says Jerusalem is eternal Palestinian ca...   
6996        6996  In a Defiant, Angry Speech, Donald Trump Defen...   
6997        6997  Les Américains ne sont plus qu’à quelques heur...   
6998        6998  WATCH TUCKER CARLSON Face Off With New York Ti...   
6999        6999  US Will Never Separate its Fighters from Isla...   

                                                   text  \
0     No comment is expected from Barack Obama Membe...   
1        Did they post their 

In [14]:
print(y)

0       1
1       1
2       1
3       0
4       1
       ..
6995    0
6996    0
6997    1
6998    1
6999    1
Name: label, Length: 7000, dtype: int64


In [16]:
x.shape

(7000, 4)

In [17]:
y.shape

(7000,)

Stemming - process of reducing a word to its root

In [18]:
port_stem = PorterStemmer()

In [19]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [20]:
# Apply stemming to the entire 'content' column
news_dataset['content'] = news_dataset['content'].fillna('').apply(stemming)

In [21]:
print(news_dataset['content'])

0       law enforc high alert follow threat cop white ...
1                                                        
2       unbeliev obama attorney gener say charlott rio...
3       bobbi jindal rais hindu use stori christian co...
4       satan russia unv imag terrifi new supernuk wes...
                              ...                        
6995    abba say jerusalem etern palestinian capit dis...
6996    defiant angri speech donald trump defend imag ...
6997                                                     
6998    watch tucker carlson face new york time editor...
6999    us never separ fighter islamist depend us neve...
Name: content, Length: 7000, dtype: object


In [23]:
# Calculate the size of the first 1/7 of the dataset
x = news_dataset['content'].values
y = news_dataset['label'].values

In [24]:
y.shape

(7000,)

In [25]:
x.shape

(7000,)

In [26]:
vectorizer = TfidfVectorizer()

# Fit and transform the raw text data
x = vectorizer.fit_transform(x)

In [27]:
x.shape

(7000, 49005)

Splitting the dataset to training and text data

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, stratify=y, random_state=2)

In [29]:
y_train.shape

(6300,)

Training the Model: Logistic Regression

In [30]:
model = LogisticRegression()

In [31]:
x_train.shape

(6300, 49005)

In [32]:
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluation

In [33]:
#accuracy score on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [34]:
print("Accuracy Score of the Training Data: ", training_data_accuracy)

Accuracy Score of the Training Data:  0.9482539682539682


In [35]:
#accuracy score on test data
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [36]:
print("Accuracy Score of the Testing Data: ", testing_data_accuracy)

Accuracy Score of the Testing Data:  0.9114285714285715


Predictive System

In [37]:
# test subject is the example from the data set we are checking
# 
test_subject = 600
x_new = x_test[test_subject]

prediction = model.predict(x_new)
print(prediction)

if prediction == 0:
    print("Prediction: News = Real")
else:
    print("Prediction: News = Fake")

print(y_test[test_subject])
if y_test[test_subject] == 0:
    print("In Reality: News = Real")
else:
    print("In Reality: News = Fake")

[0]
Prediction: News = Real
0
In Reality: News = Real
