We will be using logistic regression to predict weather news is fake or not we will also use nltk in it

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
data = pd.read_csv("/content/FakeNewsNet.csv")

In [37]:
data.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [38]:
#Filling null spaces with empty space
data.fillna("")

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1
...,...,...,...,...,...
23191,Pippa Middleton wedding: In case you missed it...,https://www.express.co.uk/news/royal/807049/pi...,www.express.co.uk,52,1
23192,Zayn Malik & Gigi Hadid’s Shocking Split: Why ...,hollywoodlife.com/2018/03/13/zayn-malik-gigi-h...,hollywoodlife.com,7,0
23193,Jessica Chastain Recalls the Moment Her Mother...,http://www.justjared.com/2018/01/17/jessica-ch...,www.justjared.com,26,1
23194,"Tristan Thompson Feels ""Dumped"" After Khloé Ka...",www.intouchweekly.com/posts/tristan-thompson-f...,www.intouchweekly.com,24,0


In [39]:
data['content'] = data['source_domain']+' '+data['title']

In [41]:
port_stem = PorterStemmer()
def stemming(content):
    if isinstance(content, str):
        # Remove non-alphabetic characters and replace with space
        stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
        # Tokenize the content
        stemmed_content = stemmed_content.split()
        # Stem each word and filter out stopwords
        stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
        # Join the stemmed words back into a string
        stemmed_content = ' '.join(stemmed_content)
        return stemmed_content
    else:
        # Return empty string for non-string or bytes-like objects
        return ''


data["content"] = data["content"].apply(stemming)

In [42]:
data["content"]

0        toofab com kandi burruss explod over rape accu...
1        www today com peopl choic award the best red c...
2        www etonlin com sophia bush send sweet birthda...
3        www dailymail co uk colombian singer maluma sp...
4        www zerchoo com gossip girl year later how upp...
                               ...                        
23191    www express co uk pippa middleton wed in case ...
23192    hollywoodlif com zayn malik gigi hadid shock s...
23193    www justjar com jessica chastain recal moment ...
23194    www intouchweekli com tristan thompson feel du...
23195    www billboard com kelli clarkson perform medle...
Name: content, Length: 23196, dtype: object

In [43]:
X = data["content"]
Y = data["real"]

In [44]:
tfidf = TfidfVectorizer()
X= tfidf.fit_transform(X)

In [45]:
print(X)

  (0, 13701)	0.206382954006351
  (0, 10578)	0.2488070840615655
  (0, 662)	0.30038533395778794
  (0, 5907)	0.2553049539118059
  (0, 10290)	0.22617469002462648
  (0, 61)	0.25380347136848513
  (0, 10237)	0.29528672648724646
  (0, 9193)	0.21149386336988618
  (0, 4203)	0.3469868545710738
  (0, 1719)	0.37868162916138465
  (0, 6666)	0.37868162916138465
  (0, 2469)	0.04966607810668577
  (0, 13001)	0.2879720406744054
  (1, 7451)	0.32473106818112224
  (1, 1928)	0.37790287714259024
  (1, 10358)	0.3676760898249685
  (1, 1113)	0.3303527143335857
  (1, 12665)	0.21980480411235595
  (1, 740)	0.3013132087080524
  (1, 2229)	0.4120403033560082
  (1, 9494)	0.2311332217558636
  (1, 12966)	0.3590431069685344
  (1, 14304)	0.09936446158630897
  (1, 2469)	0.07957196478049328
  (2, 4113)	0.25241997932571214
  :	:
  (23194, 5818)	0.23732308447220415
  (23194, 6253)	0.2611684511497142
  (23194, 168)	0.16712465234122792
  (23194, 7013)	0.23829271334181795
  (23194, 6680)	0.15834930322833224
  (23194, 12960)	0.1714

In [46]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size = 0.2 , stratify = Y,random_state = 2)

In [50]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, Y_train)

# Get the best hyperparameters
best_C = grid_search.best_params_['C']

# Use the best hyperparameters to train the model
best_model = LogisticRegression(C=best_C)
best_model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [52]:
X_train_prediction = best_model.predict(X_train)
train_accuracy = accuracy_score(X_train_prediction,Y_train)
print(train_accuracy)

0.9376482000431128


In [53]:
X_test_prediction = best_model.predict(X_test)
test_accuracy = accuracy_score(X_test_prediction,Y_test)
print(test_accuracy)

0.8568965517241379


In [55]:
input = X_test[4]
prediction = best_model.predict(input)
print(prediction)

[1]
