In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#  Data Modelling Libraries
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier, ExtraTreesClassifier,VotingClassifier)

from sklearn.model_selection import (GridSearchCV, cross_val_score, cross_val_predict,StratifiedKFold, learning_curve)

from sklearn.metrics import (confusion_matrix, accuracy_score)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import warnings
from collections import Counter

sns.set(style = 'white' , context = 'notebook', palette = 'deep')
warnings.filterwarnings('ignore', category = DeprecationWarning)
%matplotlib inline
import re

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

from nltk import word_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from collections import Counter




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
!pip install textstat



In [None]:
#determine readability / complexity
import textstat



In [None]:
#quality of vocabulary
!pip install lexicalrichness



In [None]:
from lexicalrichness import LexicalRichness

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing the dataset

In [None]:
path = "/content/drive/MyDrive/data/covid_data.csv"
df = pd.read_csv(path)

In [None]:
df.head()


Unnamed: 0,id,title,text,label
0,1,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",Fake
1,2,Due to the recent outbreak for the Coronavirus...,Hydroxychloroquine has been shown to have a 10...,Fake
2,3,Due to the recent outbreak for the Coronavirus...,Fact: Hydroxychloroquine has been shown to hav...,Fake
3,4,Due to the recent outbreak for the Coronavirus...,The Corona virus is a man made virus created i...,Fake
4,5,Due to the recent outbreak for the Coronavirus...,Doesn?t @BillGates finance research at the Wuh...,Fake


We will see how many true and fake news we have

In [None]:
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
Name: label, dtype: int64

The data set contains 586 true news and 578 fake news, almost 50/50 split.

 We will combine “title” and “text” into one feature “title_text” to make it simpler


In [None]:
df['label'].value_counts()

TRUE    584
Fake    345
fake    230
Name: label, dtype: int64

In [None]:
df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'
df.loc[df['label'] == 'fake', ['label']] = 'FAKE'
df.text.fillna(df.title, inplace=True)

df = df.sample(frac=1).reset_index(drop=True)
df.title.fillna('missing', inplace=True)

df['title_text'] = df['title'] + ' ' + df['text']

An example of the title text combination

In [None]:
df['title_text'][1]

'Genomic Study Points to Natural Origin of COVID-19 No matter where you go online these days, there?s bound to be discussion of coronavirus disease 2019 (COVID-19). Some folks are even making outrageous claims that the new coronavirus causing the pandemic was engineered in a lab and deliberately released to make people sick. A new study debunks such claims by providing scientific evidence that this novel coronavirus arose naturally.The reassuring findings are the result of genomic analyses conducted by an international research team, partly supported by NIH. In their study in the journal Nature Medicine, Kristian Andersen, Scripps Research Institute, La Jolla, CA; Robert Garry, Tulane University School of Medicine, New Orleans; and their colleagues used sophisticated bioinformatic tools to compare publicly available genomic data from several coronaviruses, including the new one that causes COVID-19.The researchers began by homing in on the parts of the coronavirus genomes that encode t

In [None]:
df.head()

Unnamed: 0,id,title,text,label,title_text
0,1094,Immunomodulation in COVID-19,The coronavirus disease 2019 (COVID-19) pandem...,TRUE,Immunomodulation in COVID-19 The coronavirus d...
1,747,Genomic Study Points to Natural Origin of COVI...,"No matter where you go online these days, ther...",TRUE,Genomic Study Points to Natural Origin of COVI...
2,573,Plandemic,"Dr. Fauci, the director of the National Instit...",FAKE,"Plandemic Dr. Fauci, the director of the Natio..."
3,646,In the News: Coronavirus and ?Alternative? Tre...,Coronaviruses are a large family of viruses. S...,TRUE,In the News: Coronavirus and ?Alternative? Tre...
4,1145,Summer weather could help fight coronavirus sp...,New research has bolstered the hypothesis that...,TRUE,Summer weather could help fight coronavirus sp...


A new column has been created

## **Cleaning**

In [None]:
df.isnull().values.any()

True

In [None]:
df.isnull().values.sum()

5

In [None]:
df.fillna("general info", inplace = True)

We will strip off any html tags, punctuation, and make them lower case.

In [None]:
def preprocessor(text):

    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower()

    return text

In [None]:
df['title_text'] = df['title_text'].apply(preprocessor)

In [None]:
df['title_text'][1]

'genomic study points to natural origin of covid19 no matter where you go online these days theres bound to be discussion of coronavirus disease 2019 covid19 some folks are even making outrageous claims that the new coronavirus causing the pandemic was engineered in a lab and deliberately released to make people sick a new study debunks such claims by providing scientific evidence that this novel coronavirus arose naturallythe reassuring findings are the result of genomic analyses conducted by an international research team partly supported by nih in their study in the journal nature medicine kristian andersen scripps research institute la jolla ca robert garry tulane university school of medicine new orleans and their colleagues used sophisticated bioinformatic tools to compare publicly available genomic data from several coronaviruses including the new one that causes covid19the researchers began by homing in on the parts of the coronavirus genomes that encode the spike proteins that

tokenization and stemming techniques together, and then apply the techniques on “title_text” later

In [None]:
df.head()


Unnamed: 0,id,title,text,label,title_text
0,1094,Immunomodulation in COVID-19,The coronavirus disease 2019 (COVID-19) pandem...,TRUE,immunomodulation in covid19 the coronavirus di...
1,747,Genomic Study Points to Natural Origin of COVI...,"No matter where you go online these days, ther...",TRUE,genomic study points to natural origin of covi...
2,573,Plandemic,"Dr. Fauci, the director of the National Instit...",FAKE,plandemic dr fauci the director of the nationa...
3,646,In the News: Coronavirus and ?Alternative? Tre...,Coronaviruses are a large family of viruses. S...,TRUE,in the news coronavirus and alternative treatm...
4,1145,Summer weather could help fight coronavirus sp...,New research has bolstered the hypothesis that...,TRUE,summer weather could help fight coronavirus sp...


# Data Analysis

## For Title

In [None]:
df['title_num_uppercase'] = df['title'].str.count(r'[A-Z]')


df['text_num_uppercase'] = df['text'].str.count(r'[A-Z]')
df['text_len'] = df['text'].str.len()


df['text_pct_uppercase'] = df.text_num_uppercase.div(df.text_len)

In [None]:
df['title_num_stop_words'] = df['title'].str.split().apply(lambda x: len(set(x) & stop_words))
df['text_num_stop_words'] = df['text'].str.split().apply(lambda x: len(set(x) & stop_words))


df['text_word_count'] = df['text'].apply(lambda x: len(str(x).split()))


df['text_pct_stop_words'] = df['text_num_stop_words'] / df['text_word_count']

## Capital letters

Lets Compute the percentage of capital letters in each article body rather than simply counting the number, because the length of the articles are very different.

In [None]:
df['title_num_uppercase'] = df['title'].str.count(r'[A-Z]')
df['text_num_uppercase'] = df['text'].str.count(r'[A-Z]')
df['text_len'] = df['text'].str.len()
df['text_pct_uppercase'] = df.text_num_uppercase.div(df.text_len)

x1 = df.loc[df['label']=='TRUE']['title_num_uppercase']
x2 = df.loc[df['label'] == 'FAKE']['title_num_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribution of Uppercase in title', template="plotly_white")
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Capital Letter in title', template="plotly_white")
fig.show()

  By looking at the above plots, fake news have way more words that appear in capital letters in the title hence it looks like fake news is targeted to audiences who are likely to be influenced by titles

In [None]:
df.loc[df['label']=='TRUE']['title_num_uppercase'].describe()

count    584.000000
mean       4.578767
std        5.214174
min        0.000000
25%        1.000000
50%        3.000000
75%        6.000000
max       60.000000
Name: title_num_uppercase, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['title_num_uppercase'].describe()

count    575.000000
mean      16.836522
std       22.844674
min        0.000000
25%        4.000000
50%        8.000000
75%       16.000000
max      114.000000
Name: title_num_uppercase, dtype: float64

## Stop words

Since the length of the articles are very differentpercentage of short words in each article body rather than simply counting the number

In [None]:
df['title_num_stop_words'] = df['title'].str.split().apply(lambda x: len(set(x) & stop_words))
df['text_num_stop_words'] = df['text'].str.split().apply(lambda x: len(set(x) & stop_words))
df['text_word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df['text_pct_stop_words'] = df['text_num_stop_words'] / df['text_word_count']

x1 = df.loc[df['label']=='TRUE']['title_num_stop_words']
x2 = df.loc[df['label'] == 'FAKE']['title_num_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribution of Stop Words in title', template="plotly_white")
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Stop Words in title', template="plotly_white")
fig.show()

### Fake news titles have fewer stop-words than those of real news.

# Proper Noun

Count number of proper nouns in title

In [None]:
df.drop(['text_num_uppercase', 'text_len', 'text_num_stop_words', 'text_word_count'], axis=1, inplace=True)



In [None]:
df['token'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
df['pos_tags'] = df.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_df = pd.DataFrame(df['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())
df = pd.concat([df, tag_count_df], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

df = df[['title', 'text', 'label', 'title_num_uppercase', 'text_pct_uppercase', 'title_num_stop_words', 'text_pct_stop_words', 'NNP']].rename(columns={'NNP': 'NNP_title'})



In [None]:
x1 = df.loc[df['label']=='TRUE']['NNP_title']
x2 = df.loc[df['label'] == 'FAKE']['NNP_title']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Proper nouns in title', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['NNP_title'].describe()

count    584.000000
mean       1.174658
std        1.742509
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max       12.000000
Name: NNP_title, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['NNP_title'].describe()

count    575.000000
mean       4.970435
std        4.213901
min        0.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       23.000000
Name: NNP_title, dtype: float64

Fake news titles have more proper nouns. Apparently the use of proper nouns in titles are very significant in differentiating fake from real.

Overall, these results suggest that the writers of fake news are attempting to attracting attention by using all capitalized words, and squeeze as much substance into the titles as possible by skipping stop-words and increase proper nouns.

example:

Fake news title: "FULL TRANSCRIPT OF “SMOKING GUN” BOMBSHELL INTERVIEW: PROF. FRANCES BOYLE EXPOSES THE BIOWEAPONS ORIGINS OF THE COVID-19 CORONAVIRUS"

Real news title: "Why outbreaks like coronavirus spread exponentially, and how to 'flatten the curve'"

## Features

we compute many content based features on the tweet texts

to keep a count of how many times each tag appears in the tweet text.

In [None]:
df['token'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
df['pos_tags'] = df.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_df = pd.DataFrame(df['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())

df = pd.concat([df, tag_count_df], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

Number of negations, interrogatives in the text

In [None]:
df['num_negation'] = df['text'].str.lower().str.count("no|not|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely|doesn’t|isn’t|wasn’t|shouldn’t|wouldn’t|couldn’t|won’t|can't|don't")

df['num_interrogatives_title'] = df['title'].str.lower().str.count("what|who|when|where|which|why|how")
df['num_interrogatives_text'] = df['text'].str.lower().str.count("what|who|when|where|which|why|how")

In [None]:
df.head()


Unnamed: 0,title,text,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title,DT,NN,CD,(,NNP,),",",VBN,IN,JJ,VBZ,TO,VB,RB,CC,NNS,VBG,.,JJR,VBP,PRP,:,WDT,MD,JJS,RBR,VBD,EX,PRP$,WRB,WP,RP,RBS,PDT,NNPS,FW,WP$,POS,$,UH,``,'',SYM,#,num_negation,num_interrogatives_title,num_interrogatives_text
0,Immunomodulation in COVID-19,The coronavirus disease 2019 (COVID-19) pandem...,TRUE,6,0.037549,1,0.058824,1.0,58.0,153.0,11.0,9.0,66.0,9.0,60.0,33.0,116.0,129.0,16.0,17.0,34.0,30.0,47.0,75.0,24.0,42.0,3.0,25.0,9.0,2.0,5.0,17.0,3.0,4.0,10.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,0,8
1,Genomic Study Points to Natural Origin of COVI...,"No matter where you go online these days, ther...",TRUE,10,0.029079,2,0.073103,6.0,81.0,120.0,3.0,5.0,45.0,5.0,39.0,21.0,90.0,94.0,24.0,24.0,27.0,38.0,15.0,58.0,15.0,31.0,1.0,12.0,11.0,4.0,8.0,4.0,0.0,0.0,19.0,1.0,10.0,1.0,3.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16,0,10
2,Plandemic,"Dr. Fauci, the director of the National Instit...",FAKE,1,0.028571,0,0.233766,1.0,7.0,9.0,0.0,0.0,13.0,0.0,3.0,2.0,7.0,10.0,2.0,3.0,4.0,1.0,4.0,7.0,1.0,4.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
3,In the News: Coronavirus and ?Alternative? Tre...,Coronaviruses are a large family of viruses. S...,TRUE,5,0.040438,2,0.101235,2.0,30.0,59.0,3.0,4.0,25.0,4.0,19.0,9.0,42.0,46.0,8.0,14.0,28.0,12.0,30.0,45.0,7.0,22.0,1.0,18.0,8.0,1.0,2.0,5.0,2.0,1.0,1.0,2.0,7.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0,3
4,Summer weather could help fight coronavirus sp...,New research has bolstered the hypothesis that...,TRUE,1,0.01768,2,0.054613,1.0,134.0,235.0,6.0,1.0,82.0,1.0,92.0,35.0,188.0,143.0,33.0,31.0,54.0,51.0,55.0,134.0,30.0,83.0,6.0,33.0,25.0,3.0,8.0,26.0,6.0,3.0,26.0,4.0,4.0,16.0,4.0,2.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20,0,24


In [None]:
df.loc[df['label']=='FAKE']['num_negation'].describe()

In [None]:
df.loc[df['label']=='FAKE']['num_interrogatives_title'].describe()

We will use a Python library — textstat to calculate statistics from text to determine readability, complexity and grade level of any article

In [None]:
reading_ease = []
for doc in df['text']:
    reading_ease.append(textstat.flesch_reading_ease(doc))

smog = []
for doc in df['text']:
    smog.append(textstat.smog_index(doc))

kincaid_grade = []
for doc in df['text']:
    kincaid_grade.append(textstat.flesch_kincaid_grade(doc))

liau_index = []
for doc in df['text']:
    liau_index.append(textstat.coleman_liau_index(doc))

readability_index = []
for doc in df['text']:
    readability_index.append(textstat.automated_readability_index(doc))

readability_score = []
for doc in df['text']:
    readability_score.append(textstat.dale_chall_readability_score(doc))

difficult_words = []
for doc in df['text']:
    difficult_words.append(textstat.difficult_words(doc))

write_formula = []
for doc in df['text']:
    write_formula.append(textstat.linsear_write_formula(doc))

gunning_fog = []
for doc in df['text']:
    gunning_fog.append(textstat.gunning_fog(doc))

text_standard = []
for doc in df['text']:
    text_standard.append(textstat.text_standard(doc))

df['flesch_reading_ease'] = reading_ease
df['smog_index'] = smog
df['flesch_kincaid_grade'] = kincaid_grade
df['automated_readability_index'] = readability_index
df['dale_chall_readability_score'] = readability_score
df['difficult_words'] = difficult_words
df['linsear_write_formula'] = write_formula
df['gunning_fog'] = gunning_fog
df['text_standard'] = text_standard

In [None]:
df.head()


Score	Difficulty
1. 90-100 Very Easy
2. 80-89	 Easy
3. 70-79  Fairly Easy
4. 60-69	 Standard
5. 50-59	 Fairly Difficult
6. 30-49	 Difficult
7. 0-29	 Very Confusing

## We will Use another Python library — lexicalrichness to find TTR(Type-token ratio )
TTR is the total number of unique words (types) divided by the total number of words (tokens) in a given segment of language.

In [None]:
ttr = []
for doc in df['text']:
    lex = LexicalRichness(doc)
    ttr.append(lex.ttr)

df['ttr'] = ttr

In [None]:
df.head()

Number of power words, casual words, tentative words, emotion words in the article body.

In [None]:
df['num_powerWords_text'] = df['text'].str.lower().str.count('improve|trust|immediately|discover|profit|learn|know|understand|powerful|best|win|more|bonus|exclusive|extra|you|free|health|guarantee|new|proven|safety|money|now|today|results|protect|help|easy|amazing|latest|extraordinary|how to|worst|ultimate|hot|first|big|anniversary|premiere|basic|complete|save|plus|create')
df['num_casualWords_text'] = df['text'].str.lower().str.count('make|because|how|why|change|use|since|reason|therefore|result')
df['num_tentativeWords_text'] = df['text'].str.lower().str.count('may|might|can|could|possibly|probably|it is likely|it is unlikely|it is possible|it is probable|tends to|appears to|suggests that|seems to')
df['num_emotionWords_text'] = df['text'].str.lower().str.count('ordeal|outrageous|provoke|repulsive|scandal|severe|shameful|shocking|terrible|tragic|unreliable|unstable|wicked|aggravate|agony|appalled|atrocious|corruption|damage|disastrous|disgusted|dreadful|eliminate|harmful|harsh|inconsiderate|enraged|offensive|aggressive|frustrated|controlling|resentful|anger|sad|fear|malicious|infuriated|critical|violent|vindictive|furious|contrary|condemning|sarcastic|poisonous|jealous|retaliating|desperate|alienated|unjustified|violated')

In [None]:
df.loc[df['label']=='FAKE']['num_powerWords_text'].describe()

In [None]:
df.loc[df['label']=='FAKE']['num_casualWords_text'].describe()

In [None]:
df.loc[df['label']=='FAKE']['num_tentativeWords_text'].describe()

In [None]:
df.loc[df['label']=='FAKE']['num_emotionWords_text'].describe()

In [None]:
df.head()

## For Tweets

## Capital Letters in Tweet

In [None]:
x1 = df.loc[df['label']=='TRUE']['text_pct_uppercase']
x2 = df.loc[df['label'] == 'FAKE']['text_pct_uppercase']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentage of Capital Letter in Article body', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['text_pct_uppercase'].describe()

count    584.000000
mean       0.026943
std        0.014330
min        0.000000
25%        0.018055
50%        0.024336
75%        0.033371
max        0.142857
Name: text_pct_uppercase, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['text_pct_uppercase'].describe()

count    575.000000
mean       0.040910
std        0.068307
min        0.000000
25%        0.023929
50%        0.031034
75%        0.040928
max        0.819048
Name: text_pct_uppercase, dtype: float64

On average, fake news have more words that appear in capital letters in the tweet body than those of real news.

## Stop Words

In [None]:
x1 = df.loc[df['label']=='TRUE']['text_pct_stop_words']
x2 = df.loc[df['label'] == 'FAKE']['text_pct_stop_words']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentage of Stop Words in Article Body', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['text_pct_stop_words'].describe()

count    584.000000
mean       0.159479
std        0.087453
min        0.000000
25%        0.080680
50%        0.156158
75%        0.219867
max        0.560000
Name: text_pct_stop_words, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['text_pct_stop_words'].describe()

count    575.000000
mean       0.144458
std        0.102079
min        0.000000
25%        0.065254
50%        0.105691
75%        0.210102
max        0.500000
Name: text_pct_stop_words, dtype: float64

there isn’t a significant difference on the percentage of stop words in article text between fake news and real news.

## Verb

In [None]:
x1 = df.loc[df['label']=='TRUE']['VBG']
x2 = df.loc[df['label'] == 'FAKE']['VBG']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of verbs in Text', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='FAKE']['VBG'].describe()

count    575.000000
mean      15.606957
std       20.258915
min        0.000000
25%        1.000000
50%        8.000000
75%       23.000000
max      162.000000
Name: VBG, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['VBG'].describe()

count    575.000000
mean      15.606957
std       20.258915
min        0.000000
25%        1.000000
50%        8.000000
75%       23.000000
max      162.000000
Name: VBG, dtype: float64

There is no significant difference on the number of verbs in real news or fake news

## Proper noun

In [None]:
x1 = df.loc[df['label']=='TRUE']['NNP']
x2 = df.loc[df['label'] == 'FAKE']['NNP']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Proper noun in Article Body', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['NNP'].describe()

count    584.000000
mean      31.621575
std       48.126858
min        0.000000
25%        2.750000
50%       10.000000
75%       41.250000
max      372.000000
Name: NNP, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['NNP'].describe()

count    575.000000
mean      64.071304
std       87.241585
min        0.000000
25%        7.000000
50%       35.000000
75%       83.000000
max      709.000000
Name: NNP, dtype: float64

Similar to titles, fake news pack more proper nouns in the article bodies as well.

## Negation words

In [None]:
x1 = df.loc[df['label']=='TRUE']['num_negation']
x2 = df.loc[df['label'] == 'FAKE']['num_negation']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Negations in Article Bodies', template="plotly_white")
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Negations in Article Bodies', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['num_negation'].describe()

count    584.000000
mean       9.825342
std       13.525922
min        0.000000
25%        2.000000
50%        4.000000
75%       13.000000
max       87.000000
Name: num_negation, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['num_negation'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: num_negation, dtype: float64

On average, fake news have a little more negation words than the real ones.

## Bracket

In [None]:
x1 = df.loc[df['label']=='TRUE']['(']
x2 = df.loc[df['label'] == 'FAKE']['(']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Brackets in Article Bodies', template="plotly_white")
fig.show()

In [None]:
x1 = df.loc[df['label']=='TRUE'][')']
x2 = df.loc[df['label'] == 'FAKE'][')']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Brackets in Text', template="plotly_white")
fig.show()

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Brackets in Article Bodies', template="plotly_white")
fig.show()

 fake news pack more brackets in the tweet text.

## Type-Token Ratio (TTR)

In [None]:
x1 = df.loc[df['label']=='TRUE']['ttr']
x2 = df.loc[df['label'] == 'FAKE']['ttr']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Type-token ratio in Article Bodies', template="plotly_white")
fig.show()

In [None]:
df.loc[df['label']=='TRUE']['ttr'].describe()

count    584.000000
mean       0.590981
std        0.161940
min        0.146718
25%        0.452776
50%        0.597823
75%        0.712946
max        1.000000
Name: ttr, dtype: float64

In [None]:
df.loc[df['label']=='FAKE']['ttr'].describe()

count    575.000000
mean       0.593458
std        0.194524
min        0.198005
25%        0.433481
50%        0.545296
75%        0.754589
max        1.000000
Name: ttr, dtype: float64

There does not seem to be a significant difference between fake news and real news in terms of TTR.

TTR shows the vocabulary in a document. A low TTR means a document has more word redundancy and a high TTR means a document has more word diversity.

# Conclusion

1. Fake news have way more words that appear in capital letters
2. Fake news titles have fewer stop-words than those of real news.
3. Fake news titles have more proper nouns.
4. there isn’t a significant difference on the percentage of stop words in article text between fake news and real news.
5. There is no significant difference on the number of verbs in real news or fake news
6. Similar to titles, fake news pack more proper nouns in the article bodies as well.
7. On average, fake news have a little more negation words than the real ones.
8. fake news pack more brackets in the tweet text.
9. There does not seem to be a significant difference between fake news and real news in terms of TTR.


## tokenizer and normalizer

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)
X = tfidf.fit_transform(df['title_text'])
Y = df.label.values

In [None]:
# Split the dataset into 80% Training set and 20% Testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0, shuffle=False)

Feature Scaling is a technique to standardize the independent features present in the data in a fixed range.
If feature scaling is not done, then a machine learning algorithm tends to weigh greater values, higher and consider smaller values as the lower values, regardless of the unit of the values

In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
def models(X_train ,Y_train ):

  #Using Logistic Regression Algorithm to the Training Set
  from sklearn.linear_model import LogisticRegression
  log = LogisticRegression(random_state = 0)
  log.fit(X_train , Y_train )

  #Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
  from sklearn.neighbors import KNeighborsClassifier
  knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  knn.fit(X_train , Y_train )

  #Using SVC method of svm class to use Support Vector Machine Algorithm
  from sklearn.svm import SVC
  svc_lin = SVC(kernel = 'linear', random_state = 0)
  svc_lin.fit(X_train , Y_train )

  #Using SVC method of svm class to use Kernel SVM Algorithm
  from sklearn.svm import SVC
  svc_rbf = SVC(kernel = 'rbf', random_state = 0)
  svc_rbf.fit(X_train , Y_train )

  #Using GaussianNB method of naïve_baY_train es class to use Naïve Bayes Algorithm
  from sklearn.naive_bayes import GaussianNB
  gauss = GaussianNB()
  gauss.fit(X_train , Y_train )

  #Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm
  from sklearn.tree import DecisionTreeClassifier
  tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
  tree.fit(X_train , Y_train)

  #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
  from sklearn.ensemble import RandomForestClassifier
  forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
  forest.fit(X_train , Y_train)

  #print model accuracy on the training data.
  print('[0]Logistic Regression Training Accuracy:', log.score(X_train , Y_train))
  print('[1]K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
  print('[2]Support Vector Machine (Linear Classifier) Training Accuracy:', svc_lin.score(X_train, Y_train))
  print('[3]Support Vector Machine (RBF Classifier) Training Accuracy:', svc_rbf.score(X_train, Y_train))
  print('[4]Gaussian Naive Bayes Training Accuracy:', gauss.score(X_train, Y_train))
  print('[5]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
  print('[6]Random Forest Classifier Training Accuracy:', forest.score(X_train, Y_train))

  return log, knn, svc_lin, svc_rbf, gauss, tree, forest

In [None]:
#Get and train all of the models
model = models(X_train.todense() ,Y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[0]Logistic Regression Training Accuracy: 0.9989258861439313
[1]K Nearest Neighbor Training Accuracy: 0.5359828141783028
[2]Support Vector Machine (Linear Classifier) Training Accuracy: 0.9989258861439313
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.920515574650913
[4]Gaussian Naive Bayes Training Accuracy: 0.9903329752953813
[5]Decision Tree Classifier Training Accuracy: 0.9989258861439313
[6]Random Forest Classifier Training Accuracy: 0.9935553168635876


1. Logistic Regression Training Accuracy: 0.9989258861439313
2.  K Nearest Neighbor Training Accuracy: 0.5349087003222341
3. Support Vector Machine (Linear Classifier) Training Accuracy: 0.9989258861439313
4. Support Vector Machine (RBF Classifier) Training Accuracy: 0.9377013963480129
5. Gaussian Naive Bayes Training Accuracy: 0.9903329752953813
6. Decision Tree Classifier Training Accuracy: 0.9989258861439313
7. Random Forest Classifier Training Accuracy: 0.9924812030075187

## Since Logistical regression and decision tree classifier has more accuracy we will train the model using Logistical Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
import pickle

In [None]:
clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, Y_train)

fake_news_model = open('fake_news_model.sav', 'wb')
pickle.dump(clf, fake_news_model)
fake_news_model.close()

In [None]:
filename = 'fake_news_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))

saved_clf.score(X_test, Y_test)

0.9227467811158798

In [None]:
#Print Prediction of Logistical Regression model
pred = model[0].predict(X_test)
print(pred)

#Print a space
print()

#Print the actual values
print(Y_test)

['TRUE' 'FAKE' 'TRUE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'TRUE'
 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE'
 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE' 'TRUE' 'TRUE'
 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'FAKE'
 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'FAKE' 'TRUE'
 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'FAKE'
 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'FAKE'
 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'FAKE' 'FAKE' 'FAKE'
 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'FAKE'
 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE'
 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'FAKE' 'TRUE' 'TRUE' 'TRUE' 'FAKE'
 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE' 'FAKE' 'TRUE'
 'FAKE' 'TRUE' 'FAKE' 'TRUE' 'TRUE' 'TRUE' 'TRUE' 'TRUE' 'FAKE' 'TRUE'
 'TRUE' 'TRUE' 'TRUE' 'TRUE' 'TRUE' 'FAKE' 'FAKE' 'FAKE' 'TRUE' 'TRUE'
 'FAKE

In [None]:
text = input("Enter the  news")
print(text)

In [None]:
#no of words>2
count=0
res = len(text.split())

if (res>2):
  count=count+1 ##true


#first char is capital
first_char = text[0]
if (first_char.isupper()):

    count=count+1 ##true


#no of upper words<2
res = text.split()
n=0

for i in res:
  if (i.isupper()):
       n=n+1 ##no of upper words
if (n<2):
  count=count+1 #true


#has atleast one '.' and does not end with ? or !
a=0
b=0
for l in res:
  if(l.endswith('.')):
    a=a+1
  if(l.endswith('?') or l.endswith('!')):
    b=b+1

  for x in l:
      if (x=='?' or x=='!'):
        b=b+1

if (a>=1):
    count=count+1 #true
if(b==0):
      count=count+1


#stopwords: a, of, on, with, for, at, the, in, to, from
sc=0
for s in res:
  if ( s=='of' or s=='on' or s=='with' or s=='for' or s=='at' or s=='the' or s=='in' or s=='to' or s=='from' or s=='is' or s=='I' or s=='are'):
    sc=sc+1
if (sc<2):
  count=count+1


#url
for u in res:
    if (u.startswith('http') or u.startswith('https') or u.startswith('www') and u.endswith('.com') or u.endswith('.in') or u.endswith('.org') or u.endswith('.gov') or u.endswith('.net') or u.endswith('.edu')):
      count=count+1


#@
tc=0
for t in res:
    if (t.startswith('@')):
      tc=tc+1
if(tc<=2):
  count=count+1


#negation words:not, never, none, nothing, uncessasary, no, neither, nowhere, useless, against, nor, without, nobody
negc=0
for neg in res:
  if (neg=='not' or neg=='never' or neg=='none' or neg=='nothing' or neg=='unecessary' or neg=='no' or neg=='neither' or neg=='nowhere' or neg=='useless' or neg=='against' or neg=='nor' or neg=='without' or neg=='nobody' or neg=='dont'):
    negc=negc+1
if(negc<=5):
  count=count+1


#brackets
brc=0
for br in res:
  if (br=='{' or br=='[' or br=="("):
      brc=brc+1
if(brc==0):
  count=count+1
print(count)
if (count>=8):
  print("The news is real!")
else:
  print("The news is fake!")



9
The news is real!
