# Fake News Project
# Import Dataset + Create Sub-Set + Pre-processing


In [None]:
# As our original dataset (nela-gt-2022) was too large (about 1.78 million rows), we created a stratified subset that kept 
# the ratio of sources nearly identical to the original dataset. We kept 10% of the original dataset (177,831 observations) 
# and combined it with the dataset containing the labels (labels_all_2022). 

# We used two prediction classes only (0: reliable, 1: unreliable). 
# This combined and pre-processed dataset was used for exploratory data analysis and data visualization.

# saving stratified subset combined with labels as: strats_new.pkl
# loading as: strats_new


# Import libraries

In [1]:
#importing Libraries
import numpy as np
import pandas as pd
from matplotlib.pylab import plt
import seaborn as sns
# from sqlalchemy import create_engine
# import sqlalchemy as 
import sqlite3
import nltk
from nltk.tokenize import word_tokenize

# Import nela-gt-2022 database (original dataset) as dataframe "db"


In [2]:
# Read the complete "newsdata" table of the NELA-GT-2022 SQLite file into the
# DataFrame `db`.
# The connection is closed explicitly once the query result has been read, so
# the database file is not kept open for the rest of the kernel session.
# (Using the connection as a context manager would only manage transactions,
# not close it, hence try/finally.)
con = sqlite3.connect("nela-gt-2022_db/nela-gt-2022.db")
try:
    db = pd.read_sql_query("SELECT * from newsdata", con)
finally:
    con.close()


In [3]:
# Preview the top of the table as plain text (five rows, the head() default)
print(db.head(n=5))


                                                  id        date     source  \
0  sgtreport--2022-01-01--Masks purposely being f...  2022-01-01  sgtreport   
1  tass--2022-01-01--Stargazers get to see shooti...  2022-01-01       tass   
2  tass--2022-01-01--Kazakhstan picks up baton of...  2022-01-01       tass   
3  tass--2022-01-01--Defending red lines: What ch...  2022-01-01       tass   
4  tass--2022-01-01--Centuries-long saga of how t...  2022-01-01       tass   

                                               title  \
0  Masks purposely being forced on children to du...   
1  Stargazers get to see shooting stars, massive ...   
2  Kazakhstan picks up baton of CIS chairmanship,...   
3  Defending red lines: What challenges Russia an...   
4  Centuries-long saga of how the holiday tree ca...   

                                             content author  \
0  New research out of Brown University has found...    SGT   
1  MOSCOW, January 1. / TASS /. During the upcomi...          

In [4]:
# inspect columns of database
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778363 entries, 0 to 1778362
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   date            object
 2   source          object
 3   title           object
 4   content         object
 5   author          object
 6   url             object
 7   published       object
 8   published_utc   int64 
 9   collection_utc  int64 
dtypes: int64(2), object(8)
memory usage: 135.7+ MB
None


# Create a stratified subset "strats" that preserves the source proportions of the original data and contains 10% of its rows


In [6]:
# Number of articles per source, most frequent source first
db.source.value_counts()


thesun                              71109
theindependent                      66302
usnews                              65905
eveningstandard                     59613
theguardianuk                       51219
                                    ...  
urbanintellectuals                      1
whatfinger                              1
themillenniumreport                     1
learntheriskorg                         1
thehuffingtonpostpoliticalsatire        1
Name: source, Length: 361, dtype: int64

In [7]:
# Share of every source in the full database, in percent:
# normalize=True yields fractions, which are rounded to 10 decimals and then
# scaled by 100.
expected_ratio = db['source'].value_counts(normalize=True).round(10).mul(100)

# Keep the expected shares in a one-column DataFrame ('Expected'); the shares
# of the sampled subset are compared against this column later on.
source_ratios = pd.DataFrame({'Expected': expected_ratio})
source_ratios

Unnamed: 0,Expected
thesun,3.998565
theindependent,3.728260
usnews,3.705936
eveningstandard,3.352128
theguardianuk,2.880121
...,...
urbanintellectuals,0.000056
whatfinger,0.000056
themillenniumreport,0.000056
learntheriskorg,0.000056


In [9]:
# Draw the stratified sample: 10% of the rows of every source.
# Sampling inside each 'source' group keeps the per-source proportions of the
# full database; the fixed random_state makes the draw repeatable.
# The keyword arguments are forwarded by apply() to DataFrame.sample for each group.
strats = db.groupby('source').apply(
    pd.DataFrame.sample, frac=0.10, random_state=123
)


In [10]:
# First five rows of the sample.
# Note the two-level index: the 'source' group key now sits in front of the
# original row index.
strats.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,date,source,title,content,author,url,published,published_utc,collection_utc
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
21stcenturywire,565171,21stcenturywire--2022-04-26--An Unsettling Rea...,2022-04-26,21stcenturywire,An Unsettling Realization: ‘Mask Zealots are S...,It’s been noted how the fanatical nature of ma...,NEWS WIRE,https://21stcenturywire.com/2022/04/26/an-unse...,"Tue, 26 Apr 2022 10:31:24 +0000",1650983484,1651010407
21stcenturywire,565172,21stcenturywire--2022-04-24--Episode #416 – ‘S...,2022-04-24,21stcenturywire,Episode #416 – ‘Soldiers of Fortune’ with gues...,Episode #416 of SUNDAY WIRE SHOW resumes this ...,NEWS WIRE,https://21stcenturywire.com/2022/04/24/episode...,"Sun, 24 Apr 2022 14:00:48 +0000",1650823248,1651010408
21stcenturywire,955703,21stcenturywire--2022-07-15--UKC News: UK Thou...,2022-07-15,21stcenturywire,"UKC News: UK Thought Police, France Stops Vacc...",Authoritarians in the UK government are gettin...,NEWS WIRE,https://21stcenturywire.com/2022/07/15/ukc-new...,"Fri, 15 Jul 2022 16:26:25 +0000",1657916785,1657922407
21stcenturywire,691952,21stcenturywire--2022-05-22--Twitter Audit Fin...,2022-05-22,21stcenturywire,Twitter Audit Finds Half of President Biden’s ...,"After the 2020 election, many Americans began ...",NEWS WIRE,https://21stcenturywire.com/2022/05/22/twitter...,"Sun, 22 May 2022 12:17:48 +0000",1653236268,1653256808
21stcenturywire,344762,21stcenturywire--2022-03-11--UKC News: The Tru...,2022-03-11,21stcenturywire,UKC News: The Truth About US Bio Labs + White ...,The high stakes game surrounding the Ukraine c...,NEWS WIRE,https://21stcenturywire.com/2022/03/11/ukc-new...,"Fri, 11 Mar 2022 19:18:16 +0000",1647044296,1647039605


In [11]:
# Remove the extra 'source' index level added by groupby().apply().
# Guarded so the cell can be re-run safely: the outer level is only dropped
# while the index still has more than one level, so a second run is a no-op.
if strats.index.nlevels > 1:
    strats = strats.droplevel(0)


In [12]:
# Confirm that only the original row index is left on the sample
display(strats.head(n=5))


Unnamed: 0,id,date,source,title,content,author,url,published,published_utc,collection_utc
565171,21stcenturywire--2022-04-26--An Unsettling Rea...,2022-04-26,21stcenturywire,An Unsettling Realization: ‘Mask Zealots are S...,It’s been noted how the fanatical nature of ma...,NEWS WIRE,https://21stcenturywire.com/2022/04/26/an-unse...,"Tue, 26 Apr 2022 10:31:24 +0000",1650983484,1651010407
565172,21stcenturywire--2022-04-24--Episode #416 – ‘S...,2022-04-24,21stcenturywire,Episode #416 – ‘Soldiers of Fortune’ with gues...,Episode #416 of SUNDAY WIRE SHOW resumes this ...,NEWS WIRE,https://21stcenturywire.com/2022/04/24/episode...,"Sun, 24 Apr 2022 14:00:48 +0000",1650823248,1651010408
955703,21stcenturywire--2022-07-15--UKC News: UK Thou...,2022-07-15,21stcenturywire,"UKC News: UK Thought Police, France Stops Vacc...",Authoritarians in the UK government are gettin...,NEWS WIRE,https://21stcenturywire.com/2022/07/15/ukc-new...,"Fri, 15 Jul 2022 16:26:25 +0000",1657916785,1657922407
691952,21stcenturywire--2022-05-22--Twitter Audit Fin...,2022-05-22,21stcenturywire,Twitter Audit Finds Half of President Biden’s ...,"After the 2020 election, many Americans began ...",NEWS WIRE,https://21stcenturywire.com/2022/05/22/twitter...,"Sun, 22 May 2022 12:17:48 +0000",1653236268,1653256808
344762,21stcenturywire--2022-03-11--UKC News: The Tru...,2022-03-11,21stcenturywire,UKC News: The Truth About US Bio Labs + White ...,The high stakes game surrounding the Ukraine c...,NEWS WIRE,https://21stcenturywire.com/2022/03/11/ukc-new...,"Fri, 11 Mar 2022 19:18:16 +0000",1647044296,1647039605


In [13]:
# inspect the columns of the strats dataframe
# info() prints its own report and returns None, so it is called directly;
# passing the result to display() only rendered an extra "None".
strats.info()
# 177831 observations

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177831 entries, 565171 to 1226255
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              177831 non-null  object
 1   date            177831 non-null  object
 2   source          177831 non-null  object
 3   title           177831 non-null  object
 4   content         177831 non-null  object
 5   author          177831 non-null  object
 6   url             177831 non-null  object
 7   published       177831 non-null  object
 8   published_utc   177831 non-null  int64 
 9   collection_utc  177831 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 14.9+ MB


None

In [14]:
# Share of every source within the stratified sample, for comparison with db.
# Fractions are rounded to 4 decimals, scaled to percent, and the Series is
# named 'Stratified' so it becomes a column of that name when combined later.
stratified_ratio = (
    strats['source']
    .value_counts(normalize=True)
    .round(4)
    .mul(100)
    .rename('Stratified')
)


In [15]:
# Put the stratified proportions next to the expected ones in source_ratios.
# A column with the same name left over from an earlier run is dropped first,
# so re-running this cell does not append duplicate 'Stratified' columns.
source_ratios = pd.concat(
    [
        source_ratios.drop(columns=[stratified_ratio.name], errors="ignore"),
        stratified_ratio,
    ],
    axis=1,
)
source_ratios

# The ratios are very similar for the large sources; sources that contributed
# no rows to the sample show NaN in the 'Stratified' column.

Unnamed: 0,Expected,Stratified
thesun,3.998565,4.00
theindependent,3.728260,3.73
usnews,3.705936,3.71
eveningstandard,3.352128,3.35
theguardianuk,2.880121,2.88
...,...,...
urbanintellectuals,0.000056,
whatfinger,0.000056,
themillenniumreport,0.000056,
learntheriskorg,0.000056,


# Import and pre-process labels_all_2022 document


In [26]:
# Read the per-source labels file into a DataFrame
labels_all_2022 = pd.read_csv(filepath_or_buffer="labels_all_2022.csv")


In [28]:
# How many sources carry each label value (before any filtering)
labels_all_2022.label.value_counts()


 1    233
 0    115
 2     40
-1      4
Name: label, dtype: int64

In [29]:
# Drop the sources whose label is -1 (4 sources with a missing label)
labels_all_2022 = labels_all_2022.loc[labels_all_2022["label"].ne(-1)]


In [30]:
# Drop the sources whose label is 2 (40 sources), so only classes 0 and 1 remain
labels_all_2022 = labels_all_2022.loc[labels_all_2022["label"].ne(2)]


In [31]:
# Label frequencies after filtering: only 0 and 1 should be left
labels_all_2022.label.value_counts()


1    233
0    115
Name: label, dtype: int64

# Merge new strats subset with labels_all_2022 document
# create dataframe "strats_new"

In [72]:
# Left-join the per-source labels onto the sampled articles.
# how="left" keeps every article; articles whose source has no row in
# labels_all_2022 end up with NaN in the label columns.
# validate="many_to_one" makes pandas raise instead of silently duplicating
# articles if a source ever appears more than once in the labels table.
strats_new = strats.merge(
    right=labels_all_2022,
    on="source",
    how="left",
    validate="many_to_one",
)
# Label column of the merged frame (still contains NaN for unlabelled articles)
target = strats_new["label"]

In [73]:
# Inspect the merged dataframe: column names, dtypes and non-null counts.
# The non-null counts of the label columns show how many articles received a label.
strats_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177831 entries, 0 to 177830
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        177831 non-null  object 
 1   date                      177831 non-null  object 
 2   source                    177831 non-null  object 
 3   title                     177831 non-null  object 
 4   content                   177831 non-null  object 
 5   author                    177831 non-null  object 
 6   url                       177831 non-null  object 
 7   published                 177831 non-null  object 
 8   published_utc             177831 non-null  int64  
 9   collection_utc            177831 non-null  int64  
 10  country                   106074 non-null  object 
 11  label                     111280 non-null  float64
 12  bias                      111280 non-null  object 
 13  factuality                110158 non-null  f

In [74]:
# Look at the first five merged rows (five is the head() default)
strats_new.head()

Unnamed: 0,id,date,source,title,content,author,url,published,published_utc,collection_utc,country,label,bias,factuality,questionable-source,conspiracy-pseudoscience,pro-science
0,21stcenturywire--2022-04-26--An Unsettling Rea...,2022-04-26,21stcenturywire,An Unsettling Realization: ‘Mask Zealots are S...,It’s been noted how the fanatical nature of ma...,NEWS WIRE,https://21stcenturywire.com/2022/04/26/an-unse...,"Tue, 26 Apr 2022 10:31:24 +0000",1650983484,1651010407,USA,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0
1,21stcenturywire--2022-04-24--Episode #416 – ‘S...,2022-04-24,21stcenturywire,Episode #416 – ‘Soldiers of Fortune’ with gues...,Episode #416 of SUNDAY WIRE SHOW resumes this ...,NEWS WIRE,https://21stcenturywire.com/2022/04/24/episode...,"Sun, 24 Apr 2022 14:00:48 +0000",1650823248,1651010408,USA,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0
2,21stcenturywire--2022-07-15--UKC News: UK Thou...,2022-07-15,21stcenturywire,"UKC News: UK Thought Police, France Stops Vacc...",Authoritarians in the UK government are gettin...,NEWS WIRE,https://21stcenturywire.com/2022/07/15/ukc-new...,"Fri, 15 Jul 2022 16:26:25 +0000",1657916785,1657922407,USA,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0
3,21stcenturywire--2022-05-22--Twitter Audit Fin...,2022-05-22,21stcenturywire,Twitter Audit Finds Half of President Biden’s ...,"After the 2020 election, many Americans began ...",NEWS WIRE,https://21stcenturywire.com/2022/05/22/twitter...,"Sun, 22 May 2022 12:17:48 +0000",1653236268,1653256808,USA,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0
4,21stcenturywire--2022-03-11--UKC News: The Tru...,2022-03-11,21stcenturywire,UKC News: The Truth About US Bio Labs + White ...,The high stakes game surrounding the Ukraine c...,NEWS WIRE,https://21stcenturywire.com/2022/03/11/ukc-new...,"Fri, 11 Mar 2022 19:18:16 +0000",1647044296,1647039605,USA,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0


In [75]:
# Class distribution among the labelled articles
strats_new.label.value_counts()

# slightly unbalanced: class 0 is somewhat more frequent than class 1

0.0    61222
1.0    50058
Name: label, dtype: int64

## Word count + count characters

In [76]:
# word count content
# str.split() without a separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines no longer inflate the count and empty text counts
# as 0 words (splitting on a single " " counted empty fragments as words).
strats_new["word_count_content"] = strats_new["content"].str.split().str.len()


In [77]:
# word count title
# str.split() without a separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines no longer inflate the count and an empty title
# counts as 0 words (splitting on a single " " counted empty fragments as words).
strats_new["word_count_title"] = strats_new["title"].str.split().str.len()


In [78]:
# Number of characters in each article body
strats_new["nb_character_content"] = strats_new["content"].map(len)


In [79]:
# Number of characters in each article title
strats_new["nb_character_title"] = strats_new["title"].map(len)


In [80]:
# First five rows of the final dataset, now including the count columns
strats_new.head(n=5)


Unnamed: 0,id,date,source,title,content,author,url,published,published_utc,collection_utc,...,label,bias,factuality,questionable-source,conspiracy-pseudoscience,pro-science,word_count_content,word_count_title,nb_character_content,nb_character_title
0,21stcenturywire--2022-04-26--An Unsettling Rea...,2022-04-26,21stcenturywire,An Unsettling Realization: ‘Mask Zealots are S...,It’s been noted how the fanatical nature of ma...,NEWS WIRE,https://21stcenturywire.com/2022/04/26/an-unse...,"Tue, 26 Apr 2022 10:31:24 +0000",1650983484,1651010407,...,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0,108,8,720,61
1,21stcenturywire--2022-04-24--Episode #416 – ‘S...,2022-04-24,21stcenturywire,Episode #416 – ‘Soldiers of Fortune’ with gues...,Episode #416 of SUNDAY WIRE SHOW resumes this ...,NEWS WIRE,https://21stcenturywire.com/2022/04/24/episode...,"Sun, 24 Apr 2022 14:00:48 +0000",1650823248,1651010408,...,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0,179,11,1143,66
2,21stcenturywire--2022-07-15--UKC News: UK Thou...,2022-07-15,21stcenturywire,"UKC News: UK Thought Police, France Stops Vacc...",Authoritarians in the UK government are gettin...,NEWS WIRE,https://21stcenturywire.com/2022/07/15/ukc-new...,"Fri, 15 Jul 2022 16:26:25 +0000",1657916785,1657922407,...,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0,207,13,1340,82
3,21stcenturywire--2022-05-22--Twitter Audit Fin...,2022-05-22,21stcenturywire,Twitter Audit Finds Half of President Biden’s ...,"After the 2020 election, many Americans began ...",NEWS WIRE,https://21stcenturywire.com/2022/05/22/twitter...,"Sun, 22 May 2022 12:17:48 +0000",1653236268,1653256808,...,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0,504,11,3274,69
4,21stcenturywire--2022-03-11--UKC News: The Tru...,2022-03-11,21stcenturywire,UKC News: The Truth About US Bio Labs + White ...,The high stakes game surrounding the Ukraine c...,NEWS WIRE,https://21stcenturywire.com/2022/03/11/ukc-new...,"Fri, 11 Mar 2022 19:18:16 +0000",1647044296,1647039605,...,1.0,conspiracy-pseudoscience,2.0,0.0,1.0,0.0,147,14,908,70


In [81]:
# Final structure check: column names, dtypes and non-null counts of strats_new
strats_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177831 entries, 0 to 177830
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        177831 non-null  object 
 1   date                      177831 non-null  object 
 2   source                    177831 non-null  object 
 3   title                     177831 non-null  object 
 4   content                   177831 non-null  object 
 5   author                    177831 non-null  object 
 6   url                       177831 non-null  object 
 7   published                 177831 non-null  object 
 8   published_utc             177831 non-null  int64  
 9   collection_utc            177831 non-null  int64  
 10  country                   106074 non-null  object 
 11  label                     111280 non-null  float64
 12  bias                      111280 non-null  object 
 13  factuality                110158 non-null  f

# STORE NEW DATASET AS PKL

In [34]:
# Persist the labelled subset to disk as strats_new.pkl
strats_new.to_pickle(path="strats_new.pkl")