In [1]:
#TODO
# 1) Preprocessing, Cleaning Text Data
## Removal of Stop words, Tokenization Stemming
# 2) Feature Extraction and Data transformation
## Bag of Words, TD-IDF, Scaling or Feature Engineering
# 3) ML Model (Model Building, Evaluation and Tuning)
## a) Naive Bayes
## b) Logistic Regression

In [2]:
#importing packages
import pandas as pd
import re

In [3]:
#reading the data
df=pd.read_json('archive/politifact_factcheck_data.json',lines=True)

In [4]:
#reading first 5 rows
df.head()

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
0,true,Barack Obama,John McCain opposed bankruptcy protections for...,6/11/2008,speech,Adriel Bettelheim,6/16/2008,https://www.politifact.com/factchecks/2008/jun...
1,false,Matt Gaetz,"""Bennie Thompson actively cheer-led riots in t...",6/7/2022,television,Yacob Reyes,6/13/2022,https://www.politifact.com/factchecks/2022/jun...
2,mostly-true,Kelly Ayotte,"Says Maggie Hassan was ""out of state on 30 day...",5/18/2016,news,Clay Wirestone,5/27/2016,https://www.politifact.com/factchecks/2016/may...
3,false,Bloggers,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",2/1/2021,blog,Madison Czopek,2/5/2021,https://www.politifact.com/factchecks/2021/feb...
4,half-true,Bobby Jindal,"""I'm the only (Republican) candidate that has ...",8/30/2015,television,Linda Qiu,8/30/2015,https://www.politifact.com/factchecks/2015/aug...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21152 entries, 0 to 21151
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   verdict                  21152 non-null  object
 1   statement_originator     21152 non-null  object
 2   statement                21152 non-null  object
 3   statement_date           21152 non-null  object
 4   statement_source         21152 non-null  object
 5   factchecker              21152 non-null  object
 6   factcheck_date           21152 non-null  object
 7   factcheck_analysis_link  21152 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB


In [6]:
df.describe(include='all')

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
count,21152,21152,21152,21152,21152,21152,21152,21152
unique,6,4565,21139,4751,13,563,4322,21151
top,false,Facebook posts,Says Mitt Romney flip-flopped on abortion.,11/4/2020,news,Louis Jacobson,2/20/2020,https://www.politifact.com/factchecks/2020/mar...
freq,5625,1914,2,30,5763,1834,19,2


# **Data Cleaning**

## Checking Null Values

In [7]:
df.isnull().sum()

verdict                    0
statement_originator       0
statement                  0
statement_date             0
statement_source           0
factchecker                0
factcheck_date             0
factcheck_analysis_link    0
dtype: int64

**No null values has been found in the dataset.**

## Changing Data Types

**statement_date and factcheck_date column is in object. We want them to be in datatime format as we look for the values in them that are given in date-time.**

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21152 entries, 0 to 21151
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   verdict                  21152 non-null  object
 1   statement_originator     21152 non-null  object
 2   statement                21152 non-null  object
 3   statement_date           21152 non-null  object
 4   statement_source         21152 non-null  object
 5   factchecker              21152 non-null  object
 6   factcheck_date           21152 non-null  object
 7   factcheck_analysis_link  21152 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB


In [9]:
# Convert 'statement_date' to datetime
df['statement_date'] = pd.to_datetime(df['statement_date'])

# Convert 'factcheck_date' to datetime
df['factcheck_date'] = pd.to_datetime(df['factcheck_date'])


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21152 entries, 0 to 21151
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   verdict                  21152 non-null  object        
 1   statement_originator     21152 non-null  object        
 2   statement                21152 non-null  object        
 3   statement_date           21152 non-null  datetime64[ns]
 4   statement_source         21152 non-null  object        
 5   factchecker              21152 non-null  object        
 6   factcheck_date           21152 non-null  datetime64[ns]
 7   factcheck_analysis_link  21152 non-null  object        
dtypes: datetime64[ns](2), object(6)
memory usage: 1.3+ MB


## Checking Duplicates

In [11]:
# Check for exact duplicate rows across all columns
duplicate_rows = df[df.duplicated()]

In [12]:
duplicate_rows

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
20100,False,YouTube videos,"""The coronavirus was invented and patented in ...",2020-02-19,social_media,Tom Kertscher,2020-03-10,https://www.politifact.com/factchecks/2020/mar...


**There is a single duplicate row in the dataset that we intend to remove.**

In [13]:
# Drop duplicate rows
df=df.drop_duplicates()

In [14]:
# Reset the index of the cleaned DataFrame
df=df.reset_index(drop=True)

**I've decided to remove three columns from the dataset: `statement_date`, `factcheck_date`, and `factcheck_analysis_link`, as we won't be utilizing them for our analysis.**

In [15]:
# List of columns to drop
columns_to_drop = ['statement_date', 'factcheck_date', 'factcheck_analysis_link']

# Drop the specified columns from the DataFrame
df.drop(columns=columns_to_drop, inplace=True)

# Display the first few rows of the DataFrame after dropping columns
print("DataFrame after dropping specified columns:")
print(df.head())

DataFrame after dropping specified columns:
       verdict statement_originator  \
0         true         Barack Obama   
1        false           Matt Gaetz   
2  mostly-true         Kelly Ayotte   
3        false             Bloggers   
4    half-true         Bobby Jindal   

                                           statement statement_source  \
0  John McCain opposed bankruptcy protections for...           speech   
1  "Bennie Thompson actively cheer-led riots in t...       television   
2  Says Maggie Hassan was "out of state on 30 day...             news   
3  "BUSTED: CDC Inflated COVID Numbers, Accused o...             blog   
4  "I'm the only (Republican) candidate that has ...       television   

         factchecker  
0  Adriel Bettelheim  
1        Yacob Reyes  
2     Clay Wirestone  
3     Madison Czopek  
4          Linda Qiu  


## Convert to Lowercase

In [16]:
# Apply lowercase transformation to all columns in the DataFrame
df = df.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain opposed bankruptcy protections for...,speech,adriel bettelheim
1,false,matt gaetz,"""bennie thompson actively cheer-led riots in t...",television,yacob reyes
2,mostly-true,kelly ayotte,"says maggie hassan was ""out of state on 30 day...",news,clay wirestone
3,false,bloggers,"""busted: cdc inflated covid numbers, accused o...",blog,madison czopek
4,half-true,bobby jindal,"""i'm the only (republican) candidate that has ...",television,linda qiu
...,...,...,...,...,...
21146,mostly-false,donald trump,says the large trade deficit with japan stems ...,speech,jon greenberg
21147,false,donald trump jr.,"""tens of thousands"" of people leave new york e...",social_media,jill terreri ramos
21148,mostly-false,chris abele,"""i have fought for our shared values without b...",news,dave umhoefer
21149,false,bloggers,"""germany halts all covid-19 vaccines, says the...",blog,ciara o'rourke


## Expanding Contractions

In [17]:
# Dictionary mapping contractions to their expanded forms
contractions_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Function to expand contractions in a given text using the contractions dictionary
def expand_contractions(text, contractions_dict):
    # Regular expression pattern to find contractions in text
    pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')

    # Function to expand a matched contraction using the dictionary
    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contractions_dict.get(match)
        if not expanded:
            expanded = contractions_dict.get(match.lower())
        return expanded

    # Apply the contraction expansion function to the text
    expanded_text = pattern.sub(expand_match, text)
    return expanded_text

# Apply contraction expansion to all cells of the DataFrame
df['statement'] = df['statement'].apply(lambda x: expand_contractions(x, contractions_dict))

df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain opposed bankruptcy protections for...,speech,adriel bettelheim
1,false,matt gaetz,"""bennie thompson actively cheer-led riots in t...",television,yacob reyes
2,mostly-true,kelly ayotte,"says maggie hassan was ""out of state on 30 day...",news,clay wirestone
3,false,bloggers,"""busted: cdc inflated covid numbers, accused o...",blog,madison czopek
4,half-true,bobby jindal,"""i am the only (republican) candidate that has...",television,linda qiu
...,...,...,...,...,...
21146,mostly-false,donald trump,says the large trade deficit with japan stems ...,speech,jon greenberg
21147,false,donald trump jr.,"""tens of thousands"" of people leave new york e...",social_media,jill terreri ramos
21148,mostly-false,chris abele,"""i have fought for our shared values without b...",news,dave umhoefer
21149,false,bloggers,"""germany halts all covid-19 vaccines, says the...",blog,ciara o'rourke


## Remove Special Characters and Punctuation

In [18]:
# Function to remove special characters and punctuation from text
def remove_special_characters(text):
    if isinstance(text, str):  # Check if the input is a string
        pattern = r'[^A-Za-z0-9\s]'  # Define regex pattern to match special characters
        return re.sub(pattern, '', text)  # Remove special characters using regex

# Apply function to remove special characters to text columns only
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains string (object) data
        df[col] = df[col].apply(remove_special_characters)
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain opposed bankruptcy protections for...,speech,adriel bettelheim
1,false,matt gaetz,bennie thompson actively cheerled riots in the...,television,yacob reyes
2,mostlytrue,kelly ayotte,says maggie hassan was out of state on 30 days...,news,clay wirestone
3,false,bloggers,busted cdc inflated covid numbers accused of v...,blog,madison czopek
4,halftrue,bobby jindal,i am the only republican candidate that has ac...,television,linda qiu
...,...,...,...,...,...
21146,mostlyfalse,donald trump,says the large trade deficit with japan stems ...,speech,jon greenberg
21147,false,donald trump jr,tens of thousands of people leave new york eve...,socialmedia,jill terreri ramos
21148,mostlyfalse,chris abele,i have fought for our shared values without be...,news,dave umhoefer
21149,false,bloggers,germany halts all covid19 vaccines says they a...,blog,ciara orourke


## Removing Numbers

In [19]:
# Function to remove numbers using regular expressions
def remove_numbers(text):
    # Define regex pattern to match numbers (\d+ matches one or more digits)
    pattern = r'\d+'
    # Replace numbers with empty string (remove numbers)
    return re.sub(pattern, '', str(text))  # Convert to string before applying regex

# Apply function to remove numbers to all columns in the DataFrame
df = df.applymap(remove_numbers)

df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain opposed bankruptcy protections for...,speech,adriel bettelheim
1,false,matt gaetz,bennie thompson actively cheerled riots in the s,television,yacob reyes
2,mostlytrue,kelly ayotte,says maggie hassan was out of state on days o...,news,clay wirestone
3,false,bloggers,busted cdc inflated covid numbers accused of v...,blog,madison czopek
4,halftrue,bobby jindal,i am the only republican candidate that has ac...,television,linda qiu
...,...,...,...,...,...
21146,mostlyfalse,donald trump,says the large trade deficit with japan stems ...,speech,jon greenberg
21147,false,donald trump jr,tens of thousands of people leave new york eve...,socialmedia,jill terreri ramos
21148,mostlyfalse,chris abele,i have fought for our shared values without be...,news,dave umhoefer
21149,false,bloggers,germany halts all covid vaccines says they are...,blog,ciara orourke


## Tokenization

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Function to tokenize text using CountVectorizer (sklearn)
def tokenize_text(text):
    vectorizer = CountVectorizer()
    tokens = vectorizer.build_analyzer()(text)  # Tokenize text using CountVectorizer's analyzer
    return tokens

# Apply tokenization function to all text columns in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if column contains text data
        df[col] = df[col].apply(lambda x: tokenize_text(str(x)))
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,[true],"[barack, obama]","[john, mccain, opposed, bankruptcy, protection...",[speech],"[adriel, bettelheim]"
1,[false],"[matt, gaetz]","[bennie, thompson, actively, cheerled, riots, ...",[television],"[yacob, reyes]"
2,[mostlytrue],"[kelly, ayotte]","[says, maggie, hassan, was, out, of, state, on...",[news],"[clay, wirestone]"
3,[false],[bloggers],"[busted, cdc, inflated, covid, numbers, accuse...",[blog],"[madison, czopek]"
4,[halftrue],"[bobby, jindal]","[am, the, only, republican, candidate, that, h...",[television],"[linda, qiu]"
...,...,...,...,...,...
21146,[mostlyfalse],"[donald, trump]","[says, the, large, trade, deficit, with, japan...",[speech],"[jon, greenberg]"
21147,[false],"[donald, trump, jr]","[tens, of, thousands, of, people, leave, new, ...",[socialmedia],"[jill, terreri, ramos]"
21148,[mostlyfalse],"[chris, abele]","[have, fought, for, our, shared, values, witho...",[news],"[dave, umhoefer]"
21149,[false],[bloggers],"[germany, halts, all, covid, vaccines, says, t...",[blog],"[ciara, orourke]"


## Removing Stop Words

In [21]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Custom function to preprocess text by removing stop words
def preprocess_text(text):
    # Handle the case where text is a list of strings
    if isinstance(text, list):
        text = ' '.join(text)

    # Split text into tokens
    tokens = text.split()

    # Convert tokens to lowercase and remove stop words
    filtered_tokens = [token.lower() for token in tokens if token.lower() not in ENGLISH_STOP_WORDS]

    # Join filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# Iterate over each column in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains text data
        df[col] = df[col].apply(preprocess_text)
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain opposed bankruptcy protections fam...,speech,adriel bettelheim
1,false,matt gaetz,bennie thompson actively cheerled riots,television,yacob reyes
2,mostlytrue,kelly ayotte,says maggie hassan state days months,news,clay wirestone
3,false,bloggers,busted cdc inflated covid numbers accused viol...,blog,madison czopek
4,halftrue,bobby jindal,republican candidate actually reduced size gov...,television,linda qiu
...,...,...,...,...,...
21146,mostlyfalse,donald trump,says large trade deficit japan stems sending m...,speech,jon greenberg
21147,false,donald trump jr,tens thousands people leave new york week,socialmedia,jill terreri ramos
21148,mostlyfalse,chris abele,fought shared values ideologue partisan,news,dave umhoefer
21149,false,bloggers,germany halts covid vaccines says unsafe longe...,blog,ciara orourke


## Lemmatization

In [22]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Initialize NLTK's WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to perform lemmatization on a single word
def lemmatize_word(word):
    # Determine the Part of Speech (POS) tag for the word
    pos_tag = nltk.pos_tag([word])[0][1][0].upper()
    pos_tag = pos_tag if pos_tag in ['A', 'N', 'V'] else 'N'  # Map POS tags to WordNet POS tags

    # Perform lemmatization based on the determined POS tag
    return lemmatizer.lemmatize(word, pos=pos_tag.lower())

# Define a function to perform lemmatization on a list of tokens
def lemmatize_tokens(tokens):
    lemmatized_tokens = [lemmatize_word(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to all text columns in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains text data
        df[col] = df[col].apply(lambda x: lemmatize_tokens(nltk.word_tokenize(x)) if isinstance(x, str) else x)
df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\annma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\annma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\annma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain oppose bankruptcy protection famil...,speech,adriel bettelheim
1,false,matt gaetz,bennie thompson actively cheerled riot,television,yacob reyes
2,mostlytrue,kelly ayotte,say maggie hassan state day month,news,clay wirestone
3,false,blogger,bust cdc inflate covid number accuse violate f...,blog,madison czopek
4,halftrue,bobby jindal,republican candidate actually reduce size gove...,television,linda qiu
...,...,...,...,...,...
21146,mostlyfalse,donald trump,say large trade deficit japan stem send millio...,speech,jon greenberg
21147,false,donald trump jr,ten thousand people leave new york week,socialmedia,jill terreri ramos
21148,mostlyfalse,chris abele,fought share value ideologue partisan,news,dave umhoefer
21149,false,blogger,germany halt covid vaccine say unsafe longer r...,blog,ciara orourke


## Removing Whitespaces

In [23]:
# Function to clean white spaces in text
def clean_whitespace(text):
    if isinstance(text, str):  # Check if the value is a string
        # Remove leading and trailing spaces
        cleaned_text = text.strip()
        # Replace multiple spaces with a single space using regex
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text
    else:
        return text  # Return non-string values as is

# Apply whitespace cleaning function to all text columns in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains text data
        df[col] = df[col].apply(clean_whitespace)
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain oppose bankruptcy protection famil...,speech,adriel bettelheim
1,false,matt gaetz,bennie thompson actively cheerled riot,television,yacob reyes
2,mostlytrue,kelly ayotte,say maggie hassan state day month,news,clay wirestone
3,false,blogger,bust cdc inflate covid number accuse violate f...,blog,madison czopek
4,halftrue,bobby jindal,republican candidate actually reduce size gove...,television,linda qiu
...,...,...,...,...,...
21146,mostlyfalse,donald trump,say large trade deficit japan stem send millio...,speech,jon greenberg
21147,false,donald trump jr,ten thousand people leave new york week,socialmedia,jill terreri ramos
21148,mostlyfalse,chris abele,fought share value ideologue partisan,news,dave umhoefer
21149,false,blogger,germany halt covid vaccine say unsafe longer r...,blog,ciara orourke


In [24]:
df['statement']

0        john mccain oppose bankruptcy protection famil...
1                   bennie thompson actively cheerled riot
2                        say maggie hassan state day month
3        bust cdc inflate covid number accuse violate f...
4        republican candidate actually reduce size gove...
                               ...                        
21146    say large trade deficit japan stem send millio...
21147              ten thousand people leave new york week
21148                fought share value ideologue partisan
21149    germany halt covid vaccine say unsafe longer r...
21150    say healthy people experience mild moderate re...
Name: statement, Length: 21151, dtype: object

In [25]:
#df.to_csv('data_cleaned.csv', index = False)

# Bag of Words

In [26]:
# vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))

# X_bow = vectorizer.fit_transform(df['statement'])
# vocabulary = vectorizer.get_feature_names_out()
# bow_df = pd.DataFrame(X_bow.toarray(), columns=vocabulary)
# df_bow = pd.concat([df, bow_df], axis=1)
# df_bow
# TF-IDF Vectorizer

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_df=0.85, ngram_range=(1, 2))

X_tfid = tfid.fit_transform(df['statement'])
avg_tfidf = X_tfid.mean(axis=0)
X_tfid.shape

(21151, 134600)

In [28]:
min_tfidf_threshold = 0.0001  # Minimum TF-IDF score
max_tfidf_threshold = 0.9   # Maximum TF-IDF score (as a percentage of maximum TF-IDF score)

# Filter out words based on TF-IDF score thresholds
selected_features = [
    word for word, tfidf in zip(tfid.get_feature_names_out(), avg_tfidf.tolist()[0])
    if min_tfidf_threshold <= tfidf <= max_tfidf_threshold * max(avg_tfidf.tolist()[0])
]
len(selected_features)


4313

In [29]:
tfidf_vectorizer_filtered = TfidfVectorizer(vocabulary=selected_features)

# Fit and transform the filtered text data
X_tfidf_filtered = tfidf_vectorizer_filtered.fit_transform(df['statement'])
tfidf_df = pd.DataFrame(X_tfidf_filtered.toarray(), columns=selected_features)
tfidf_df.shape

(21151, 4313)

Since there are no other numerical columns except the vectorized statements, standardization or normalization is not necessary.

In [30]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

df['statement_source'] = encoder.fit_transform(df['statement_source'])
df['factchecker'] = encoder.fit_transform(df['factchecker'])
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,barack obama,john mccain oppose bankruptcy protection famil...,9,8
1,false,matt gaetz,bennie thompson actively cheerled riot,11,554
2,mostlytrue,kelly ayotte,say maggie hassan state day month,6,102
3,false,blogger,bust cdc inflate covid number accuse violate f...,2,351
4,halftrue,bobby jindal,republican candidate actually reduce size gove...,11,332
...,...,...,...,...,...
21146,mostlyfalse,donald trump,say large trade deficit japan stem send millio...,9,252
21147,false,donald trump jr,ten thousand people leave new york week,8,234
21148,mostlyfalse,chris abele,fought share value ideologue partisan,6,122
21149,false,blogger,germany halt covid vaccine say unsafe longer r...,2,99


In [31]:
df['statement_source'].unique()

array([ 9, 11,  6,  2,  0,  8,  1,  3,  5,  7,  4, 12, 10])

In [32]:
df['statement_originator'].nunique()

4551

# Feature Engineering

In [33]:
originator_freq = df['statement_originator'].value_counts(normalize=True)
originator_freq_map = originator_freq.to_dict()
df['statement_originator'] = df['statement_originator'].map(originator_freq_map)
df['statement_originator']

0        0.026618
1        0.000567
2        0.000520
3        0.033899
4        0.000473
           ...   
21146    0.044915
21147    0.000804
21148    0.000851
21149    0.033899
21150    0.090492
Name: statement_originator, Length: 21151, dtype: float64

In [34]:
df

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker
0,true,0.026618,john mccain oppose bankruptcy protection famil...,9,8
1,false,0.000567,bennie thompson actively cheerled riot,11,554
2,mostlytrue,0.000520,say maggie hassan state day month,6,102
3,false,0.033899,bust cdc inflate covid number accuse violate f...,2,351
4,halftrue,0.000473,republican candidate actually reduce size gove...,11,332
...,...,...,...,...,...
21146,mostlyfalse,0.044915,say large trade deficit japan stem send millio...,9,252
21147,false,0.000804,ten thousand people leave new york week,8,234
21148,mostlyfalse,0.000851,fought share value ideologue partisan,6,122
21149,false,0.033899,germany halt covid vaccine say unsafe longer r...,2,99


In [35]:
df['verdict'].unique()

array(['true', 'false', 'mostlytrue', 'halftrue', 'pantsfire',
       'mostlyfalse'], dtype=object)

In [36]:
# from sklearn.preprocessing import OrdinalEncoder
# import numpy as np

# ordinal_encoder = OrdinalEncoder(categories=[['pantsfire', 'false', 'mostlyfalse', 'halftrue', 'mostlytrue', 'true']])
# df['verdict'] = ordinal_encoder.fit_transform(np.array(df['verdict']).reshape(-1, 1))
# df['verdict']

Concatenate the dataframe from the CountVectorizer for the statement column.

In [37]:
# tfidf_df = pd.concat([df['verdict'], tfidf_df], axis=1)
# tfidf_df.to_csv('tfidf_df.csv', index = False)

In [38]:
df_vector = pd.concat([df, tfidf_df], axis = 1)
df_vector

Unnamed: 0,verdict,statement_originator,statement,statement_source,factchecker,aarp,abandon,abbott,ability,able,...,young,young people,younger,youre,youre go,youth,youve,zero,zika,zone
0,true,0.026618,john mccain oppose bankruptcy protection famil...,9,8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,false,0.000567,bennie thompson actively cheerled riot,11,554,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,mostlytrue,0.000520,say maggie hassan state day month,6,102,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,false,0.033899,bust cdc inflate covid number accuse violate f...,2,351,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,halftrue,0.000473,republican candidate actually reduce size gove...,11,332,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21146,mostlyfalse,0.044915,say large trade deficit japan stem send millio...,9,252,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21147,false,0.000804,ten thousand people leave new york week,8,234,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21148,mostlyfalse,0.000851,fought share value ideologue partisan,6,122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21149,false,0.033899,germany halt covid vaccine say unsafe longer r...,2,99,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
df_vector = df_vector.drop(columns=['statement'])
df_vector

Unnamed: 0,verdict,statement_originator,statement_source,factchecker,aarp,abandon,abbott,ability,able,abolish,...,young,young people,younger,youre,youre go,youth,youve,zero,zika,zone
0,true,0.026618,9,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,false,0.000567,11,554,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,mostlytrue,0.000520,6,102,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,false,0.033899,2,351,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,halftrue,0.000473,11,332,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21146,mostlyfalse,0.044915,9,252,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21147,false,0.000804,8,234,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21148,mostlyfalse,0.000851,6,122,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21149,false,0.033899,2,99,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
df['factchecker'].nunique()

563

In [41]:
df_vector.to_csv('data_preprocessed.csv', index = False)

In [49]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

scaler = MinMaxScaler()

X = df_vector.drop(columns=['verdict'])
y = df_vector['verdict']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [50]:
X_train = pd.DataFrame(X_scaled, columns=X.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=['verdict'])
y_test = pd.DataFrame(y_test, columns=['verdict'])

In [51]:
X_train.to_csv('X_train.csv', index = False)
X_test.to_csv('X_test.csv', index = False)
y_train.to_csv('y_train.csv', index = False)
y_test.to_csv('y_test.csv', index = False)

In [None]:
# from sklearn.model_selection import train_test_split

# X = df.drop(columns=['verdict'])  # Features
# y = df['verdict']  # Target variable
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# print("X_train shape:", X_train.shape)
# print("X_test shape:", X_test.shape)
# print("y_train shape:", y_train.shape)
# print("y_test shape:", y_test.shape)