In [1]:
import pandas as pd
import nltk
from nltk.probability import *
from itertools import chain

In [2]:
# Path for liar data
base_path = "../data/liar/"

# Load the liar files
train_df = pd.read_csv(f"{base_path}train.tsv", sep="\t", header=None)
val_df   = pd.read_csv(f"{base_path}valid.tsv", sep="\t", header=None)
test_df  = pd.read_csv(f"{base_path}test.tsv", sep="\t", header=None)

In [3]:
# Rename columns
train_df = train_df.rename(columns={0: "ID", 1: "Label", 2: "Statement", 3: "Subject", 4: "Speaker", 5: "Title", 6: "State", 7: "Party", 8: "True Counts", 9: "Flase Counts", 10: "Half True Counts", 11: "Mostly True Counts", 12: "Pants On Fire Counts", 13: "Context"})
val_df = val_df.rename(columns={0: "ID", 1: "Label", 2: "Statement", 3: "Subject", 4: "Speaker", 5: "Title", 6: "State", 7: "Party", 8: "True Counts", 9: "Flase Counts", 10: "Half True Counts", 11: "Mostly True Counts", 12: "Pants On Fire Counts", 13: "Context"})
test_df = test_df.rename(columns={0: "ID", 1: "Label", 2: "Statement", 3: "Subject", 4: "Speaker", 5: "Title", 6: "State", 7: "Party", 8: "True Counts", 9: "Flase Counts", 10: "Half True Counts", 11: "Mostly True Counts", 12: "Pants On Fire Counts", 13: "Context"})

In [4]:
print("train data shape:", train_df.shape)
print("validate data shape:", val_df.shape)
print("testing data shape:", test_df.shape)

train data shape: (10240, 14)
validate data shape: (1284, 14)
testing data shape: (1267, 14)


In [5]:
train_df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Title,State,Party,True Counts,Flase Counts,Half True Counts,Mostly True Counts,Pants On Fire Counts,Context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [6]:
val_df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Title,State,Party,True Counts,Flase Counts,Half True Counts,Mostly True Counts,Pants On Fire Counts,Context
0,12134.json,barely-true,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,U.S. Representative,Missouri,republican,1,0,1,0,0,an interview with ABC17 News
1,238.json,pants-fire,"When Obama was sworn into office, he DID NOT u...","obama-birth-certificate,religion",chain-email,,,none,11,43,8,5,105,
2,7891.json,false,Says Having organizations parading as being so...,"campaign-finance,congress,taxes",earl-blumenauer,U.S. representative,Oregon,democrat,0,1,1,1,0,a U.S. Ways and Means hearing
3,8169.json,half-true,Says nearly half of Oregons children are poor.,poverty,jim-francesconi,Member of the State Board of Higher Education,Oregon,none,0,1,1,1,0,an opinion article
4,929.json,half-true,On attacks by Republicans that various program...,"economy,stimulus",barack-obama,President,Illinois,democrat,70,71,160,163,9,interview with CBS News


In [7]:
test_df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Title,State,Party,True Counts,Flase Counts,Half True Counts,Mostly True Counts,Pants On Fire Counts,Context
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video


In [8]:
# Combine test and val for simplicity
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

In [10]:
df['Party'].value_counts()

Party
republican                      5665
democrat                        4137
none                            2181
organization                     264
independent                      180
newsmaker                         64
libertarian                       51
journalist                        49
activist                          45
columnist                         44
talk-show-host                    32
state-official                    24
labor-leader                      15
business-leader                   11
tea-party-member                  10
education-official                 3
green                              3
constitution-party                 3
government-body                    2
county-commissioner                2
liberal-party-canada               1
Moderate                           1
democratic-farmer-labor            1
ocean-state-tea-party-action       1
Name: count, dtype: int64

In [19]:
print("train data shape:", df.shape)

train data shape: (12791, 14)


In [21]:
def lowerStringCols(df):
    for col in df.columns:
        if df[col].dtype == "O": # If column data type is an object
            df[col] = df[col].str.lower()
    return df

In [23]:
df = lowerStringCols(df)

In [25]:
# Extract information about the review.
df_statement_data = df['Statement']

In [27]:
# Tokenize each clothing review. 
def tokenize(review):
    pattern = r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?"
    tokenizer = nltk.RegexpTokenizer(pattern) 
    tokenized_review = tokenizer.tokenize(review)
    return tokenized_review

In [29]:
# Create token lists
tokenized_df_statement_data = [tokenize(statement) for statement in df_statement_data]

In [31]:
# Remove words with a length less than 2
def removeWordsWithLessThanTwoChar(statement_list):
    
    statement_list = [
        [token for token in sublist if len(token) > 1]
        for sublist in statement_list
    ]
    
    return statement_list

In [33]:
tokenized_df_statement_data = removeWordsWithLessThanTwoChar(tokenized_df_statement_data)

In [35]:
# Create stop words list
stop_words = []

with open("stopwords-en.txt", "r", encoding="utf-8") as file:
    for word in file:
        # Remove /n from the word
        word = word.strip()
        
        stop_words.append(word)

In [37]:
# Remove stop words from statement lists
def removeStopWords(statement_list):
    
    statement_list = [
        [token for token in sublist if token not in stop_words]
        for sublist in statement_list
    ]
    
    return statement_list

In [39]:
tokenized_df_statement_data = removeStopWords(tokenized_df_statement_data)

In [41]:
tokenized_df_statement_data

[['annies',
  'list',
  'political',
  'supports',
  'third-trimester',
  'abortions',
  'demand'],
 ['decline',
  'coal',
  'start',
  'started',
  'natural',
  'gas',
  'started',
  'president',
  'george',
  'bushs',
  'administration'],
 ['hillary',
  'clinton',
  'agrees',
  'john',
  'mccain',
  'voting',
  'george',
  'bush',
  'benefit',
  'doubt',
  'iran'],
 ['health',
  'care',
  'reform',
  'legislation',
  'mandate',
  'sex',
  'change',
  'surgeries'],
 ['economic', 'turnaround', 'started', 'term'],
 ['chicago',
  'bears',
  'starting',
  'quarterbacks',
  'total',
  'tenured',
  'uw',
  'faculty',
  'fired',
  'decades'],
 ['jim', 'dunnam', 'lived', 'district', 'represents'],
 ['person',
  'stage',
  'actively',
  'passing',
  'russ',
  'feingold',
  'toughest',
  'ethics',
  'reform',
  'watergate'],
 ['oregon',
  'lottery',
  'funds',
  'port',
  'newport',
  'eventually',
  'land',
  'noaa',
  'marine',
  'operations',
  'center-pacific'],
 ['gop',
  'primary',
  'opp

In [45]:
# Remove words that are only shown once
def removeWordsWithOneAppearance(statement_data):
    words = list(chain.from_iterable(statement_data))
    
    words_frequency = FreqDist(words)
    
    one_appearance = [word for word in words if words_frequency[word] == 1]
    
    statement_data = [[token for token in sublist if word not in one_appearance] for sublist in statement_data]
    
    return statement_data

In [47]:
tokenized_df_statement_data = removeWordsWithOneAppearance(tokenized_df_statement_data)

In [48]:
# Add the tokens to their respective datasets
df['Statement'] = tokenized_df_statement_data

In [51]:
df['Statement'] = df['Statement'].apply(lambda x: ' '.join(x))

In [53]:
# Change labels to binary for model training

def changeLabels(statement_data):
    statement_data['Label'] = statement_data['Label'].apply(lambda x: 1 
                                                            if x == 'true' or x == 'mostly-true' 
                                                            or x == 'half-true' else 0)
    return statement_data

In [55]:
df = changeLabels(df)

In [57]:
df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Title,State,Party,True Counts,Flase Counts,Half True Counts,Mostly True Counts,Pants On Fire Counts,Context
0,2635.json,0,annies list political supports third-trimester...,abortion,dwayne-bohac,state representative,texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,decline coal start started natural gas started...,"energy,history,job-accomplishments",scott-surovell,state delegate,virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,1,hillary clinton agrees john mccain voting geor...,foreign-policy,barack-obama,president,illinois,democrat,70.0,71.0,160.0,163.0,9.0,denver
3,1123.json,0,health care reform legislation mandate sex cha...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,1,economic turnaround started term,"economy,jobs",charlie-crist,,florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on cnn


In [72]:
statement_with_any_null = df[df['Statement'].isnull()]

In [74]:
statement_with_any_null

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Title,State,Party,True Counts,Flase Counts,Half True Counts,Mostly True Counts,Pants On Fire Counts,Context


In [76]:
df.to_excel("tokenized_training_data.xlsx")