Subtask B: Verifiable factual claims detection: Given a tweet, predict whether it contains a verifiable factual claim. This is a binary task with two labels: Yes and No. This is a classification task

check this: 
- https://github.com/avirup88/Binary-Classification-using-N-Gram-Model-on-Text-Data
- https://stackoverflow.com/questions/48003907/how-to-train-naive-bayes-classifier-for-n-gram-movie-reviews

In [208]:
import pandas as pd
import numpy as np
import re
from scipy.sparse import csr_matrix
from collections import Counter
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier


## Preprocess

In [209]:
stop_words = set(stopwords.words("english"))

In [210]:
df = pd.read_csv("data/1b/CT22_english_1B_claim_train.tsv", sep='\t')

In [211]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3324 entries, 0 to 3323
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   topic        3324 non-null   object 
 1   tweet_id     3324 non-null   float64
 2   tweet_url    3324 non-null   object 
 3   tweet_text   3324 non-null   object 
 4   class_label  3324 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 130.0+ KB


In [212]:
df.isnull().sum()

topic          0
tweet_id       0
tweet_url      0
tweet_text     0
class_label    0
dtype: int64

In [213]:
df

Unnamed: 0,topic,tweet_id,tweet_url,tweet_text,class_label
0,COVID-19,1.359351e+18,http://twitter.com/user/status/135935094335617...,"India's gift of 100,000 COVID-19 vaccines arri...",1
1,COVID-19,1.350166e+18,http://twitter.com/user/status/135016568806166...,Here’s what I’m doing while I wait my turn for...,0
2,COVID-19,1.369750e+18,http://twitter.com/user/status/136974953915491...,"This afternoon, I’m hosting an event with the ...",0
3,COVID-19,1.350165e+18,http://twitter.com/user/status/135016499568693...,"Help shops like mine stay open. Mask up, avoid...",0
4,COVID-19,1.370008e+18,http://twitter.com/user/status/137000807648978...,As part of the ongoing nationwide vaccination ...,1
...,...,...,...,...,...
3319,COVID-19,1.260271e+18,http://twitter.com/user/status/126027105220003...,"Some pearl of Wisdom by your fuckeer ""Bill Gat...",1
3320,COVID-19,1.256023e+18,http://twitter.com/user/status/125602347058187...,Top infectious disease expert Dr. Anthony Fauc...,1
3321,COVID-19,1.256158e+18,http://twitter.com/user/status/125615803982614...,Everyone is looking at Bill Gates for the Coro...,1
3322,COVID-19,1.233524e+18,http://twitter.com/user/status/123352378449100...,The president’s first instinct in response to ...,1


In [214]:
df["class_label"].value_counts()

1    2122
0    1202
Name: class_label, dtype: int64

In [215]:
class_label = df["class_label"].values
class_label.shape

(3324,)

In [216]:
tweet_txt = df["tweet_text"].values
tweet_txt.shape

(3324,)

In [217]:
df1 = pd.DataFrame(tweet_txt)
df1 = df1.rename(columns={0:'tweet'})

In [218]:
df2 = pd.DataFrame(class_label)
df2 = df2.rename(columns={0:'label'})
new_df = pd.concat([df1, df2], axis=1)

In [219]:
new_df.head()

Unnamed: 0,tweet,label
0,"India's gift of 100,000 COVID-19 vaccines arri...",1
1,Here’s what I’m doing while I wait my turn for...,0
2,"This afternoon, I’m hosting an event with the ...",0
3,"Help shops like mine stay open. Mask up, avoid...",0
4,As part of the ongoing nationwide vaccination ...,1


## Preprocessing

In [220]:
def preprocess(article):
    doc_list = []
    for art in article:
        art = re.sub('[^A-Za-z0-9]+', ' ', art.lower())
        content_tokens = word_tokenize(art)
        doc = [word for word in  content_tokens if not word.lower() in stop_words]
        doc = [n for n in doc if not n.isdigit()]
        doc_list.append(" ".join(doc))
        
    return doc_list


In [221]:
new_df['tweet'] = preprocess(new_df['tweet'])

In [None]:
new_df.head()

## Ngrams

In [197]:
def getNgrams(words, n = 2):
    
    ngram_vocab = ngrams(words.split(), n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])
    return my_dict

## Naive bayes classifier

In [204]:
resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_df[new_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_df[new_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
        
    trainset = trueList[:800] + falseList[:800]
    #testset = trueList[800:] + falseList[800:]
        
    classifier = NaiveBayesClassifier.train(trainset)
    
    resultDict[n] = classifier
    # accuracy = nltk.classify.util.accuracy(classifier, testset)
    # print(str(n)+ '-gram accuracy:', accuracy)

## Preprocess of test data

In [206]:
test_df = pd.read_csv("data/1b/CT22_english_1B_claim_dev_test.tsv", sep='\t')

In [207]:
test_df

Unnamed: 0,topic,tweet_id,tweet_url,tweet_text,class_label
0,COVID-19,1368273275181207552,http://twitter.com/user/status/136827327518120...,"The Senate just passed COVID relief. ✔️ $1,4...",1
1,COVID-19,1368830691300020225,http://twitter.com/user/status/136883069130002...,All seven COVID-19 vaccines that have complete...,1
2,COVID-19,1367771405647933444,http://twitter.com/user/status/136777140564793...,In India the vaccination program is progressin...,1
3,COVID-19,1354211785293746177,http://twitter.com/user/status/135421178529374...,Vaccines are here. But it's important that we ...,1
4,COVID-19,1359553863590678533,http://twitter.com/user/status/135955386359067...,BREAKING @ians_india: Canadian Prime Minister ...,0
...,...,...,...,...,...
906,COVID-19,1284918427228893184,http://twitter.com/user/status/128491842722889...,Moderna’s experimental coronavirus vaccine mov...,1
907,COVID-19,1286604745818349568,http://twitter.com/user/status/128660474581834...,Morning - I drink gaumutra Lunch- I have Patan...,0
908,COVID-19,1297055497984421888,http://twitter.com/user/status/129705549798442...,Somehow I feel that this high level academic d...,0
909,COVID-19,1236601634454528000,http://twitter.com/user/status/123660163445452...,"Not just Manila, pero pwede bang buong ncr l, ...",0


In [222]:
class_label_test = test_df["class_label"].values
tweet_txt_test = test_df["tweet_text"].values


In [223]:
df1_test = pd.DataFrame(tweet_txt_test)
df1_test = df1_test.rename(columns={0:'tweet'})

In [225]:
df2_test = pd.DataFrame(class_label_test)
df2_test = df2_test.rename(columns={0:'label'})
new_test_df = pd.concat([df1_test, df2_test], axis=1)
new_test_df.head()

Unnamed: 0,tweet,label
0,"The Senate just passed COVID relief. ✔️ $1,4...",1
1,All seven COVID-19 vaccines that have complete...,1
2,In India the vaccination program is progressin...,1
3,Vaccines are here. But it's important that we ...,1
4,BREAKING @ians_india: Canadian Prime Minister ...,0


In [226]:
new_test_df['tweet'] = preprocess(new_test_df['tweet'])

In [232]:
test_resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_test_df[new_test_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_test_df[new_test_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
        
    testset = trueList+ falseList
    
    test_resultDict[n] = testset

## Benchmarking

In [231]:
for n in test_resultDict:
    classifier = resultDict[n]
    testset = test_resultDict[n]
    accuracy = nltk.classify.util.accuracy(classifier, testset)
    print(str(n)+ '-gram accuracy:', accuracy)

1-gram accuracy: 0.6849615806805708
2-gram accuracy: 0.6421514818880352
3-gram accuracy: 0.5971459934138309
4-gram accuracy: 0.6114160263446762
5-gram accuracy: 0.6300768386388584
