Subtask B: Verifiable factual claims detection: Given a tweet, predict whether it contains a verifiable factual claim. This is a binary task with two labels: Yes and No. This is a classification task

check this: 
- https://github.com/avirup88/Binary-Classification-using-N-Gram-Model-on-Text-Data
- https://stackoverflow.com/questions/48003907/how-to-train-naive-bayes-classifier-for-n-gram-movie-reviews

In [1]:
import pandas as pd
import re
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier


## Preprocess

In [2]:
stop_words = set(stopwords.words("english"))

In [3]:
df = pd.read_csv("data/1b/CT22_english_1B_claim_train.tsv", sep='\t')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3324 entries, 0 to 3323
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   topic        3324 non-null   object 
 1   tweet_id     3324 non-null   float64
 2   tweet_url    3324 non-null   object 
 3   tweet_text   3324 non-null   object 
 4   class_label  3324 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 130.0+ KB


In [5]:
df.isnull().sum()

topic          0
tweet_id       0
tweet_url      0
tweet_text     0
class_label    0
dtype: int64

In [6]:
df

Unnamed: 0,topic,tweet_id,tweet_url,tweet_text,class_label
0,COVID-19,1.359351e+18,http://twitter.com/user/status/135935094335617...,"India's gift of 100,000 COVID-19 vaccines arri...",1
1,COVID-19,1.350166e+18,http://twitter.com/user/status/135016568806166...,Here’s what I’m doing while I wait my turn for...,0
2,COVID-19,1.369750e+18,http://twitter.com/user/status/136974953915491...,"This afternoon, I’m hosting an event with the ...",0
3,COVID-19,1.350165e+18,http://twitter.com/user/status/135016499568693...,"Help shops like mine stay open. Mask up, avoid...",0
4,COVID-19,1.370008e+18,http://twitter.com/user/status/137000807648978...,As part of the ongoing nationwide vaccination ...,1
...,...,...,...,...,...
3319,COVID-19,1.260271e+18,http://twitter.com/user/status/126027105220003...,"Some pearl of Wisdom by your fuckeer ""Bill Gat...",1
3320,COVID-19,1.256023e+18,http://twitter.com/user/status/125602347058187...,Top infectious disease expert Dr. Anthony Fauc...,1
3321,COVID-19,1.256158e+18,http://twitter.com/user/status/125615803982614...,Everyone is looking at Bill Gates for the Coro...,1
3322,COVID-19,1.233524e+18,http://twitter.com/user/status/123352378449100...,The president’s first instinct in response to ...,1


In [7]:
df["class_label"].value_counts()

1    2122
0    1202
Name: class_label, dtype: int64

In [8]:
true_df = df.loc[df['class_label'] == 1]
true_df_1000 = true_df[:1000]

In [9]:
false_df = df.loc[df['class_label'] == 0]
false_df_1000 = true_df[:1000]

In [10]:
frames = [true_df_1000, false_df_1000]
df_v2 = pd.concat(frames)
df_v2

Unnamed: 0,topic,tweet_id,tweet_url,tweet_text,class_label
0,COVID-19,1.359351e+18,http://twitter.com/user/status/135935094335617...,"India's gift of 100,000 COVID-19 vaccines arri...",1
4,COVID-19,1.370008e+18,http://twitter.com/user/status/137000807648978...,As part of the ongoing nationwide vaccination ...,1
5,COVID-19,1.367727e+18,http://twitter.com/user/status/136772673004420...,"Pleased to receive 50,000 doses of Covid-19 va...",1
7,COVID-19,1.369888e+18,http://twitter.com/user/status/136988848341471...,Four former presidents have banded together fo...,1
12,COVID-19,1.368585e+18,http://twitter.com/user/status/136858518750069...,WSJ: All three of Russia's main intelligence s...,1
...,...,...,...,...,...
1993,COVID-19,1.367308e+18,http://twitter.com/user/status/136730832289060...,“Proof-of-residency and ID requirements are su...,1
1994,COVID-19,1.367733e+18,http://twitter.com/user/status/136773301098089...,Pleased to address Finance Ministers at the Fi...,1
1995,COVID-19,1.369665e+18,http://twitter.com/user/status/136966502764132...,We know we can't beat COVID without equitable ...,1
1996,COVID-19,1.369328e+18,http://twitter.com/user/status/136932838111283...,The QUAD Leaders will discuss ongoing efforts ...,1


In [11]:
class_label = df["class_label"].values
#class_label = df_v2["class_label"].values
class_label.shape

(3324,)

In [12]:
tweet_txt = df["tweet_text"].values
#tweet_txt = df_v2["tweet_text"].values
tweet_txt.shape

(3324,)

In [13]:
df1 = pd.DataFrame(tweet_txt)
df1 = df1.rename(columns={0:'tweet'})

In [14]:
df2 = pd.DataFrame(class_label)
df2 = df2.rename(columns={0:'label'})
new_df = pd.concat([df1, df2], axis=1)

In [15]:
new_df.head()

Unnamed: 0,tweet,label
0,"India's gift of 100,000 COVID-19 vaccines arri...",1
1,Here’s what I’m doing while I wait my turn for...,0
2,"This afternoon, I’m hosting an event with the ...",0
3,"Help shops like mine stay open. Mask up, avoid...",0
4,As part of the ongoing nationwide vaccination ...,1


## Preprocessing

In [16]:
def preprocess(article):
    doc_list = []
    for art in article:
        art = re.sub('[^A-Za-z0-9]+', ' ', art.lower())
        content_tokens = word_tokenize(art)
        doc = [word for word in  content_tokens if not word.lower() in stop_words]
        doc = [n for n in doc if not n.isdigit()]
        doc_list.append(" ".join(doc))
        
    return doc_list


In [17]:
new_df['tweet'] = preprocess(new_df['tweet'])

In [18]:
new_df.head()

Unnamed: 0,tweet,label
0,india gift covid vaccines arrived barbados ear...,1
1,wait turn covid vaccines masking staying least...,0
2,afternoon hosting event ceos johnson amp johns...,0
3,help shops like mine stay open mask avoid crow...,0
4,part ongoing nationwide vaccination rollout se...,1


## Ngrams

In [19]:
eng_words = pd.read_csv("words.txt", sep=" ")

In [20]:
blacklisted = ["http", "https", "co", "twitter", "com"]

In [41]:
def getNgrams(words, n = 2):
    wordList = []
    for i in words.split():
        if i in eng_words.values:
            wordList.append(i)
    ngram_vocab = ngrams(wordList, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])  
    return my_dict  
  
    
# def getNgrams(words, n = 2):
#     ngram_vocab = ngrams(words.split(), n)
#     my_dict = dict([(ng, True) for ng in ngram_vocab])  
#     return my_dict  
  
  
# def getNgrams(words, n = 2):
#     wordList = []
#     for i in words.split():
#         if i not in blacklisted:
#             wordList.append(i)
#     ngram_vocab = ngrams(wordList, n)
#     my_dict = dict([(ng, True) for ng in ngram_vocab])  
#     return my_dict  

## Naive bayes classifier

In [22]:
resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_df[new_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_df[new_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
    
    trainset = trueList + falseList
        
    classifier = NaiveBayesClassifier.train(trainset)
    
    resultDict[n] = classifier

In [23]:
new_df

Unnamed: 0,tweet,label
0,india gift covid vaccines arrived barbados ear...,1
1,wait turn covid vaccines masking staying least...,0
2,afternoon hosting event ceos johnson amp johns...,0
3,help shops like mine stay open mask avoid crow...,0
4,part ongoing nationwide vaccination rollout se...,1
...,...,...
3319,pearl wisdom fuckeer bill gates intentionally ...,1
3320,top infectious disease expert dr anthony fauci...,1
3321,everyone looking bill gates coronavirus vaccin...,1
3322,president first instinct response possible pan...,1


## Preprocess of test data

In [24]:
test_df = pd.read_csv("data/1b/CT22_english_1B_claim_dev_test.tsv", sep='\t')

In [25]:
test_df

Unnamed: 0,topic,tweet_id,tweet_url,tweet_text,class_label
0,COVID-19,1368273275181207552,http://twitter.com/user/status/136827327518120...,"The Senate just passed COVID relief. ✔️ $1,4...",1
1,COVID-19,1368830691300020225,http://twitter.com/user/status/136883069130002...,All seven COVID-19 vaccines that have complete...,1
2,COVID-19,1367771405647933444,http://twitter.com/user/status/136777140564793...,In India the vaccination program is progressin...,1
3,COVID-19,1354211785293746177,http://twitter.com/user/status/135421178529374...,Vaccines are here. But it's important that we ...,1
4,COVID-19,1359553863590678533,http://twitter.com/user/status/135955386359067...,BREAKING @ians_india: Canadian Prime Minister ...,0
...,...,...,...,...,...
906,COVID-19,1284918427228893184,http://twitter.com/user/status/128491842722889...,Moderna’s experimental coronavirus vaccine mov...,1
907,COVID-19,1286604745818349568,http://twitter.com/user/status/128660474581834...,Morning - I drink gaumutra Lunch- I have Patan...,0
908,COVID-19,1297055497984421888,http://twitter.com/user/status/129705549798442...,Somehow I feel that this high level academic d...,0
909,COVID-19,1236601634454528000,http://twitter.com/user/status/123660163445452...,"Not just Manila, pero pwede bang buong ncr l, ...",0


In [26]:
class_label_test = test_df["class_label"].values
tweet_txt_test = test_df["tweet_text"].values


In [27]:
df1_test = pd.DataFrame(tweet_txt_test)
df1_test = df1_test.rename(columns={0:'tweet'})

In [28]:
df2_test = pd.DataFrame(class_label_test)
df2_test = df2_test.rename(columns={0:'label'})
new_test_df = pd.concat([df1_test, df2_test], axis=1)
new_test_df.head()

Unnamed: 0,tweet,label
0,"The Senate just passed COVID relief. ✔️ $1,4...",1
1,All seven COVID-19 vaccines that have complete...,1
2,In India the vaccination program is progressin...,1
3,Vaccines are here. But it's important that we ...,1
4,BREAKING @ians_india: Canadian Prime Minister ...,0


In [29]:
new_test_df['tweet'] = preprocess(new_test_df['tweet'])

In [30]:
test_resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_test_df[new_test_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_test_df[new_test_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
        
    testset = trueList + falseList
    
    test_resultDict[n] = testset

## Benchmarking

In [31]:
for n in test_resultDict:
    classifier = resultDict[n]
    testset = test_resultDict[n]
    accuracy = nltk.classify.util.accuracy(classifier, testset)
    print(str(n)+ '-gram accuracy:', accuracy)

1-gram accuracy: 0.7332601536772777
2-gram accuracy: 0.6564215148188803
3-gram accuracy: 0.6322722283205269
4-gram accuracy: 0.6223929747530187
5-gram accuracy: 0.6311745334796927


In [42]:
from sklearn.naive_bayes import ComplementNB #0.4972557628979144
from sklearn.naive_bayes import MultinomialNB #0.5016465422612514
from sklearn.naive_bayes import BernoulliNB #0.5082327113062569
from sklearn.naive_bayes import GaussianNB #0.47530186608122943
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import WordNetLemmatizer 
import spacy

In [43]:
#lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [44]:

#token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#cv = CountVectorizer(stop_words="english", ngram_range=(1,2), tokenizer=token.tokenize, max_features=3000)

clf = ComplementNB()
X = new_df.drop(["label"], axis = 1)
Y = new_df["label"]

X_dict = {}
for n in [1,2,3,4,5]:
    X_List = []
    for x in X.values:
        #word_list = nltk.word_tokenize(x[0])
        #lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
        doc = nlp(x[0])
        lemmatized_output = " ".join([token.lemma_ for token in doc])
        X_List.append(str(getNgrams(lemmatized_output, n)))
    X_dict[n] = X_List
    
asdf = pd.DataFrame.from_dict(X_dict)
ayo = new_df.join(asdf, how="left")

docs = ayo[1].values

vec = CountVectorizer()
vec_fit = vec.fit(docs)
X1 = vec_fit.transform(docs)

df1 = pd.DataFrame(X1.toarray(), columns=vec.get_feature_names())

clf.fit(df1, Y)




ComplementNB()

In [45]:

X_test = new_test_df.drop(["label"], axis = 1)
Y_test = new_test_df["label"]

X_test_dict = {}
for n in [1,2,3,4,5]:
    X_List = []
    for x in X.values:
        X_List.append(str(getNgrams(x[0], n)))
    X_test_dict[n] = X_List
    
asdf2 = pd.DataFrame.from_dict(X_test_dict)
ayo2 = new_test_df.join(asdf2, how="left")

docs2 = ayo2[1].values
X1_test = vec_fit.transform(docs2)

df1_test = pd.DataFrame(X1_test.toarray(), columns=vec.get_feature_names())

clf.score(df1_test, Y_test)



0.49396267837541163