Subtask B: Verifiable factual claims detection: Given a tweet, predict whether it contains a verifiable factual claim. This is a binary task with two labels: Yes and No. This is a classification task

check this: 
- https://github.com/avirup88/Binary-Classification-using-N-Gram-Model-on-Text-Data
- https://stackoverflow.com/questions/48003907/how-to-train-naive-bayes-classifier-for-n-gram-movie-reviews

In [None]:
import pandas as pd
import re
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier


## Preprocess

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
df = pd.read_csv("data/1b/CT22_english_1B_claim_train.tsv", sep='\t')

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df

In [None]:
df["class_label"].value_counts()

In [None]:
true_df = df.loc[df['class_label'] == 1]
true_df_1000 = true_df[:1000]

In [None]:
false_df = df.loc[df['class_label'] == 0]
false_df_1000 = true_df[:1000]

In [None]:
frames = [true_df_1000, false_df_1000]
df_v2 = pd.concat(frames)
df_v2

In [None]:
class_label = df["class_label"].values
#class_label = df_v2["class_label"].values
class_label.shape

In [None]:
tweet_txt = df["tweet_text"].values
#tweet_txt = df_v2["tweet_text"].values
tweet_txt.shape

In [None]:
df1 = pd.DataFrame(tweet_txt)
df1 = df1.rename(columns={0:'tweet'})

In [None]:
df2 = pd.DataFrame(class_label)
df2 = df2.rename(columns={0:'label'})
new_df = pd.concat([df1, df2], axis=1)

In [None]:
new_df.head()

## Preprocessing

In [None]:
def preprocess(article):
    doc_list = []
    for art in article:
        art = re.sub('[^A-Za-z0-9]+', ' ', art.lower())
        content_tokens = word_tokenize(art)
        doc = [word for word in  content_tokens if not word.lower() in stop_words]
        doc = [n for n in doc if not n.isdigit()]
        doc_list.append(" ".join(doc))
        
    return doc_list


In [None]:
new_df['tweet'] = preprocess(new_df['tweet'])

In [None]:
new_df.head()

## Ngrams

In [None]:
eng_words = pd.read_csv("words.txt", sep=" ")

In [None]:
blacklisted = ["http", "https", "co", "twitter", "com"]

In [None]:
# def getNgrams(words, n = 2):
#     wordList = []
#     for i in words.split():
#         if i in eng_words.values:
#             wordList.append(i)
#     ngram_vocab = ngrams(wordList, n)
#     my_dict = dict([(ng, True) for ng in ngram_vocab])  
#     return my_dict  
  
    
# def getNgrams(words, n = 2):
#     ngram_vocab = ngrams(words.split(), n)
#     my_dict = dict([(ng, True) for ng in ngram_vocab])  
#     return my_dict  
  
  
def getNgrams(words, n = 2):
    wordList = []
    for i in words.split():
        if i not in blacklisted:
            wordList.append(i)
    ngram_vocab = ngrams(wordList, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])  
    return my_dict  

## Naive bayes classifier

In [None]:
resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_df[new_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_df[new_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
    
    trainset = trueList + falseList
        
    classifier = NaiveBayesClassifier.train(trainset)
    
    resultDict[n] = classifier

In [None]:
new_df

## Preprocess of test data

In [None]:
test_df = pd.read_csv("data/1b/CT22_english_1B_claim_dev_test.tsv", sep='\t')

In [None]:
test_df

In [None]:
class_label_test = test_df["class_label"].values
tweet_txt_test = test_df["tweet_text"].values


In [None]:
df1_test = pd.DataFrame(tweet_txt_test)
df1_test = df1_test.rename(columns={0:'tweet'})

In [None]:
df2_test = pd.DataFrame(class_label_test)
df2_test = df2_test.rename(columns={0:'label'})
new_test_df = pd.concat([df1_test, df2_test], axis=1)
new_test_df.head()

In [None]:
new_test_df['tweet'] = preprocess(new_test_df['tweet'])

In [None]:
test_resultDict = {}

for n in [1,2,3,4,5]:
    trueList = []
    falseList = []
    for line in new_test_df[new_test_df.label == 1].tweet:
        trueList.append((getNgrams(line, n), 'true'))
    for line in new_test_df[new_test_df.label == 0].tweet:
        falseList.append((getNgrams(line, n), 'false'))
        
    testset = trueList + falseList
    
    test_resultDict[n] = testset

## Benchmarking

In [None]:
for n in test_resultDict:
    classifier = resultDict[n]
    testset = test_resultDict[n]
    accuracy = nltk.classify.util.accuracy(classifier, testset)
    print(str(n)+ '-gram accuracy:', accuracy)

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import CountVectorizer

clf = ComplementNB()
X = new_df.drop(["label"], axis = 1)
Y = new_df["label"]

X_dict = {}
for n in [1,2,3,4,5]:
    X_List = []
    for x in X.values:
        X_List.append(str(getNgrams(x[0], n)))
    X_dict[n] = X_List
    
asdf = pd.DataFrame.from_dict(X_dict)
ayo = new_df.join(asdf, how="left")

docs = ayo[1].values

vec = CountVectorizer()
X1 = vec.fit_transform(docs)

df1 = pd.DataFrame(X1.toarray(), columns=vec.get_feature_names())

clf.fit(df1, Y)
print(clf.predict(df1))


In [None]:

X_test = new_test_df.drop(["label"], axis = 1)
Y_test = new_test_df["label"]

X_test_dict = {}
for n in [1,2,3,4,5]:
    X_List = []
    for x in X.values:
        X_List.append(str(getNgrams(x[0], n)))
    X_test_dict[n] = X_List
    
asdf2 = pd.DataFrame.from_dict(X_test_dict)
ayo2 = new_test_df.join(asdf2, how="left")

docs2 = ayo2[1].values
X1_test = vec.fit_transform(docs2)

df1_test = pd.DataFrame(X1_test.toarray(), columns=vec.get_feature_names())

clf.score(df1_test, Y_test)

In [415]:
df1_test.shape

(911, 6215)

In [416]:
df1.shape

(3324, 15162)