# Data and Libraries import

In [1]:
from google.colab import drive, output
import pandas as pd
import spacy

drive.mount('/content/drive', force_remount=True)

%pwd
%cd drive/MyDrive/BERT_NLP/

# DATASET_PATH = '/content/drive/My Drive/'

Mounted at /content/drive
/content/drive/MyDrive/BERT_NLP


In [4]:
# @title Read Data

from sklearn.utils import shuffle

tweets_df = pd.read_csv('datasets/100_100_argument_dataset.csv', ',', header=0)

# cols = tweets_df[['Tweet', 'argument_score']]

# data = cols.copy()
# data.head()

cols = tweets_df[['Correctly Ordered Tweet', 'Argument Score']]
data = cols.copy()
data = shuffle(shuffle(data))

data.rename(columns={'Correctly Ordered Tweet':'Tweet', 'Argument Score':'argument_score'}, inplace=True)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Tweet,argument_score
155,News gets around fast at the allotments. I'm s...,1
36,Democrat Diane Feinstein leans toward acquitti...,0
89,CDC abruptly postpones coronavirus press brief...,0
60,Anybody also attending the Chromatica Ball ope...,0
83,"Joaquin Phoenix wins best actor for ""Joker"" at...",0


# Data (Pre) Processing

In [5]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

In [6]:
doc = nlp("Its Super Bowl week, and Trump Hashtagsis using a prevent defense. The strategy sometimes backfires iN football. It may fail for Trump, too. ,")

# # # n = [token.sents for token in doc]
# # # n
# for sent in doc.sents:
#   print(sent.text)

for token in doc:
  print(f"{token} {token.pos_} {token.tag_}")

Its DET PRP$
Super PROPN NNP
Bowl PROPN NNP
week NOUN NN
, PUNCT ,
and CCONJ CC
Trump PROPN NNP
Hashtagsis PROPN NNP
using VERB VBG
a DET DT
prevent NOUN NN
defense NOUN NN
. PUNCT .
The DET DT
strategy NOUN NN
sometimes ADV RB
backfires VERB VBZ
iN ADP IN
football NOUN NN
. PUNCT .
It PRON PRP
may VERB MD
fail VERB VB
for ADP IN
Trump PROPN NNP
, PUNCT ,
too ADV RB
. PUNCT .
, PUNCT ,


In [9]:
from spacy import displacy

In [16]:
displacy.render(doc, style='dep', jupyter=True, options={'compact':True})

In [61]:
# @title Remove Links and Hashtags

import re
import string

def removeLinks(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def cleanAtTheRateAndHashtags(text):
    junk = ['\u2066','@', '#', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    words = text.split()
    res =[]
    for word in words:
      # print(word[0])
      if word[0] not in junk:
        res.append(word)
    return " ".join(res)

def deEmojify(text):
    "function to remove emojis from text"
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return regrex_pattern.sub(r'',text)

def cleanseText(text):
    text = removeLinks(text)
    # print(text)
    txt = cleanAtTheRateAndHashtags(text)
    # re.sub(r'[^\x00-\x7F]+','', text)
    # print(txt)
    txt = deEmojify(txt)
    return txt.encode('ascii', 'ignore').decode('ascii')


# clean df
data['Tweet'] = data['Tweet'].apply(lambda tweet: cleanseText(tweet))

data.head()

Unnamed: 0,Tweet,argument_score
131,Happy Birthday to the legend that is Bobby Har...,1
161,I think we can agree Dr Fauci is a great asset...,1
9,some big news!!!!!,0
77,BREAKING: McConnell tells GOP senators he does...,0
166,With bad comes good. Skin check today shows st...,1


In [62]:
# @title Prepare Bag of Words

BAG_OF_WORDS = [] #Bag of Words List
LIST_OF_IMP_PoS = ['CONJ', 'CCONJ', 'SCONJ', 'VERB', 'ADV']

def prepare_bow(tweets):
  
  for tweet in tweets:
    docs = nlp(tweet)
    
    for token in docs:
      if token.pos_ in LIST_OF_IMP_PoS:
        BAG_OF_WORDS.append(token.lemma_)

prepare_bow(data['Tweet'])

print(f'Total Number of Words in the Bag {len(BAG_OF_WORDS)}')


Total Number of Words in the Bag 1492


In [63]:
# @title Prepare Vector using BoWs Function

def get_bow_vector(text):
  vector = []
  lemmatized_text = [token.lemma_ for token in nlp(text)]
  for word in BAG_OF_WORDS:
    if word in lemmatized_text:
      vector.append(1)
    else:
      vector.append(0)
  return vector


# Model ETC Stuff

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [65]:
# import torch
from sklearn import svm

In [66]:
train_x_vectors = [get_bow_vector(text) for text in data['Tweet']]
train_y = data['argument_score']
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)


# while True:
#   output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

SVC(kernel='linear')

# Testing Now

In [67]:
# @title Prepare Test Set

preds_list = []
test_data = [['Trump, signing the $8.3B coronavirus bill w/Azar standing over him, was asked why he canceled his trip to CDC.  Azar quickly interjected that Trump had sent him to CDC instead.  Trump then clarified that the trip was scuttled b/c CDC was concerned an official there had the virus.', 0],
             ['Garfield from @StrayCatsWM has had his leg amputation surgery today. He’s recovering well and we’re confident that he’ll manage just fine on three legs 💙🐱 #CatsOfTwitter https://t.co/0ujCkeDiyv', 1],
             ['Adrienne Posta is 71 today, Happy Birthday Adrienne 🎂🎉 https://t.co/NACZgxouV9', 0],
             ['So I feel really hot and sweaty and my heads thumping .... found this mosquito bite on my leg this morning ... 🦟😭 https://t.co/1qertDJYdY', 0],
             ["The Press Secretary says that what President Trump “cares about is taxpayer money” and making sure “there is no corruption tied to it.”  That doesn't add up. Trump has spent more than $100 million in taxpayer money to golf at courses he still owns and profits from.", 1],
             ['David Jason is 80 today, Happy Birthday David 🎂🎉 https://t.co/iPgaLz5rzp', 0],
             ['I know #Coronavirus is spreading outside of China, but total active cases are still falling.  Active cases are the only ones that can spread the virus.  Today 43,293 vs 44,314 yesterday.  The press keeps reporting total cases, and deaths, but not recoveries. https://t.co/hqYmqxV7ji', 1],
             ['Suicide is killing more people than Coronavirus. How about we all stock up on smiling at strangers instead?', 1],
             ['There are about 10,000 doctor vacancies and 43,000 nurse vacancies in the NHS.  This is not a problem that will be solved in the short term. The Government have left the population exposed to unnecessary risk. They have no one to blame but themselves.  https://t.co/VTyCaEMp3T', 1],
             ['NBA team doctors and trainers will have a conference call on Monday morning to discuss next steps with coronavirus, league sources tell ESPN.', 0],
             ['Not just anyone can pull off these bad boys, but I kinda feel  I can 😉 https://t.co/CsnBZxyPbj', 0],
             ['America’s Greatest President is trending.  Glad people are remembering and honoring President Obama.', 0],
             ['Fearful their tax cuts are in jeopardy, wealthy Koch donors plan to spend big to hold GOP Senate Majority in 2020 --  https://t.co/9bniHAjL34', 1],
             ['The reaction to the Joe Rogan endorsement is a perfect encapsulation of why progressives will probably lose this election.', 1],
             ['The nonchalance with which people Are like “corona only kills old people”makes me realize how much some of you truly hate your parents.',0],
             ['Corona virus is basically the flu. Is every flu season a pandemic?  There are literally millions and millions of active flu cases world wide right now.  It’s flu season for God sake. Isn’t the reaction just a little over the top?', 1],
             ["Birthday girl🥰❤️  @jeorella__ https://t.co/qtldCSJPfr", 0],
             ["Trump thinks COVID-19 is causing the stock market to drop.  It’s not.   What’s causing it to drop is that there’s a difficult situation (the virus) that needs to be dealt with and the person in charge has the impulse control of a 5-year-old.", 1],
             ["this is the BEST set of info I’ve found on corona virus and it’s worth watching the full 10mins https://t.co/oTNt5Cxg8j", 0],
             ['So Dr. Fauci not only admits honestly that we are failing on testing, but doing worse than other countries.  Ugh. Not the behavior usually associated with a "superpower." https://t.co/sVbeLXblLF', 1],
             ["Coronavirus? Who’s afraid? https://t.co/nNIxVwwGLB", 0],
             ["Sen. Dianne Feinstein Leans Toward Voting for Trump Acquittal https://t.co/Ty2KAbO16s", 0],
             ["Proud to be Somali. Proud to be African. Proud to be British. https://t.co/VY41Kz9y4j", 0],
             ["Get ready for even more surprise reveals, steamy romances, and heartwarming moments from #YR, because we've got big news! 🎉 The Young and the Restless has been renewed through 2024. Find out more here: https://t.co/Wo3BXnlXg2 https://t.co/TdWKhy1itI", 1],
             ]
pred_sentences = [cleanseText(text) for text,arg in test_data]
# pred_sentences

In [68]:
test_x_vectors = [get_bow_vector(text) for text in pred_sentences]

predictions = clf_svm.predict(test_x_vectors)

for i in range(len(pred_sentences)):
  test_data[i].append(predictions[i])

In [69]:
df = pd.DataFrame(test_data)

df.columns = ['Tweet', 'True Label', 'Predicted Label']
df

Unnamed: 0,Tweet,True Label,Predicted Label
0,"Trump, signing the $8.3B coronavirus bill w/Az...",0,1
1,Garfield from @StrayCatsWM has had his leg amp...,1,1
2,"Adrienne Posta is 71 today, Happy Birthday Adr...",0,0
3,So I feel really hot and sweaty and my heads t...,0,1
4,The Press Secretary says that what President T...,1,1
5,"David Jason is 80 today, Happy Birthday David ...",0,0
6,I know #Coronavirus is spreading outside of Ch...,1,1
7,Suicide is killing more people than Coronaviru...,1,0
8,"There are about 10,000 doctor vacancies and 43...",1,1
9,NBA team doctors and trainers will have a conf...,0,0


In [70]:
from sklearn.metrics import classification_report, accuracy_score

In [71]:
acc = accuracy_score(df['True Label'], df['Predicted Label'])

print(f'Accuracy --> {acc:1.2f}')

Accuracy --> 0.88
