In [1]:
import pandas as pd
import nltk
import string
import re

### READING TRAIN DATASET

In [2]:
# Widen pandas' display limits so full tweet text is shown in previews.
for display_option in ('display.width', 'display.max_colwidth'):
    pd.set_option(display_option, 10000)

# Load the labelled training tweets and preview the first rows.
df = pd.read_csv("irmidis-2021-task2-train.csv")
df.head()

Unnamed: 0,id,tweet,label
0,1325682517148569600,Coronavirus: Some Canadians hesitant to take a COVID-19 vaccine – Global News https://t.co/pOgrnxWbGL,AntiVax
1,1325684180483600384,"Moderna on track to report late-stage COVID-19 vaccine data next month: The vaccine is currently being tested in a large human trial, and Moderna was in talks with a WHO-led group for distribution. https://t.co/CJGuw7rDzj JPost https://t.co/QPAhjvDm76",Neutral
2,1325689685943996416,The Philippine government is in talks with Anglo-Swedish drugmaker AstraZeneca to procure doses of its prospective COVID-19 vaccine. | @DYGalvezINQ https://t.co/dVTpgOZT9h,Neutral
3,1325690517724782593,Care homes to be FIRST to receive coronavirus vaccine that could roll out before Christmas https://t.co/V71edGrw7F,Neutral
4,1325697646909132800,This is why thinking that a vaccine is a panacea is a dangerous idea.\n\nYou can outsmart evolution at your own peril. https://t.co/WJXen4MfTV https://t.co/3yx5Bf9hTh,AntiVax


### Text cleaning

In [3]:
import neattext.functions as nfx

In [4]:
# Cleaning steps from neattext, applied to every training tweet in exactly
# this order; each step rewrites the 'tweet' column before the next one runs.
train_cleaning_steps = (
    nfx.remove_multiple_spaces,
    nfx.remove_hashtags,
    nfx.remove_userhandles,
    nfx.remove_emails,
    nfx.remove_emojis,
    nfx.remove_urls,
    nfx.remove_special_characters,
    nfx.remove_punctuations,
    nfx.remove_stopwords,
)
for cleaning_step in train_cleaning_steps:
    df['tweet'] = df['tweet'].apply(cleaning_step)

In [5]:
# Preview the first ten training rows after text cleaning.
df.head(10)

Unnamed: 0,id,tweet,label
0,1325682517148569600,Coronavirus Canadians hesitant COVID19 vaccine GlobalNews,AntiVax
1,1325684180483600384,Moderna track report latestage COVID19 vaccine data month vaccine currently tested large human trial Moderna talks WHOled group distribution JPost,Neutral
2,1325689685943996416,Philippine government talks AngloSwedish drugmaker AstraZeneca procure doses prospective COVID19 vaccine,Neutral
3,1325690517724782593,Care homes receive coronavirus vaccine roll beforeChristmas,Neutral
4,1325697646909132800,thinking vaccine panacea dangerous idea outsmart evolution peril,AntiVax
5,1325699765930254336,Hospitals reportedly prepped possible coronavirus vaccine roll weeks says optimistic think people vaccinated Christmas good news Watch GMB latesthttpstcofzcHkA6S4k,Neutral
6,1325720167867965440,Zambia talks Russia accessing Sputnik V coronavirus vaccine,Neutral
7,1325768441370800128,Pfizer says coronavirus vaccine 90 percent effective compared placebo major effects Study involved 44000 people,Neutral
8,1325770677580918785,QampA COVID19 vaccinerace,Neutral
9,1325770986571096064,good news Covid19 vaccine candidate 90 effective says manufacturer,ProVax


### TOKENIZATION

In [6]:
# Shared NLTK tweet tokenizer used by tokenize_text(); configured to not
# preserve case, to strip handles and to reduce lengthened words
# (see the NLTK TweetTokenizer docs for the exact semantics of each flag).
tokenizer = nltk.tokenize.TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)

def tokenize_text(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    Args:
        text: The string to split into tokens.

    Returns:
        The result of ``tokenizer.tokenize(text)``.
    """
    return tokenizer.tokenize(text)

# Replace each cleaned tweet string with its token list; the text is
# lower-cased before it is handed to tokenize_text().
df['tweet'] = df['tweet'].apply(lambda x: tokenize_text(x.lower()))

### LEMMATIZATION

In [7]:
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance used by lemm().
wn = WordNetLemmatizer()

def lemm(tokenized_text):
    """Lemmatize every token of an already-tokenized text.

    Each token is passed to ``wn.lemmatize`` on its own, without a
    part-of-speech argument, so the lemmatizer's default is used.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        A list with one lemmatized token per input token, in input order.
    """
    return list(map(wn.lemmatize, tokenized_text))

# Swap each token list for its lemmatized counterpart.
df['tweet'] = df['tweet'].apply(lemm)
df.head(10)


# Glue the tokens back into one space-separated string per tweet.
df['tweet'] = df['tweet'].apply(lambda tokens: ' '.join(str(token) for token in tokens))

df.head()

Unnamed: 0,id,tweet,label
0,1325682517148569600,coronavirus canadian hesitant covid 19 vaccine globalnews,AntiVax
1,1325684180483600384,moderna track report latestage covid 19 vaccine data month vaccine currently tested large human trial moderna talk wholed group distribution jpost,Neutral
2,1325689685943996416,philippine government talk angloswedish drugmaker astrazeneca procure dos prospective covid 19 vaccine,Neutral
3,1325690517724782593,care home receive coronavirus vaccine roll beforechristmas,Neutral
4,1325697646909132800,thinking vaccine panacea dangerous idea outsmart evolution peril,AntiVax


### TFIDF VECTORIZATION

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings; fitted on the training tweets only.
vectorizer = TfidfVectorizer()
# Learn the vocabulary from the training text and encode it in one step.
# The fitted `vectorizer` is reused later to transform the test tweets.
vectors = vectorizer.fit_transform(df['tweet'])

In [9]:
# Integer code assigned to each stance label.
label_to_code = {
    'ProVax': 0,
    'AntiVax': 1,
    'Neutral': 2,
}
# Add the numeric target column derived from the textual label.
df['encoded_label'] = df['label'].map(label_to_code)

### SEPARATING DEPENDENT AND INDEPENDENT VARIABLES

In [10]:
# Feature matrix: dense array form of the TF-IDF matrix built from df['tweet'].
X = vectors.toarray()  #independent
# Target vector: select the encoded label column by name rather than by
# position, so the target stays correct even if further columns are added
# to `df` (e.g. when cells are re-run or extended).
y = df['encoded_label'].values  #dependent

### Training the Naive Bayes model on the training dataset

In [11]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings.
classifier = MultinomialNB()
# Fit on the training feature matrix X and the encoded labels y.
classifier.fit(X, y)

MultinomialNB()

### TESTING DATASET

In [12]:
# Load the test tweets that will be labelled by the trained classifier.
df2 = pd.read_csv("irmidis-2021-task2-test.csv")

### CLEANING OF TEST DATASET

In [13]:
# Same neattext cleaning sequence as used for the training tweets, applied
# to the test tweets one step at a time and in the same order.
test_cleaning_steps = (
    nfx.remove_multiple_spaces,
    nfx.remove_hashtags,
    nfx.remove_userhandles,
    nfx.remove_emails,
    nfx.remove_emojis,
    nfx.remove_urls,
    nfx.remove_special_characters,
    nfx.remove_punctuations,
    nfx.remove_stopwords,
)
for cleaning_step in test_cleaning_steps:
    df2['tweet'] = df2['tweet'].apply(cleaning_step)

### TOKENIZATION, LEMMATIZATION & VECTORIZATION (TEST)

In [14]:
# Lower-case and tokenize, lemmatize the tokens, then re-join them into a
# single space-separated string per test tweet.
df2['tweet'] = (
    df2['tweet']
    .apply(lambda raw_tweet: tokenize_text(raw_tweet.lower()))
    .apply(lemm)
    .apply(lambda tokens: ' '.join(str(token) for token in tokens))
)

In [15]:
# Encode the test tweets with the vectorizer already fitted on the training
# tweets (transform only; the vectorizer is not refitted here).
vectors2 = vectorizer.transform(df2['tweet'])

In [16]:
# Dense array form of the test TF-IDF matrix, mirroring how X was built.
X_test = vectors2.toarray()

In [17]:
# Preview the preprocessed test tweets.
df2.head()

Unnamed: 0,id,tweet
0,1259097719341297664,poll million american refuse covid 19 vaccine world hopefully
1,1307596484251062273,right remember time line getting effective vaccine longer think expected covid 19 course rna virus rapid mutation likely mean incomplete protection year year
2,1296241188823953409,got key new house month covid lockdown construction delayed im hoping vaccine mass produce soon
3,1294639481216045056,cdc july 2020 acip meeting overview covid 19 vaccine clinical trial
4,1319741954721239041,notably absent vaccine nightmare facing covid 19 caused 225000 dead american show trump focus pathetic


### Predicting the results

In [18]:
# Predict an encoded label for every row of the test feature matrix.
y_pred = classifier.predict(X_test)

#### ADDING A COL IN TEST DATASET WITH PREDICTED ENCODED_LABELS

In [19]:
# Store the predicted codes next to the tweets they belong to.
df2['encoded_label'] = y_pred

In [20]:
# Preview the test rows together with their predicted codes.
df2.head()

Unnamed: 0,id,tweet,encoded_label
0,1259097719341297664,poll million american refuse covid 19 vaccine world hopefully,0
1,1307596484251062273,right remember time line getting effective vaccine longer think expected covid 19 course rna virus rapid mutation likely mean incomplete protection year year,0
2,1296241188823953409,got key new house month covid lockdown construction delayed im hoping vaccine mass produce soon,0
3,1294639481216045056,cdc july 2020 acip meeting overview covid 19 vaccine clinical trial,2
4,1319741954721239041,notably absent vaccine nightmare facing covid 19 caused 225000 dead american show trump focus pathetic,2


#### ADDING ANOTHER COLUMN WITH DECODED LABEL NAMES

In [21]:
# Stance label that each predicted integer code stands for.
code_to_label = {
    0: 'ProVax',
    1: 'AntiVax',
    2: 'Neutral',
}
# Add a human-readable prediction column derived from the predicted codes.
df2['pred'] = df2['encoded_label'].map(code_to_label)

In [22]:
# Inspect the first 100 test rows with both the encoded and decoded prediction.
df2.head(100)

Unnamed: 0,id,tweet,encoded_label,pred
0,1259097719341297664,poll million american refuse covid 19 vaccine world hopefully,0,ProVax
1,1307596484251062273,right remember time line getting effective vaccine longer think expected covid 19 course rna virus rapid mutation likely mean incomplete protection year year,0,ProVax
2,1296241188823953409,got key new house month covid lockdown construction delayed im hoping vaccine mass produce soon,0,ProVax
3,1294639481216045056,cdc july 2020 acip meeting overview covid 19 vaccine clinical trial,2,Neutral
4,1319741954721239041,notably absent vaccine nightmare facing covid 19 caused 225000 dead american show trump focus pathetic,2,Neutral
...,...,...,...,...
95,1338630220472061953,im 100 getting covid vaccine soon able 2020 hard shake thought kind shit legend start,0,ProVax
96,1327612772239208453,postdose safety information trial data time passed ill pas thanks freedom choice equate selfishness,1,AntiVax
97,1334055767108972545,confirmed hamilton seat pfizer announcing vaccine approval uk,2,Neutral
98,1336160175217106949,distributing covid vaccine nursing home theyre old death wont affect economy vaccine harm,2,Neutral


In [23]:
# Keep only the columns needed for the submission file. Selecting by name
# (instead of positions 0 and 3) makes the intent explicit and keeps this cell
# safe to re-run: a second run simply selects the same two columns again
# rather than failing on a column position that no longer exists.
df2 = df2[['id', 'pred']]
df2.head()

Unnamed: 0,id,pred
0,1259097719341297664,ProVax
1,1307596484251062273,ProVax
2,1296241188823953409,ProVax
3,1294639481216045056,Neutral
4,1319741954721239041,Neutral


### CONVERTING THE FINAL DATAFRAME TO CSV FILE WITH PREDICTED LABELS

In [24]:
# Write the final prediction frame to CSV, omitting the DataFrame index.
df2.to_csv("irmidis-2021-task2-predictions-3.csv", index = False)