In [1]:
#importing required packages
import sys
import os
import numpy as np
import re #regular expression
import nltk
import pandas as pd #pandas data frame for holding the data
from bs4 import BeautifulSoup #for cleaning the html boilerplate
from nltk.corpus import stopwords #import the stopword list from nltk library
from nltk.stem import WordNetLemmatizer #import the lemmatizer from nltk

from sklearn.feature_extraction.text import TfidfVectorizer #import tfidf vectorizer from scikit-learn

from sklearn.linear_model import LogisticRegression #import the logistic regression model from scikit-learn

from sklearn.metrics import confusion_matrix #import the confusion matrix
from sklearn.metrics import classification_report #import the classification report to compute recall, prec, etc


In [2]:
#loading the training and test datasets
# The dataset folder defaults to the location used when the notebook was
# written; set the COVID_TWEETS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'COVID_TWEETS_DATA_DIR',
    '/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets',
)
train_data = pd.read_csv(os.path.join(DATA_DIR, 'training.csv'))
# the unlabeled frame used for prediction is read from validation.csv
test_data = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'))
train_data.head()

Unnamed: 0,ID,Tweet,Labels
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10
1,2,BanMediaHouse whose is responsible for spreadi...,6
2,3,Im waiting for someone to say to me that all t...,3 4
3,4,He is a liar. Proven day night. Time again. Li...,6
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8


In [3]:
#Preview the first rows of the test data
test_data.head()

Unnamed: 0,ID,Tweet
0,5001,Forgot to a math test and I was failing but my...
1,5002,Corona effected came from Delhi in Bihar lakhi...
2,5003,Make CORONA END NA
3,5004,Imagine if the coronavirus pandemic was a big ...
4,5005,Howdy Ana. Where can I get married during the ...


In [4]:
#Get the basic information about the training data (row count, column dtypes, non-null counts)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      5000 non-null   int64 
 1   Tweet   5000 non-null   object
 2   Labels  5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [5]:
#Get the same basic information about the test data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      2500 non-null   int64 
 1   Tweet   2500 non-null   object
dtypes: int64(1), object(1)
memory usage: 39.2+ KB


In [6]:
# split the labels
def labelsplit(labels) :
    '''
        Convert a raw label string into a list of integer label ids.

        '0 10' -> [0, 10]
        '6'    -> [6]

        Ids may be separated by spaces and/or commas. Repeated separators and
        leading/trailing whitespace are ignored, so no empty token is ever
        passed to int().

        labels  : str, the raw value from the Labels column
        returns : list of int (empty when the string holds no ids)
        raises  : ValueError if a token is not an integer
    '''
    # normalise commas to spaces, then split on any run of whitespace
    return [int(lab) for lab in labels.replace(',', ' ').split()]

# parse every raw label string into a list of ints, stored in a new 'Label' column
train_data['Label'] = train_data['Labels'].apply(labelsplit)
train_data.head()


Unnamed: 0,ID,Tweet,Labels,Label
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10,"[0, 10]"
1,2,BanMediaHouse whose is responsible for spreadi...,6,[6]
2,3,Im waiting for someone to say to me that all t...,3 4,"[3, 4]"
3,4,He is a liar. Proven day night. Time again. Li...,6,[6]
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8,[8]


In [7]:
#Get the y variables (multi-hot encoded targets)
# NOTE(review): this import would normally live in the imports cell at the top
from sklearn.preprocessing import MultiLabelBinarizer

# per-tweet lists of label ids, as produced by labelsplit
label_lists = train_data["Label"]
print(label_lists[:5])

# turn each list of label ids into a fixed-length 0/1 vector
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(label_lists)
print(y_train[:5])

0    [0, 10]
1        [6]
2     [3, 4]
3        [6]
4        [8]
Name: Label, dtype: object
[[1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0]]


In [8]:
#transforming negation abbreviated text to standard text
# Matches the "n't" suffix of a contraction. A word boundary is used instead
# of a literal trailing space so that contractions at the very end of the text
# or directly before punctuation ("don't." / "isn't,") are expanded as well.
# Both the ASCII apostrophe (') and the typographic one (’) are accepted.
re_negation = re.compile(r"n['’]t\b") #regular expression rule

def negation_abbreviation_to_regular(abbreviated_text):
    '''
    Expand the "n't" suffix of contractions into a separate " not".

    aren't -> are not

    Only the suffix is rewritten, so irregular stems stay as they are
    (e.g. can't -> ca not).

    abbreviated_text : str, text that may contain contractions
    returns          : str, the text with every "n't" replaced by " not"
    '''
    # the character following the contraction (space, punctuation, ...) is
    # left untouched because \b consumes nothing
    regular_text = re_negation.sub(' not', abbreviated_text)
    return regular_text

In [9]:
# fetch the NLTK stopword corpus that stopwords.words() reads from
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meimei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
#get the stopwords list from the nltk library
# kept as a set because get_clean_review tests every token for membership
stopwords_list = set(stopwords.words("english"))

In [11]:
#lemmatize the words
lemmatizer = WordNetLemmatizer()

def get_lemmatized_words(words_list):
    '''
        Reduce every word to its lemma.

        Each word is lemmatized twice: first with the lemmatizer's default
        part-of-speech setting, then again as a verb ("v").

        cats -> cat
        houses -> house
        apples -> apple
        started -> start (v)

        words_list : iterable of str
        returns    : list of str, same order and length as the input
    '''
    def _to_lemma(token):
        # default pass first, verb pass on its result
        base_form = lemmatizer.lemmatize(token)
        return lemmatizer.lemmatize(base_form, "v")

    return [_to_lemma(token) for token in words_list]

In [12]:
def get_clean_review(raw_review):
    '''
        cleaning the review text

        Steps, in order: strip HTML markup, expand "n't" contractions, replace
        every character outside [a-zA-Z_0-9] with a space, lowercase, split on
        whitespace, drop the words found in stopwords_list, lemmatize, and
        join the remaining tokens with single spaces.

        raw_review : str, the raw text
        returns    : str, the cleaned text (empty if no token survives)
    '''
    #remove html tags from the raw review
    # The parser is named explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a GuessedAtParserWarning, so
    # the extracted text could differ from one machine to another.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    
    #transforming the negation abbreviated terms to regular terms
    review_text_regular = negation_abbreviation_to_regular(review_text)
    
    #removing non-alphanumeric terms
    review_text_alphanum = re.sub("[^a-zA-Z_0-9]", " ", review_text_regular)
    
    #converting the characters into lowercase
    review_text_lower_case = review_text_alphanum.lower()

    #tokenize the text into words
    review_words = review_text_lower_case.split()
    
    #removing stop words
    # NOTE(review): if "not" is part of stopwords_list, the negations expanded
    # above are removed again here - confirm that this is intended.
    review_words_meaningful = [word for word in review_words if word not in stopwords_list]
    
    #lemmatization
    review_words_lemmatized = get_lemmatized_words(review_words_meaningful)
    
    clean_review = " ".join(review_words_lemmatized)
    return clean_review

In [13]:
# fetch the WordNet corpus; presumably required by the WordNetLemmatizer used in get_lemmatized_words
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/meimei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
#cleaning reviews
# run every training tweet through get_clean_review, keeping the original order
train_reviews_clean = list(map(get_clean_review, train_data['Tweet']))
print(train_reviews_clean[:5])

['joke hop plane rn well covid19 lol', 'banmediahouse whose responsible spread fake communal story pandemic corona situation', 'im wait someone say corona thing april fool joke', 'liar prove day night time lie truth covid19', 'new u coronavirus death toll reach 4 000 nearly 900 new death report today bno news covid19 coronavirusoutbreak']


In [15]:
# training frame with the two columns handed to the model: cleaned text and its 0/1 label vector
train_columns = dict(text=train_reviews_clean, labels=list(y_train))
train_data_clean = pd.DataFrame(train_columns)
train_data_clean.head()


Unnamed: 0,text,labels
0,joke hop plane rn well covid19 lol,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,banmediahouse whose responsible spread fake co...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
2,im wait someone say corona thing april fool joke,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"
3,liar prove day night time lie truth covid19,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,new u coronavirus death toll reach 4 000 nearl...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [21]:
# NOTE(review): this import would normally live in the imports cell at the top
from simpletransformers.classification import MultiLabelClassificationModel

# Training configuration, kept in one place so it is easy to review and tune.
train_args = {
    'reprocess_input_data': True,
    'num_train_epochs': 10,
    # NOTE(review): presumably only has an effect when evaluation during
    # training is enabled as well - confirm whether that was intended
    'evaluate_during_training_verbose': True,
    # fixed seed so that repeated runs are comparable
    'manual_seed': 42,
    # without this, re-running the cell fails once the output directory
    # already holds a previously trained model
    'overwrite_output_dir': True,
}

# the classifier needs one output per position of the label vectors, so the
# count is read from the training frame instead of being hard-coded
num_labels = len(train_data_clean['labels'].iloc[0])

model = MultiLabelClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=num_labels,
    use_cuda=False,
    args=train_args,
)
model.train_model(train_data_clean)


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not 

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  labels = torch.tensor(labels, dtype=torch.long)


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/625 [00:00<?, ?it/s]

(6250, 0.1815347477386892)

In [22]:
# re-display the first rows of the test data before preprocessing it
test_data.head()

Unnamed: 0,ID,Tweet
0,5001,Forgot to a math test and I was failing but my...
1,5002,Corona effected came from Delhi in Bihar lakhi...
2,5003,Make CORONA END NA
3,5004,Imagine if the coronavirus pandemic was a big ...
4,5005,Howdy Ana. Where can I get married during the ...


In [23]:
# re-display the column summary of the test data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      2500 non-null   int64 
 1   Tweet   2500 non-null   object
dtypes: int64(1), object(1)
memory usage: 39.2+ KB


In [24]:
# raw test tweets, in the order of the test frame
X_test = test_data['Tweet']
# show the first tweet as a quick sanity check
X_test[0]

'Forgot to a math test and I was failing but my teacher opening it back up, maybe corona aint so bad'

In [25]:
#cleaning test reviews
# same preprocessing as for the training tweets, applied in order to the test tweets
test_reviews_clean = list(map(get_clean_review, X_test))
print(test_reviews_clean[:5])

['forget math test fail teacher open back maybe corona aint bad', 'corona effect come delhi bihar lakhisarai district mahisona village name pappu khan', 'make corona end na', 'imagine coronavirus pandemic big april fool joke government pull u', 'howdy ana get marry coronavirus update gov']


In [26]:
# one-column frame of the cleaned test tweets; its index is reused later when building `result`
test_clean_df = pd.DataFrame(test_reviews_clean)

In [27]:
# preview the cleaned test tweets
test_clean_df.head()

Unnamed: 0,0
0,forget math test fail teacher open back maybe ...
1,corona effect come delhi bihar lakhisarai dist...
2,make corona end na
3,imagine coronavirus pandemic big april fool jo...
4,howdy ana get marry coronavirus update gov


In [28]:
# predictions: one 0/1 label vector per cleaned test tweet
# raw_outputs: presumably the model's per-label scores (not used further below)
predictions, raw_outputs = model.predict(test_reviews_clean)

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

In [29]:
# attach the predicted label vectors to the test frame (adds a column to test_data in place)
test_data['prediction'] = predictions

In [30]:
# show the first ten test tweets together with their predicted label vectors
print(test_data[:10])

     ID                                              Tweet  \
0  5001  Forgot to a math test and I was failing but my...   
1  5002  Corona effected came from Delhi in Bihar lakhi...   
2  5003                                 Make CORONA END NA   
3  5004  Imagine if the coronavirus pandemic was a big ...   
4  5005  Howdy Ana. Where can I get married during the ...   
5  5006  Coronavirus live blog: Allergist and immunolog...   
6  5007  We won't b talking about corona but about hung...   
7  5008  Fuck Corona man... I couldve gone on a vacatio...   
8  5009  Female-founded companies, as a result, have le...   
9  5010  anyone gonna come out and say Coronavirus was ...   

                          prediction  
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]  
2  [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0]  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]  
4  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]  
5  [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]  
6  [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]  
7  [0

In [31]:
# expand the per-tweet prediction vectors into one 0/1 column per label id,
# using the label ids (as strings) for the column names
result = pd.DataFrame(
    test_data['prediction'].tolist(),
    index=test_clean_df.index,
    columns=['0','1','2','3','4','5','6','7','8','9','10'],
)

In [32]:
# display the expanded prediction table
result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2495,1,0,0,0,0,0,0,0,0,1,1
2496,0,0,0,0,0,0,1,0,1,0,0
2497,1,0,1,0,0,0,0,0,0,0,0
2498,0,0,0,0,0,0,0,0,0,0,1


In [33]:
# label ids '0' .. '10' as strings, matching the column names of `result`
cols_target = [str(label_id) for label_id in range(11)]
# placeholder column that receives the space-separated label string per row
result['result'] = 0

In [34]:
# Build the space-separated label string for every row, e.g. a row with the
# columns '0' and '10' set becomes '0 10'; a row with no label set becomes ''.
# The whole column is assigned in one step: this works for any number of rows
# (no hard-coded row count) and avoids the chained assignment
# `result.result[row] = ...`, which raises SettingWithCopyWarning and does not
# update the frame when pandas Copy-on-Write is active.
result['result'] = [
    ' '.join(cat for cat, flag in zip(cols_target, flags) if int(flag) == 1)
    for flags in result[cols_target].itertuples(index=False, name=None)
]
# single-column frame holding only the label strings
Final = result[['result']]
Final.head()

Unnamed: 0,result
0,0 10
1,9
2,3 4 5
3,10
4,0 10


In [35]:
# keep only the tweet IDs, then place the label strings next to them
# (pd.concat with axis=1 lines the rows up by index)
ids_only = test_data.drop(['Tweet','prediction'], axis=1)
submission = pd.concat([ids_only, Final], axis=1)
submission.head()

Unnamed: 0,ID,result
0,5001,0 10
1,5002,9
2,5003,3 4 5
3,5004,10
4,5005,0 10


In [36]:
# write the submission frame to CSV without the index column
# NOTE(review): absolute, machine-specific output path - consider deriving it from a configurable data directory
submission.to_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/prediction_simpletransformers.csv', index = False)