In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import accuracy_score,hamming_loss,classification_report


In [2]:
#loading the training dataset
train_data = pd.read_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/training.csv', header=0, delimiter=',')

#Show the top part of the data table
train_data.head()

Unnamed: 0,ID,Tweet,Labels
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10
1,2,BanMediaHouse whose is responsible for spreadi...,6
2,3,Im waiting for someone to say to me that all t...,3 4
3,4,He is a liar. Proven day night. Time again. Li...,6
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      5000 non-null   int64 
 1   Tweet   5000 non-null   object
 2   Labels  5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [4]:
#Get the X variables
X_train = train_data.drop(["Labels"], axis=1)
X_train.head()

Unnamed: 0,ID,Tweet
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...
1,2,BanMediaHouse whose is responsible for spreadi...
2,3,Im waiting for someone to say to me that all t...
3,4,He is a liar. Proven day night. Time again. Li...
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000..."


In [5]:
# split the labels
def labelsplit(labels) :    
    label = labels.replace(' ',',').split(',')
    for idx, lab in enumerate(label) :
        label[idx] = int(lab)
    return label

train_data['Label']  = train_data['Labels'].apply(lambda x :  labelsplit(x))
train_data.head()

Unnamed: 0,ID,Tweet,Labels,Label
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10,"[0, 10]"
1,2,BanMediaHouse whose is responsible for spreadi...,6,[6]
2,3,Im waiting for someone to say to me that all t...,3 4,"[3, 4]"
3,4,He is a liar. Proven day night. Time again. Li...,6,[6]
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8,[8]


In [6]:
#Get the u variables
y = train_data["Label"]
y.head()

0    [0, 10]
1        [6]
2     [3, 4]
3        [6]
4        [8]
Name: Label, dtype: object

In [7]:
labels = train_data['Label'].values
print(labels)

label_dic={}
for label_list in labels:
    for label in label_list:        
        if label not in label_dic:
            label_dic[label]=1
        else:
             label_dic[label]+=1
df = pd.DataFrame(list(label_dic.items()), columns=['label', 'count']).sort_values(by = 'count',axis = 0,ascending = False)
print('NumOfLabels:',len(df))
df.head(10)

[list([0, 10]) list([6]) list([3, 4]) ... list([3, 5, 9, 10])
 list([9, 10]) list([6, 10])]
NumOfLabels: 11


Unnamed: 0,label,count
1,10,2257
2,6,1725
0,0,1180
6,5,1088
8,9,914
4,4,842
3,3,620
5,8,604
7,7,314
9,1,244


In [8]:
#transforming negation abbreviated text to standard text
re_negation = re.compile("n't ") #regular expression rule

def negation_abbreviation_to_regular(abbreviated_text):
    '''
    aren't -> are not 
    '''
    regular_text = re_negation.sub(' not ', abbreviated_text)    
    return regular_text

In [9]:
#get the stopwords list from the nltk library
stopwords_list = set(stopwords.words("english"))

In [10]:
#lemmatized the words
lemmatizer = WordNetLemmatizer()

def get_lemmatized_words(words_list):
    '''
        cats -> cat
        houses to house
        apples to apple
        started to start (v)
    '''    
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words_list]
    lemmatized_words = [lemmatizer.lemmatize(word, "v") for word in lemmatized_words]
    
    return lemmatized_words

In [11]:
def get_clean_review(raw_review):
    '''
        cleaning the review text
    '''
    #remove html tags from the raw review
    review_text = BeautifulSoup(raw_review).get_text()
    
    #transforming the negation abbreviated terms to regular terms
    review_text_regular = negation_abbreviation_to_regular(review_text)
    
    #removing non-alphanumeric terms
    review_text_alphanum = re.sub("[^a-zA-Z_0-9]", " ", review_text_regular)
    
    #converting the characters into lowercase
    review_text_lower_case = review_text_alphanum.lower()

    #tokenize the text into words
    review_words = review_text_lower_case.split()
    
    #removing stop words
    review_words_meaningful = [word for word in review_words if word not in stopwords_list]
    
    #lematization
    review_words_lemmatized = get_lemmatized_words(review_words_meaningful)
    
    clean_review = " ".join(review_words_lemmatized)
    return clean_review

In [12]:
#cleaning reviews
train_reviews_clean = []
for review in X_train['Tweet']:
    clean_review = get_clean_review(review)
    train_reviews_clean.append(clean_review)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
#vectorization the text data
#vectorizer = TfidfVectorizer(max_features=10000, ngram_range = (1,2))
vectorizer = CountVectorizer(max_features=10000, ngram_range = (1,2))

In [44]:
Xfeatures = vectorizer.fit_transform(train_reviews_clean)
Xfeatures.shape

(5000, 10000)

In [45]:
Xfeatures.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
#slip the data
X_train,X_test,Y_train,Y_test = train_test_split(Xfeatures,y,test_size=0.2,random_state =42)

In [47]:
print(X_train.shape)

(4000, 10000)


In [48]:
#build model
#covert the multi-label
mlt=MultiLabelBinarizer()
Y_train_convert= mlt.fit_transform(Y_train)
Y_test_convert = mlt.fit_transform(Y_test)
mlt.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [49]:
Y_train_convert

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0]])

In [50]:
#Binary Relevance classficiation
from skmultilearn.problem_transform import BinaryRelevance

In [51]:
# Convert Our Multi-Label Prob to Multi-Class
# binary classficiation
modelBRC = BinaryRelevance(MultinomialNB())

In [52]:
modelBRC.fit(X_train,Y_train_convert)

BinaryRelevance(classifier=MultinomialNB(), require_dense=[True, True])

In [53]:
# Predictions
br_prediction = modelBRC.predict(X_test)

In [54]:
br_prediction.toarray()

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [55]:
accuracy_score(Y_test_convert,br_prediction)

0.144

In [30]:
# Hamming Loss :Incorrect Predictions
# The Lower the result the better
hamming_loss(Y_test_convert,br_prediction)

0.18481818181818183

In [31]:
#processing the test dataset

#loading the testing dataset
test_data = pd.read_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/validation.csv', header=0, delimiter=',')

#Show the top part of the data table
test_data.head()

Unnamed: 0,ID,Tweet
0,5001,Forgot to a math test and I was failing but my...
1,5002,Corona effected came from Delhi in Bihar lakhi...
2,5003,Make CORONA END NA
3,5004,Imagine if the coronavirus pandemic was a big ...
4,5005,Howdy Ana. Where can I get married during the ...


In [37]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      2500 non-null   int64 
 1   Tweet   2500 non-null   object
dtypes: int64(1), object(1)
memory usage: 39.2+ KB


In [38]:
X_pre = test_data['Tweet']
X_pre[0]

'Forgot to a math test and I was failing but my teacher opening it back up, maybe corona aint so bad'

In [39]:
#cleaning test reviews
test_reviews_clean = []
for review in X_pre:
    clean_review = get_clean_review(review)
    test_reviews_clean.append(clean_review)

In [40]:
#vectorize the test reviews
X_pre_features = vectorizer.fit_transform(test_reviews_clean)

In [41]:
#prediction
#use first model to predict
brc_predictions = modelBRC.predict(X_pre_features)
Y_pred = mlt.inverse_transform(brc_predictions)

In [42]:
test_output = pd.DataFrame(data={"ID":test_data['ID'], "Labels":Y_pred})
print(test_output.head())

test_output.to_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/prediction_BRC.csv', index=False)


     ID                           Labels
0  5001      (0, 1, 2, 3, 4, 5, 6, 7, 9)
1  5002     (0, 1, 2, 3, 6, 7, 8, 9, 10)
2  5003                       (6, 9, 10)
3  5004  (0, 1, 2, 3, 4, 5, 7, 8, 9, 10)
4  5005                           (5, 9)
