In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import accuracy_score,hamming_loss,classification_report


In [21]:
#loading the training dataset
train_data = pd.read_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/training_T2.csv', header=0, delimiter=',')

#Show the top part of the data table
train_data.head()

Unnamed: 0,ID,Tweet,Labels,Labels_t1,Labels_t
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...,0 10,"optimistic','joking'","['optimistic','joking']"
1,2,BanMediaHouse whose is responsible for spreadi...,6,annoyed',['annoyed']
2,3,Im waiting for someone to say to me that all t...,3 4,"pessimistic','anxious'","['pessimistic','anxious']"
3,4,He is a liar. Proven day night. Time again. Li...,6,annoyed',['annoyed']
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000...",8,surprise',['surprise']


In [22]:
#train_data.info()

In [37]:
#Get the X variables
X_train = train_data.drop(["Labels","Labels_t1","Labels_t"], axis=1)
X_train.head()

Unnamed: 0,ID,Tweet
0,1,NO JOKE I WILL HOP ON A PLANE RN! (Well after ...
1,2,BanMediaHouse whose is responsible for spreadi...
2,3,Im waiting for someone to say to me that all t...
3,4,He is a liar. Proven day night. Time again. Li...
4,5,"NEW: U.S. CoronaVirus death toll reaches 4,000..."


In [23]:
print(type(train_data['Labels_t'].iloc[0]))
train_data['Labels_t'].iloc[0]

<class 'str'>


"['optimistic','joking']"

In [24]:
import ast
ast.literal_eval(train_data['Labels_t'].iloc[0])
train_data['Labels_t'] = train_data['Labels_t'].apply(lambda x: ast.literal_eval(x))

In [25]:
train_data['Labels_t']

0                                    [optimistic, joking]
1                                               [annoyed]
2                                  [pessimistic, anxious]
3                                               [annoyed]
4                                              [surprise]
                              ...                        
4995                               [anxious, sad, denial]
4996                                       [anxious, sad]
4997    [pessimistic, sad, official_report, thankfulop...
4998                [official_report, thankfuloptimistic]
4999                        [annoyed, thankfuloptimistic]
Name: Labels_t, Length: 5000, dtype: object

In [47]:
#Get the u variables
y = train_data["Labels_t"]
y.head()

0      [optimistic, joking]
1                 [annoyed]
2    [pessimistic, anxious]
3                 [annoyed]
4                [surprise]
Name: Labels_t, dtype: object

In [26]:
labels = train_data['Labels_t'].values
print(labels)

label_dic={}
for label_list in labels:
    for label in label_list:        
        if label not in label_dic:
            label_dic[label]=1
        else:
             label_dic[label]+=1
df = pd.DataFrame(list(label_dic.items()), columns=['label', 'count']).sort_values(by = 'count',axis = 0,ascending = False)
print('NumOfLabels:',len(df))
df.head(10)

[list(['optimistic', 'joking']) list(['annoyed'])
 list(['pessimistic', 'anxious']) ...
 list(['pessimistic', 'sad', 'official_report', 'thankfuloptimistic'])
 list(['official_report', 'thankfuloptimistic'])
 list(['annoyed', 'thankfuloptimistic'])]
NumOfLabels: 12


Unnamed: 0,label,count
1,joking,1777
2,annoyed,1725
0,optimistic,1180
6,sad,1088
8,official_report,914
4,anxious,842
3,pessimistic,620
5,surprise,604
9,thankfuloptimistic,480
7,denial,314


In [27]:
#transforming negation abbreviated text to standard text
re_negation = re.compile("n't ") #regular expression rule

def negation_abbreviation_to_regular(abbreviated_text):
    '''
    aren't -> are not 
    '''
    regular_text = re_negation.sub(' not ', abbreviated_text)    
    return regular_text

In [30]:
#get the stopwords list from the nltk library
stopwords_list = set(stopwords.words("english"))

In [31]:
#lemmatized the words
lemmatizer = WordNetLemmatizer()

def get_lemmatized_words(words_list):
    '''
        cats -> cat
        houses to house
        apples to apple
        started to start (v)
    '''    
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words_list]
    lemmatized_words = [lemmatizer.lemmatize(word, "v") for word in lemmatized_words]
    
    return lemmatized_words

In [32]:
def get_clean_review(raw_review):
    '''
        cleaning the review text
    '''
    #remove html tags from the raw review
    review_text = BeautifulSoup(raw_review).get_text()
    
    #transforming the negation abbreviated terms to regular terms
    review_text_regular = negation_abbreviation_to_regular(review_text)
    
    #removing non-alphanumeric terms
    review_text_alphanum = re.sub("[^a-zA-Z_0-9]", " ", review_text_regular)
    
    #converting the characters into lowercase
    review_text_lower_case = review_text_alphanum.lower()

    #tokenize the text into words
    review_words = review_text_lower_case.split()
    
    #removing stop words
    review_words_meaningful = [word for word in review_words if word not in stopwords_list]
    
    #lematization
    review_words_lemmatized = get_lemmatized_words(review_words_meaningful)
    
    clean_review = " ".join(review_words_lemmatized)
    return clean_review

In [38]:
#cleaning reviews
train_reviews_clean = []
for review in X_train['Tweet']:
    clean_review = get_clean_review(review)
    train_reviews_clean.append(clean_review)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
#vectorization the text data
#vectorizer = TfidfVectorizer(max_features=10000, ngram_range = (1,2))
vectorizer = CountVectorizer(max_features=10000, ngram_range = (1,2))

In [41]:
Xfeatures = vectorizer.fit_transform(train_reviews_clean)
Xfeatures.shape

(5000, 10000)

In [42]:
Xfeatures.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [48]:
#slip the data
X_train,X_test,Y_train,Y_test = train_test_split(Xfeatures,y,test_size=0.2,random_state =42)

In [49]:
print(X_train.shape)

(4000, 10000)


In [135]:
# Method 1: LabelPowerset

In [51]:
#build model
#covert the multi-label
mlt=MultiLabelBinarizer()
Y_train_convert= mlt.fit_transform(Y_train)
Y_test_convert = mlt.fit_transform(Y_test)
mlt.classes_

array(['annoyed', 'anxious', 'denial', 'empathetic', 'joking',
       'official_report', 'optimistic', 'pessimistic', 'sad', 'surprise',
       'thankful', 'thankfuloptimistic'], dtype=object)

In [52]:
Y_train_convert

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [56]:
Y_train1 = mlt.inverse_transform(Y_train_convert)
#print(Y_train1)

In [60]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [63]:
clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,Y_train_convert,X_test,Y_test_convert)

In [64]:
clf_labelP_model

{'accuracy:': 0.162, 'hamming_score': 0.19691666666666666}

In [1]:
modelLP = LabelPowerset(MultinomialNB())
modelLP.fit(X_train,Y_train_convert)

NameError: name 'LabelPowerset' is not defined

In [66]:
#processing the test dataset

#loading the testing dataset
test_data = pd.read_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/validation.csv', header=0, delimiter=',')

#Show the top part of the data table
test_data.head()

Unnamed: 0,ID,Tweet
0,5001,Forgot to a math test and I was failing but my...
1,5002,Corona effected came from Delhi in Bihar lakhi...
2,5003,Make CORONA END NA
3,5004,Imagine if the coronavirus pandemic was a big ...
4,5005,Howdy Ana. Where can I get married during the ...


In [67]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      2500 non-null   int64 
 1   Tweet   2500 non-null   object
dtypes: int64(1), object(1)
memory usage: 39.2+ KB


In [68]:
X_pre = test_data['Tweet']
X_pre[0]

'Forgot to a math test and I was failing but my teacher opening it back up, maybe corona aint so bad'

In [69]:
#cleaning test reviews
test_reviews_clean = []
for review in X_pre:
    clean_review = get_clean_review(review)
    test_reviews_clean.append(clean_review)

In [70]:
#vectorize the test reviews
X_pre_features = vectorizer.fit_transform(test_reviews_clean)

In [71]:
#prediction
#use first model to predict
clf_predictions = modelLP.predict(X_pre_features)
Y_pred = mlt.inverse_transform(clf_predictions)

In [75]:
test_pred_convert = mlt.inverse_transform(Y_pred)

AttributeError: 'list' object has no attribute 'shape'

In [78]:
test_output = pd.DataFrame(data={"ID":test_data['ID'], "Labels":Y_pred})
print(test_output.head())

test_output.to_csv('/Users/meimei/Documents/UT1/Data Analytics/sentiment-analysis-of-covid-19-related-tweets/prediction2.csv', index=False)

     ID                 Labels
0  5001      (annoyed, joking)
1  5002  (thankfuloptimistic,)
2  5003   (joking, optimistic)
3  5004  (joking, pessimistic)
4  5005  (thankfuloptimistic,)
