# Importing the Libraries

In [1]:
import pandas as pd
import os
from textblob import TextBlob
import string
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
# Dataset locations, resolved relative to the current working directory.
DATA_DIR = os.path.join(os.getcwd(), "dataset")
TRAIN_DATA, TEST_DATA = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Show which files are present in the dataset folder.
print(os.listdir(DATA_DIR))

['train.csv', 'train_preprocessed.csv', 'test.csv']


## Exploratory Data Analysis

In [3]:
# Read the training CSV into a DataFrame and preview its first rows.
data_train = pd.read_csv(TRAIN_DATA)
data_train.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [4]:
# Report the (rows, columns) shape of the training frame.
print("Shape of the training data",data_train.shape)

Shape of the training data (3235, 6)


In [5]:
# Summarise column dtypes and non-null counts to spot missing values.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3235 entries, 0 to 3234
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               3235 non-null   float64
 1   original_text    3235 non-null   object 
 2   lang             3231 non-null   object 
 3   retweet_count    3231 non-null   object 
 4   original_author  3235 non-null   object 
 5   sentiment_class  3235 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 151.8+ KB


In [6]:
# Frequency of each distinct value found in the `lang` column.
data_train['lang'].value_counts()

en                            2994
 pink Peruvian opal! via         4
 Find More                       2
WORLDS OKAYEST MOTHER! &lt       2
&gt                              2
                              ... 
-0.0320226838                    1
-0.9022044897                    1
 here's to !                     1
 ️                               1
-0.948781497                     1
Name: lang, Length: 232, dtype: int64

In [7]:
# Number of distinct non-null values in `lang`.
# NOTE(review): apart from 'en', the value counts show text fragments and floats
# rather than language codes - presumably spill-over from malformed rows, not
# real languages; confirm against the raw CSV.
print(data_train['lang'].nunique())

# Class balance of the provided sentiment labels.
print(data_train['sentiment_class'].value_counts())

232
 0    1701
-1     769
 1     765
Name: sentiment_class, dtype: int64


# Data Preprocessing

In [8]:
def preprocess_tweet(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (anything starting with ``www.`` or ``http``);
      3. remove @mentions;
      4. delete ASCII punctuation (this also drops the ``#`` of a hashtag
         while keeping the tag word itself);
      5. remove every token that contains a digit;
      6. tokenize and drop English stopwords.

    URLs and mentions are removed *before* punctuation is deleted: once
    ``.``, ``:``, ``/`` and ``@`` are gone the patterns can no longer
    recognise them, so handles would otherwise leak into the cleaned text.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value (e.g. NaN for a missing cell) is
        treated as an empty tweet.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        return ""

    # convert text to lower-case
    cleaned = text.lower()
    # remove URLs (raw strings avoid invalid-escape warnings for \. and \S)
    cleaned = re.sub(r'(?:www\.|http)\S+', '', cleaned)
    # remove usernames
    cleaned = re.sub(r'@\S+', '', cleaned)
    # delete punctuation characters (including the # in #hashtag)
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    # remove any word that contains a digit
    cleaned = re.sub(r'\w*\d\w*', '', cleaned).strip()

    # Build the stopword set once per call instead of re-reading the list for
    # every token; set membership is also O(1).
    stop_words = set(stopwords.words('english'))
    # split into word tokens and drop stopwords from the final word list
    return " ".join(token for token in word_tokenize(cleaned) if token not in stop_words)

In [9]:
# Overwrite each raw tweet with its cleaned form.
data_train['original_text'] = data_train['original_text'].apply(preprocess_tweet)

# Preparing the data for Training

In [10]:
def find_sentiment(text):
    """Map a text to a sentiment class using TextBlob.

    Takes the first component of ``TextBlob(text).sentiment`` as the score and
    returns its sign: 1 when the score is above zero, -1 when it is below
    zero, and 0 otherwise.
    """
    polarity = TextBlob(text).sentiment[0]
    # bool - bool yields an int: 1, -1, or 0 when neither comparison holds.
    return (polarity > 0) - (polarity < 0)


In [11]:
# Preview the frame now that original_text holds the cleaned tweets.
data_train.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,happy mothersday amazing mothers know hard abl...,en,0,BeenXXPired,0
1,1.245759e+18,happy mothers day mum im sorry cant bring moth...,en,1,FestiveFeeling,0
2,1.246087e+18,happy mothers day mothers days work today quie...,en,0,KrisAllenSak,-1
3,1.244803e+18,happy mothers day beautiful womanroyalty sooth...,en,0,Queenuchee,0
4,1.244876e+18,remembering amazing ladies made late grandmoth...,en,0,brittan17446794,-1


In [12]:
# Save the cleaned tweets before the prediction column is added, so the file
# contains only the columns that were loaded.
data_train.to_csv("train_preprocessed.csv", index=False)

# Label every cleaned tweet with the TextBlob-based class.
data_train['sentiment'] = data_train['original_text'].apply(find_sentiment)

# Using TextBlob to find Sentiment (Baseline Model)

In [13]:
# Count the rows where the TextBlob prediction equals the provided label.
# The comparison is vectorised: no per-row Python loop, and no reliance on the
# index being exactly 0..n-1 as label-based .loc[i, ...] lookups would need.
matches = data_train["sentiment_class"] == data_train["sentiment"]
count = int(matches.sum())
print("Number of correct entries found on labelled data",count)
data_train.head()

Number of correct entries found on labelled data 767


Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class,sentiment
0,1.245025e+18,happy mothersday amazing mothers know hard abl...,en,0,BeenXXPired,0,1
1,1.245759e+18,happy mothers day mum im sorry cant bring moth...,en,1,FestiveFeeling,0,1
2,1.246087e+18,happy mothers day mothers days work today quie...,en,0,KrisAllenSak,-1,1
3,1.244803e+18,happy mothers day beautiful womanroyalty sooth...,en,0,Queenuchee,0,1
4,1.244876e+18,remembering amazing ladies made late grandmoth...,en,0,brittan17446794,-1,1


# Creating the Submission File

In [14]:
# Read the test CSV into a DataFrame and preview its first rows.
data_test = pd.read_csv(TEST_DATA)
data_test.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author
0,1.246628e+18,"3. Yeah, I once cooked potatoes when I was 3 y...",en,0,LToddWood
1,1.245898e+18,"Happy Mother's Day to all the mums, step-mums,...",en,0,iiarushii
2,1.244717e+18,"I love the people from the UK, however, when I...",en,0,andreaanderegg
3,1.24573e+18,Happy 81st Birthday Happy Mother’s Day to my m...,en,1,TheBookTweeters
4,1.244636e+18,Happy Mothers day to all those wonderful mothe...,en,0,andreaanderegg


In [15]:
print("Shape of the test data", data_test.shape)

# Clean the test tweets in place, then label them with the TextBlob baseline.
data_test['original_text'] = data_test['original_text'].apply(preprocess_tweet)
data_test['sentiment_class'] = data_test['original_text'].apply(find_sentiment)

# Drop the text and metadata columns to form the submission frame.
data_test_submission = data_test.drop(
    columns=['original_text', 'lang', 'retweet_count', 'original_author']
)

Shape of the test data (1387, 5)


In [16]:
# Write the submission frame to submission.csv (relative path), omitting the index.
data_test_submission.to_csv("submission.csv",index=False)