In [1]:
# import Libraries
import pandas as pd
import numpy as np
import re
import pickle
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [6]:
df = pd.read_excel('/content/Dataset (2).xlsx',index_col=None)

# Removing the unnecessary columns.
df = df[['Comment','polarity']]

In [13]:
df.columns

Index(['Comment', 'polarity'], dtype='object')

In [14]:
df.isna().sum()
df.dropna(inplace=True)

In [15]:
df.dropna()

Unnamed: 0,Comment,polarity
0,@ProFootballTalk Maybe we should stop keeping ...,2.0
1,So good I had to share! Check out all the item...,5.0
2,I’m New on Twitter do follow friends ❤️🥺 #foll...,3.0
3,Your favourite daily newspaper just added even...,3.0
4,@imKayrapolat @tsss67 @PurelyFootball That's b...,2.0
...,...,...
17995,Call me old fashion but I want a simple weddin...,3.0
17996,"Not to mention, we’re talking about a generati...",3.0
17997,Drove by the south Whitby covid test center to...,3.0
17998,@leahmcelrath Um... this is 10 year old techno...,3.0


In [17]:
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
## set containing all stopwords.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
##Function to clean the data.
def preprocess(textdata , wordLemm):
    processedText = []
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])        
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        
        tweetwords = ''
        for word in tweet.split():
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

text, sentiment = list(df['Comment']), list(df['polarity'])

In [18]:
wordLemm = WordNetLemmatizer()
processedtext = preprocess(text , wordLemm)

In [19]:
X_train , X_test , y_train , y_test = train_test_split(processedtext , sentiment , train_size = 0.8 , test_size = 0.2 , random_state = 0)

In [20]:
pd.DataFrame(X_train , y_train).info

<bound method DataFrame.info of                                                      0
3.0            USER USER USER USER USER USER USER URL 
3.0  alright everyone this look planned this wa tod...
4.0                                USER super fashion 
3.0  USER USER USER USER USER USER USER lmfao what ...
3.0  penalty or no penalty ex prem referee verdict ...
..                                                 ...
3.0  ex referee responds to claim mitigating circum...
3.0  USER stick to hockey max once you figure that ...
3.0     should be allowed to hunt bitch for sport tbh 
3.0  new shoe at mere fraction of the price bargain...
3.0  USER yes know lot of people say that and it ac...

[14360 rows x 1 columns]>

In [21]:
#incode the object columns by Tfidf
Incoder = TfidfVectorizer(ngram_range=(1,2), max_features=1000000)
Incoder.fit(X_train)

X_train = Incoder.transform(X_train)
X_test  = Incoder.transform(X_test)

In [22]:
#Function to compare between my models to chose one
def model_Evaluate(model):
    
    # Predict values for Test dataset
    y_pred = model.predict(X_test)

    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))

In [23]:
SVCmodel = LinearSVC()
SVCmodel.fit(X_train, y_train)
model_Evaluate(SVCmodel)

              precision    recall  f1-score   support

         1.0       0.78      0.17      0.27        42
         2.0       0.59      0.26      0.36       199
         3.0       0.83      0.94      0.88      2257
         4.0       0.74      0.62      0.68       672
         5.0       0.90      0.82      0.86       420

    accuracy                           0.82      3590
   macro avg       0.77      0.56      0.61      3590
weighted avg       0.81      0.82      0.80      3590



In [24]:
LogisticModel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LogisticModel.fit(X_train, y_train)
model_Evaluate(LogisticModel)

              precision    recall  f1-score   support

         1.0       1.00      0.07      0.13        42
         2.0       0.64      0.14      0.22       199
         3.0       0.79      0.96      0.87      2257
         4.0       0.74      0.51      0.60       672
         5.0       0.95      0.73      0.83       420

    accuracy                           0.79      3590
   macro avg       0.82      0.48      0.53      3590
weighted avg       0.79      0.79      0.77      3590



#Logistic regression did better

In [25]:
# Saving the model and the incoder
file = open('Sentiment-LR-model.pickle','wb')
pickle.dump(LogisticModel, file)
file.close()

In [26]:
file = open('Incoder-ngram-(1,2).pickle','wb')
pickle.dump(Incoder, file)
file.close()

In [27]:
predictions = LogisticModel.predict(X_test)

output = pd.DataFrame({'text': X_test, 'sentiment': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
