In [3]:
# import Libraries
import pandas as pd
import numpy as np
import re
import pickle
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
df = pd.read_csv('/content/clean_tweets.csv')

# Removing the unnecessary columns.
df = df[['sentiment','text']]
df.text=df.text.astype(str)

In [18]:
df.isna().sum()

sentiment    0
text         0
dtype: int64

In [19]:
df.dropna()

Unnamed: 0,sentiment,text
0,1,so i m going to start a new trend instead of t...
1,0,edgefest or maybe since you re driving
2,0,i need something exciting to happen xo
3,0,what happened last night i wasnt too impressed...
4,0,i hate my nose from the front
...,...,...
159820,1,thank you she is so great more bragging
159821,0,it s cold out no beach today
159822,0,yo facebook is porno n n haha im playin but th...
159823,1,tis a pink day today


In [20]:
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
## set containing all stopwords.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
##Function to clean the data.
def preprocess(textdata , wordLemm):
    processedText = []
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])        
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        
        tweetwords = ''
        for word in tweet.split():
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

text, sentiment = list(df['text']), list(df['sentiment'])

In [21]:
wordLemm = WordNetLemmatizer()
processedtext = preprocess(text , wordLemm)

In [22]:
X_train , X_test , y_train , y_test = train_test_split(processedtext , sentiment , train_size = 0.8 , test_size = 0.2 , random_state = 0)

In [23]:
pd.DataFrame(X_train , y_train).info

<bound method DataFrame.info of                                                     0
1   hungry hungry hippo lol ve been vegeterian fer...
1                                 what wa the advice 
0   me too they were kind of cute stuntling there ...
1   do you have adventure trip photo up and take p...
1                             just fixed my computer 
..                                                ...
0             it official hand broke got big as cast 
1                          no nyc see you in la then 
0                           so dead tired once again 
1   just watched the st episode of nurse jackie on...
1   aahh finally an avatar in case you are interes...

[127860 rows x 1 columns]>

In [24]:
#incode the object columns by Tfidf
Incoder = TfidfVectorizer(ngram_range=(1,2), max_features=1000000)
Incoder.fit(X_train)

X_train = Incoder.transform(X_train)
X_test  = Incoder.transform(X_test)

In [25]:
#Function to compare between my models to chose one
def model_Evaluate(model):
    
    # Predict values for Test dataset
    y_pred = model.predict(X_test)

    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))

In [26]:
SVCmodel = LinearSVC()
SVCmodel.fit(X_train, y_train)
model_Evaluate(SVCmodel)

              precision    recall  f1-score   support

           0       0.79      0.80      0.80     16057
           1       0.80      0.79      0.79     15908

    accuracy                           0.79     31965
   macro avg       0.79      0.79      0.79     31965
weighted avg       0.79      0.79      0.79     31965



In [27]:
LogisticModel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LogisticModel.fit(X_train, y_train)
model_Evaluate(LogisticModel)

              precision    recall  f1-score   support

           0       0.80      0.81      0.80     16057
           1       0.80      0.79      0.80     15908

    accuracy                           0.80     31965
   macro avg       0.80      0.80      0.80     31965
weighted avg       0.80      0.80      0.80     31965



#Logistic regression did better

In [28]:
# Saving the model and the incoder
file = open('Sentiment-LR-model.pickle','wb')
pickle.dump(LogisticModel, file)
file.close()

In [29]:
file = open('Incoder-ngram-(1,2).pickle','wb')
pickle.dump(Incoder, file)
file.close()

In [30]:
predictions = LogisticModel.predict(X_test)

output = pd.DataFrame({'text': X_test, 'sentiment': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
