# Notebook for Sentiment Analysis of Tweets using Logistic Regression

## Useful Imports

In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Load the dataset

In [2]:
# EDIT THE PATH OF THE CSV HERE
TWEETS_PATH = os.path.join(".", "drive", "My Drive", "Colab Notebooks", "AI2",
                           "Project1", "dataset", "SentimentTweets.csv")

def load_twitter_data(tweets_path=TWEETS_PATH):
  return pd.read_csv(tweets_path)

df = load_twitter_data(tweets_path=TWEETS_PATH)

## Take a look at the Dataset

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,target,id,date,flag,user,text
0,680949,0,2249621587,Fri Jun 19 22:41:08 PDT 2009,NO_QUERY,sukumarpant,#brokenpromises...
1,406741,0,2059003515,Sat Jun 06 16:03:21 PDT 2009,NO_QUERY,MTMSparrow,David Carradine so sad. Thai's law not sure i...
2,1337108,4,2017466467,Wed Jun 03 08:26:14 PDT 2009,NO_QUERY,itsmemcee,A @ 415 B @ 425. Tell your bro i say congrats!
3,1560887,4,2186457254,Mon Jun 15 18:52:04 PDT 2009,NO_QUERY,jdfreivald,@littlefluffycat Indeed.
4,1466295,4,2064458395,Sun Jun 07 06:19:20 PDT 2009,NO_QUERY,CrazyHan,Completed Race 4 Life in 58mins with girlies f...


In [4]:
# get rid of useless (for the task) columns
df.drop(["Unnamed: 0", "id", "date", "flag", "user"], axis=1, inplace=True)
df.head()

Unnamed: 0,target,text
0,0,#brokenpromises...
1,0,David Carradine so sad. Thai's law not sure i...
2,4,A @ 415 B @ 425. Tell your bro i say congrats!
3,4,@littlefluffycat Indeed.
4,4,Completed Race 4 Life in 58mins with girlies f...


In [5]:
# reindex so that the labels are in the end (for aesthetic reasons)
df = df.reindex(columns=["text", "target"])
df

Unnamed: 0,text,target
0,#brokenpromises...,0
1,David Carradine so sad. Thai's law not sure i...,0
2,A @ 415 B @ 425. Tell your bro i say congrats!,4
3,@littlefluffycat Indeed.,4
4,Completed Race 4 Life in 58mins with girlies f...,4
...,...,...
1279995,@zawhtutwin watching cartoon and cry? oh i do ...,4
1279996,is eating mcdonalds,4
1279997,@BestSoyLatte So sorry to hear about your car....,0
1279998,@leesherry you have done what you could. Forgi...,4


In [6]:
# check which are the unique values for the target column (labels)
df.target.unique()

array([0, 4])

In [7]:
# convert the values to 0 (negative) and 1 (positive), again for aesthetic reasons
df["target"] = df["target"].apply(lambda x: int(x != 0))
df.target.unique()

array([0, 1])

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280000 entries, 0 to 1279999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1280000 non-null  object
 1   target  1280000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 19.5+ MB


### Now get a list with all the sentences

In [9]:
sentences = df["text"].tolist()
labels = df["target"].tolist()

## Create some function to remove useless features (words) from the dataset using regexes

In [10]:
def remove_urls(sentence):
  # remove urls that start with http, https
  no_https = re.sub(r"https?:?[\/]?[\/]?[\S]*", '', sentence, flags=re.MULTILINE)
  # remove urls that have extensions, e.g. myspace.com
  no_extensions = re.sub(r"[\S]*\.(com|org|net)[\S]*", '', no_https, flags=re.MULTILINE)
  # remove urls that are in the form "www.somepath"
  no_wwws = re.sub(r"www\.[\S]+", '', no_extensions, flags=re.MULTILINE)

  # return the result
  result = no_wwws
  return result

In [11]:
def remove_twitter_tags(sentence):
  # remove tags of the form: @jason @ maria, etc
  result = re.sub(r"@[\s]*[\S]*", '', sentence, flags=re.MULTILINE)
  return result

In [12]:
def remove_retweet_token(sentence):
  # remove retweet text "RT"
  result = re.sub(r"(rt|RT)[\s]+", '', sentence, flags=re.MULTILINE)
  return result

In [13]:
def remove_tickers(sentence):
  # remove tickers like $GE
  result = re.sub(r"\$\w*", '', sentence, flags=re.MULTILINE)
  return result

In [14]:
def remove_most_punctuation(sentence):
  # substitue most punctuation characters with a space, except for some that either are emojis or help understand the sentiment (e.g. !, ?)
  result = re.sub(r"(#|\$|%|\^|&|\*|-|_|\+|=|,|\.|<|>|\/|;|\"|`|~|\[|\]|{|})+", ' ', sentence, flags=re.MULTILINE)
  # separate ":@" from text
  result = re.sub(r":[\s]*@+", " :@ ", result, flags=re.MULTILINE)
  # separate ":)" from text
  result = re.sub(r":[\s]*\)+", " :) ", result, flags=re.MULTILINE)
  # separate ":(" from text
  result = re.sub(r":[\s]*\(+", " :( ", result, flags=re.MULTILINE)
  # separate ":D" from text
  result = re.sub(r":[\s]*D+", " :D ", result, flags=re.MULTILINE)
  # substitute apostrophes (') with empty string (e.g.: don't -> dont)
  result = re.sub(r"'+", '', result, flags=re.MULTILINE)
  # now substitute groups of exclamation marks (!) with one exclamation mark ( ! ) separated by spaces
  result = re.sub(r"!+", " ! ", result, flags=re.MULTILINE)
  # now substitute groups  question marks (?)  with one question mark (?) separated by spaces
  result = re.sub(r"\?+", " ? ", result, flags=re.MULTILINE)
  return result

In [15]:
def remove_numbers(sentence):
  # substitute all numbers with a space
  result = re.sub(r"[0-9]+", ' ', sentence, flags=re.MULTILINE)
  return result

In [16]:
def remove_multiple_whitespace(sentence):
  # substitutes groups of whitespaces with just a space
  result = re.sub(r"[\s]+", ' ', sentence, flags=re.MULTILINE)
  return result

In [17]:
def strip_whitespaces(sentence):
  # removes leading and trailing whitespace
  result = sentence.strip()
  return result

In [18]:
def convert_to_lowercase(sentence):
  # convert every sentence to lowercase
  result = sentence.lower()
  return result

In [19]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(sentence, language="english"):
  stop_words = stopwords.words(language)
  result = ' '.join(list(filter(lambda word: word not in stop_words, sentence.split())))
  return result

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
from nltk.stem import PorterStemmer

def stem_words(sentence):
  stemmer = PorterStemmer()
  result = ' '.join(list(map(lambda word: stemmer.stem(word), sentence.split())))
  return result

## Now create a pipeline that all tweets should follow

In [21]:
def main_pipeline(sentences):
  processed_sentences = []

  cnt = 0
  for sentence in sentences:
    result = remove_urls(sentence)
    result = remove_twitter_tags(result)
    result = remove_retweet_token(result)
    result = remove_tickers(result)
    result = remove_most_punctuation(result)
    result = remove_numbers(result)
    result = remove_multiple_whitespace(result)
    result = strip_whitespaces(result)
    result = convert_to_lowercase(result)
    result = remove_stopwords(result)
    result = stem_words(result)

    processed_sentences.append(result)

  return processed_sentences

In [22]:
# should take about ~ 10 mins to run
preprocessed_sentences = main_pipeline(sentences)

## Let's compare the sentences to see the changes made

In [23]:
sentences[:15]

['#brokenpromises... ',
 "David Carradine  so sad. Thai's law not sure if it's fowl play? How many people hang themselves and tie their testicles?",
 'A @ 415 B @ 425. Tell your bro i say congrats! ',
 '@littlefluffycat  Indeed.',
 'Completed Race 4 Life in 58mins with girlies from work... was fun but bloody hot!! Also bumped into some familiar faces  -well done guys!!',
 'Woot truck fixed!!! Now workin  http://myloc.me/4NiF',
 '@BLAQSHEEPCEO   yea I melt at that lol ',
 'wants to do something radical...but not sure what that something is. ',
 '@brlamb scratch that, @bgilgoff tweeted it. Channel is full!  #celc2009',
 "I am sore everywhere. Also found out an assessment i had today i actually should've had last week. That teacher hates me too ",
 "Hmm... well I've now drawn him, but it doesnt look like him ",
 'So tired ... at least I got the blog done at http://vasmusic.wordpress.com/ . . . finally could log into the blogspot but now its no use ',
 'drumming isnt on today  that means i

In [24]:
preprocessed_sentences[:15]

['brokenpromis',
 'david carradin sad thai law sure fowl play ? mani peopl hang tie testicl ?',
 'b tell bro say congrat !',
 'inde',
 'complet race life min girli work fun bloodi hot ! also bump familiar face well done guy !',
 'woot truck fix ! workin',
 'yea melt lol',
 'want someth radic sure someth',
 'scratch tweet channel full ! celc',
 'sore everywher also found assess today actual shouldv last week teacher hate',
 'hmm well ive drawn doesnt look like',
 'tire least got blog done final could log blogspot use',
 'drum isnt today mean play nicol )',
 'awww didnt ask ?',
 'drove fisher feel sad']

In [25]:
labels[:15]

[0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]

### Make sure that the Datasets have equal lengths with each other

In [26]:
print(len(sentences))
print(len(preprocessed_sentences))
print(len(labels))

1280000
1280000
1280000


### Good, now let's check if empty sentences have been created from the pipeline, and if yes, remove those rows from the dataset.

In [27]:
indices_of_empty_sentences = [index for index in range(len(preprocessed_sentences)) if not preprocessed_sentences[index]]
print(len(indices_of_empty_sentences))

4357


### We have many empty sentences in our dataset, which should be removed. Let's make a function for that.

In [28]:
def remove_empty_training_examples(sentences, labels):
  m = len(sentences)
  filtered_sentences = []
  filtered_labels = []
  for index in range(m):
    if sentences[index]:
      filtered_sentences.append(sentences[index])
      filtered_labels.append(labels[index])

  return filtered_sentences, filtered_labels

In [29]:
X, Y = remove_empty_training_examples(preprocessed_sentences, labels)
print(len(X))
print(len(Y))

1275643
1275643


In [30]:
X[:10]

['brokenpromis',
 'david carradin sad thai law sure fowl play ? mani peopl hang tie testicl ?',
 'b tell bro say congrat !',
 'inde',
 'complet race life min girli work fun bloodi hot ! also bump familiar face well done guy !',
 'woot truck fix ! workin',
 'yea melt lol',
 'want someth radic sure someth',
 'scratch tweet channel full ! celc',
 'sore everywher also found assess today actual shouldv last week teacher hate']

In [31]:
Y[:10]

[0, 0, 1, 1, 1, 0, 1, 0, 0, 0]

In [32]:
## Now it's time to create the training-testing sets

In [33]:
from sklearn.model_selection import train_test_split

# since we have 1 million instances, 2% for validation-testing is fine
train_sentences, val_test_sentences, train_labels, val_test_labels = train_test_split(X, Y, test_size=0.02, stratify=Y, random_state=42)
val_sentences, test_sentences, val_labels, test_labels = train_test_split(val_test_sentences, val_test_labels, test_size=0.5, stratify=val_test_labels, random_state=42)

In [34]:
train_sentences[-15:]

['watch love ny go grab cooki time fix nom nom nom bulk load sup',
 'someon miss miss',
 'thx ice type w left hand good',
 'wait train primark mum',
 'blade glori there noth like nun skate',
 'glass merlot bon iver settl even perfect',
 'headach start back head forehead oww',
 ': inde',
 'aww hope get better ive alway found wed child program inspir heartfelt',
 'msn may go youtub soon mayday tomorrow yay ! school',
 'feeel bad leah hannah collin maddi cara joel aden forgot littl girl name bad parent',
 'welcom !',
 'get readi eu elect head poll place poll worker dont think busi day',
 'count bless caus ive count everi lie im beyond bore',
 'first time twitter totali love susanboyl surpris see lost mayb isnt al bad']

In [35]:
train_labels[-15:]

[1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1]

## Finalize the sets and scale them

In [36]:
# this should take about a min to run
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train_sentences)
y_train = train_labels

X_val = tfidf.transform(val_sentences)
y_val = val_labels

X_test = tfidf.transform(test_sentences)
y_test = test_labels

In [37]:
X_train[:10]

<10x201321 sparse matrix of type '<class 'numpy.float64'>'
	with 94 stored elements in Compressed Sparse Row format>

## Define a custom Grid Search function (GridSearchCV() is having trouble working)

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def CustomGridSearch(X_train, y_train, X_val, y_val, params, verbose=0):
  c_values = params["C"]
  penalties = params["penalties"]
  max_iterations = params["max_iter"]

  # use dictionaries to store information
  models = {}
  validation_accuracies = {}

  for c in c_values:
    for penalty in penalties:
      if verbose:
        print("Now checking model with configuration: C: {}, penalty: {}\n".format(c, penalty))

      models[(c, penalty)] = LogisticRegression(penalty=penalty, solver="saga", max_iter=max_iterations)
      models[(c, penalty)].fit(X_train, y_train)

      y_pred = models[(c, penalty)].predict(X_val)
      validation_accuracies[(c,penalty)] = f1_score(y_val, y_pred, average='macro')

  # find the best configuration of hyperparameters
  min_loss = float('inf')
  best_configuration = None

  for configuration in validation_accuracies.keys():
    if validation_accuracies[configuration] < min_loss:
      min_loss = validation_accuracies[configuration]
      best_configuration = configuration

  # return the best configuration and the best model
  return best_configuration, models[best_configuration]

### Define here the Hyperprameters that we want to tune in the model

In [39]:
hyperparameters = {
    "C": [0.01, 0.1, 1, 10],
    "penalties": ["l2"],
    "max_iter": 200
}

## Time for Training!

In [40]:
from datetime import datetime

# keep track of when the training started
start = datetime.now()
dt_string = start.strftime("%d/%m/%Y %H:%M:%S")
print("start time =", dt_string, "\n")

# train the models
best_configuration, best_model = CustomGridSearch(X_train, y_train, X_val, y_val, hyperparameters, verbose=1)

# see how much time it took
end = datetime.now()
dt_string = end.strftime("%d/%m/%Y %H:%M:%S")
print("end time =", dt_string)

start time = 12/10/2020 22:59:55 

Now checking model with configuration: C: 0.01, penalty: l2

Now checking model with configuration: C: 0.1, penalty: l2

Now checking model with configuration: C: 1, penalty: l2

Now checking model with configuration: C: 10, penalty: l2

end time = 12/10/2020 23:01:50


### Print some info about it and we are done!

In [41]:
from sklearn.metrics import classification_report

# make the predictions for the Test set
y_pred = best_model.predict(X_test)

print("The best classifier has the hyperprameters: (lambda, penalty) = {}".format(best_configuration))
print("Its f1 score in the test set is: {}".format(f1_score(y_test, y_pred, average='macro')))
print("The classification report for the Test set can be found below:\n")
print(classification_report(y_test, y_pred))

The best classifier has the hyperprameters: (lambda, penalty) = (0.1, 'l2')
Its f1 score in the test set is: 0.7810269324826065
The classification report for the Test set can be found below:

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      6372
           1       0.77      0.79      0.78      6385

    accuracy                           0.78     12757
   macro avg       0.78      0.78      0.78     12757
weighted avg       0.78      0.78      0.78     12757



## Make a prediction for fun

In [42]:
# of course positive prediction
best_model.predict(tfidf.transform(["assignment has finished finalli"]))

array([1])

In [43]:
# make also a negative one
best_model.predict(tfidf.transform(["assignment hard"]))

array([0])