In [926]:
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import seaborn as sns

In [927]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train = df_train.drop(['textID','selected_text'],axis=1)
df_test = df_test.drop(['textID'],axis=1)

Now we only have positive and negative sentiments

In [928]:
df_train = df_train[df_train['sentiment']!='neutral']
df_train_features = df_train['text']
df_train_labels = df_train['sentiment']

In [929]:
df_train_features

1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
6        2am feedings for the baby are fun when he is a...
                               ...                        
27475                                       enjoy ur night
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
Name: text, Length: 16363, dtype: object

In [930]:
df_train_labels

1        negative
2        negative
3        negative
4        negative
6        positive
           ...   
27475    positive
27476    negative
27477    negative
27478    positive
27479    positive
Name: sentiment, Length: 16363, dtype: object

<h1>Look at the data and see if we have any missing values</h1>

In [931]:
display(df_train)

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive
...,...,...
27475,enjoy ur night,positive
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive


In [932]:
df_train.describe()

Unnamed: 0,text,sentiment
count,16363,16363
unique,16363,2
top,Sooo SAD I will miss you here in San Diego!!!,positive
freq,1,8582


In [933]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16363 entries, 1 to 27479
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       16363 non-null  object
 1   sentiment  16363 non-null  object
dtypes: object(2)
memory usage: 383.5+ KB


<h3>Since it is quite small, we can just drop it</h3>

In [934]:
df_train.isna().sum()

text         0
sentiment    0
dtype: int64

In [935]:
df_test.isna().sum()

text         0
sentiment    0
dtype: int64

In [936]:
df_train.dropna(inplace=True)

In [937]:
df_train.isna().sum()

text         0
sentiment    0
dtype: int64

<h3>We can look more in detail into what these tweets contain</h3>

We can see that we have cases in which the symbol ****  is being used to block a negative word. Often use in curse words for example f*** where, we can still see that the word does not have to entirely be censor.

In [938]:
df_train.shape

(16363, 2)

In [939]:
df_train['text'].head()

1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
6    2am feedings for the baby are fun when he is a...
Name: text, dtype: object

We download the stopwords that we will use to preprocess the data

In [940]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daguila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [941]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

<h1>Preprocess</h1>

<h3>Remove hyper links, marks and styles</h3>

In [942]:
def removeHyperMarksStyles(df,text_feature):
    tweetPreprocessing = ""
    for index, row in df.iterrows():
        tweetPreprocessing = re.sub(r'^RT[\s]+','',row[text_feature]) #removes retweet text
        tweetPreprocessing = re.sub(r'https?://[^\s\n\r]+','',tweetPreprocessing) #removes hyperlinks
        tweetPreprocessing = re.sub(r'#', '', tweetPreprocessing) #removes hashtags
        df.at[index,text_feature] = tweetPreprocessing
    return df

In [943]:
df_train = removeHyperMarksStyles(df_train,'text')

Since we are using tweets, nltk has a great tokenizer for it that allows us to remove caps, strips handles and reduces the length

In [944]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) 

<h3>Tokenize</h3>

In [945]:
def getTokenizeList(df,text_feature,tokenizer):
    tokenizeList = []
    for index, row in df.iterrows():
        tokenizeList.append(tokenizer.tokenize(row[text_feature]))
    return tokenizeList

All the tweets have been tokenized

In [946]:
tweet_tokens = getTokenizeList(df_train,'text',tokenizer)
len(tweet_tokens)

16363

Remove Stop Words with NLTK

In [947]:
stopwords_english = stopwords.words('english')

In [948]:
def cleanTweetsOfStopWords(tweet_tokens_list):
    for index, tweet_tokens in enumerate(tweet_tokens_list):
        clean_token = []
        for word in tweet_tokens:
            if(word not in stopwords_english and word not in string.punctuation):
                clean_token.append(word)
        tweet_tokens_list[index] = clean_token
    return tweet_tokens_list

In [949]:
tweet_tokens[0]

['sooo',
 'sad',
 'i',
 'will',
 'miss',
 'you',
 'here',
 'in',
 'san',
 'diego',
 '!',
 '!',
 '!']

We can see how we have remove stop words from the list

In [950]:
tweet_clean_tokens = cleanTweetsOfStopWords(tweet_tokens)
tweet_clean_tokens[0]

['sooo', 'sad', 'miss', 'san', 'diego']

Instead of Stemming we are using Lemmatization since it is more accurate, as a trade-off it requires more computing power.

In [951]:
from nltk.stem import WordNetLemmatizer

In [952]:
lemmatizer = WordNetLemmatizer()

In [953]:
def getLemmatizeList(tweet_clean_tokens_list,lemmatizer):
    for index, tweet_clean_tokens in enumerate(tweet_clean_tokens_list):
        tweet_lemm = []
        for word in tweet_clean_tokens:
            tweet_lemm.append(lemmatizer.lemmatize(word))
        tweet_clean_tokens_list[index] = tweet_lemm
    return tweet_clean_tokens_list

In [954]:
processed_tweets_lemm = getLemmatizeList(tweet_clean_tokens,lemmatizer)

In [955]:
def buildFreq(tweets, labels):
    labelist = np.squeeze(labels).tolist()
    freqs = {}
    #zip gives tuples from both of the given input
    for label, tweet in zip(labelist, tweets):
        for word in tweet:
            pair = (word, label)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [956]:
freqs = buildFreq(processed_tweets_lemm,df_train_labels)
freqs

{('sooo', 'negative'): 93,
 ('sad', 'negative'): 397,
 ('miss', 'negative'): 442,
 ('san', 'negative'): 8,
 ('diego', 'negative'): 6,
 ('bos', 'negative'): 8,
 ('bullying', 'negative'): 1,
 ('...', 'negative'): 1628,
 ('interview', 'negative'): 9,
 ('leave', 'negative'): 54,
 ('alone', 'negative'): 35,
 ('son', 'negative'): 16,
 ('put', 'negative'): 40,
 ('release', 'negative'): 5,
 ('already', 'negative'): 102,
 ('bought', 'negative'): 13,
 ('2am', 'positive'): 2,
 ('feeding', 'positive'): 3,
 ('baby', 'positive'): 66,
 ('fun', 'positive'): 348,
 ('smile', 'positive'): 39,
 ('coo', 'positive'): 1,
 ('journey', 'positive'): 8,
 ('wow', 'positive'): 111,
 ('...', 'positive'): 1369,
 ('u', 'positive'): 435,
 ('became', 'positive'): 4,
 ('cooler', 'positive'): 4,
 ('hehe', 'positive'): 37,
 ('possible', 'positive'): 8,
 ('really', 'positive'): 309,
 ('like', 'positive'): 430,
 ('song', 'positive'): 115,
 ('love', 'positive'): 946,
 ('story', 'positive'): 12,
 ('taylor', 'positive'): 17,
 

In [957]:
def sigmoid(z):
    h = 1/(1+np.exp(-z))
    return h

In [958]:
def gradientDescent(x, y, theta, alpha, num_iters):
    m = x.shape[0]

    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)

        J = float(-1)/float(m) *(np.dot(y.transpose(),np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))
        theta = theta - (alpha/m)* np.dot(x.transpose(),(h-y))
        
    J = float(J)
    return J, theta

In [959]:
def extract_features(tweetL, freqs):
    x = np.zeros(3)

    x[0] = 1 #bias
    for word in tweetL:
        pkey = (word, "positive")
        nkey = (word, "negative")

        x[1] += freqs.get(pkey,0)
        x[2] += freqs.get(nkey,0)

    x = x[None, :]
    assert(x.shape == (1,3)) #tests if this shape is correct
    return x
        

In [960]:
extract_features(processed_tweets_lemm[0],freqs=freqs)

array([[  1., 154., 946.]])

In [961]:
feature_list = np.zeros((len(df_train_features),3))

for index, each_tweet_list in enumerate(processed_tweets_lemm):
    feature_list[index,:] = extract_features(processed_tweets_lemm[index],freqs=freqs)
feature_list

array([[1.000e+00, 1.540e+02, 9.460e+02],
       [1.000e+00, 1.374e+03, 1.637e+03],
       [1.000e+00, 6.600e+01, 9.800e+01],
       ...,
       [1.000e+00, 4.010e+02, 2.380e+02],
       [1.000e+00, 1.825e+03, 7.170e+02],
       [1.000e+00, 3.100e+01, 1.100e+01]])

<h1>Train the Model</h1>

In [962]:
feature_list.shape

(16363, 3)

In [963]:
df_train_labels.shape

(16363,)

In [964]:
df_train_labels = df_train_labels.replace(["positive","negative"], [1,0])
df_train_labels.shape

(16363,)

In [965]:
df_train_labels = df_train_labels.values.reshape(16363,1)
type(df_train_labels)
df_train_labels.shape

(16363, 1)

In [966]:
df_train_feature_list = feature_list

In [967]:
J, theta = gradientDescent(feature_list,df_train_labels, np.zeros((3,1)),1e-9,20000)

  J = float(J)


Had to change the series of the labels into an nparray of 1d

In [968]:
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t,8) for t in np.squeeze(theta)]}")

The cost after training is 0.55906374.
The resulting vector of weights is [-3.9e-07, 0.00106445, -0.00112081]


<h1>Testing</h1>

In [969]:
def predict_tweet(tweet,freqs, theta):
    x = extract_features(tweet,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred

While testing we can see that some tweets that are shown as bad have given positive results. By increasing iterations we have created a more accurate model.

In [970]:
my_tweet = 'I am bad'
predict_tweet(my_tweet,freqs,theta)

array([[0.49520517]])

In [971]:
def test_logistic_regression(test_x, test_y,freqs, theta, predict_tweet=predict_tweet):
    y_hat = []
    for tweet in test_x:
        y_pred = predict_tweet(tweet,freqs,theta)
        
        if y_pred > 0.5:
            y_hat.append(1.0)
        else:
            y_hat.append(0.0)
    
    accuracy = (np.asarray(y_hat) == np.squeeze(test_y)).sum()/len(test_x)

    return accuracy
    

In [972]:
accuracy = test_logistic_regression(df_train_features,df_train_labels,freqs,theta)
print(f"Model's Accuracy = {accuracy:.4f}")

Model's Accuracy = 0.5297
