## NLP with Logistic Regression

We will perform sentiment analysis on tweet dataset using Logistic Regression<br>
The steps ionvolved in this are<br>
1. preprocessing (tokenization, removing stopwords, stemming)
2. Frequency count of each word in the each category i.e. positive and negative
3. vectorization of the words
4. training the model

In [14]:
import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import PorterStemmer
import string
import re
import numpy as np

In [15]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [16]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [17]:
tweets = positive_tweets + negative_tweets

In [18]:
labels = np.append(np.ones((len(positive_tweets), 1)), np.zeros((len(negative_tweets), 1)), axis=0)

In [19]:
labels.shape

(10000, 1)

# Preprocessing

### Remove unwanted characters

In [20]:
tweet = tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [21]:
tweet = re.sub(r'^RT[\s]+', '', tweet)
tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
tweet = re.sub(r'#', '', tweet)
tweet

'My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… '

### Tokenize

In [22]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
tweet_tokens = tokenizer.tokenize(tweet)
tweet_tokens

['my',
 'beautiful',
 'sunflowers',
 'on',
 'a',
 'sunny',
 'friday',
 'morning',
 'off',
 ':)',
 'sunflowers',
 'favourites',
 'happy',
 'friday',
 'off',
 '…']

### Remove stopwords

In [23]:
stopwords_english = stopwords.words('english')
clean_tweet = []

for word in tweet_tokens:
    if(word not in stopwords_english and word not in string.punctuation):
        clean_tweet.append(word)

clean_tweet

['beautiful',
 'sunflowers',
 'sunny',
 'friday',
 'morning',
 ':)',
 'sunflowers',
 'favourites',
 'happy',
 'friday',
 '…']

### Stemming

In [24]:
stemmer = PorterStemmer()
stem_tweet = []

for word in clean_tweet:
    stem_word = stemmer.stem(word)
    stem_tweet.append(stem_word)

stem_tweet

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']

> now we will create a function which will perform all these steps

In [25]:
def process_tweet(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    stopwords_english = stopwords.words('english')
    clean_tweet = []

    for word in tweet_tokens:
        if(word not in stopwords_english and word not in string.punctuation):
            clean_tweet.append(word)

    stemmer = PorterStemmer()
    stem_tweet = []

    for word in clean_tweet:
        stem_word = stemmer.stem(word)
        stem_tweet.append(stem_word)

    return stem_tweet

# Feature Engineering and Vectorization

### Frequency Count

In [26]:
tweet = tweets[2277]
after_preprocessing = process_tweet(tweet)
after_preprocessing

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']

In [28]:
tweet_label = labels[2277]
tweet_label

array([1.])

In [31]:
freqs = {}
for word in after_preprocessing:
    pair = (word, 1)
    if(pair in freqs.keys()):
        freqs[pair] += 1
    else:
        freqs[pair] = 1

freqs

{('beauti', 1): 1,
 ('sunflow', 1): 2,
 ('sunni', 1): 1,
 ('friday', 1): 2,
 ('morn', 1): 1,
 (':)', 1): 1,
 ('favourit', 1): 1,
 ('happi', 1): 1,
 ('…', 1): 1}

> we will calculate frequency of each word in each label

In [36]:
def build_freqs(tweets,labels):
    freqs = {}
    label_list = np.squeeze(labels).tolist()
    for label, tweet in zip(label_list, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            if(pair in freqs.keys()):
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

In [37]:
freqs = build_freqs(tweets, labels)

In [38]:
freqs

{('followfriday', 1.0): 25,
 ('top', 1.0): 32,
 ('engag', 1.0): 7,
 ('member', 1.0): 16,
 ('commun', 1.0): 33,
 ('week', 1.0): 83,
 (':)', 1.0): 3568,
 ('hey', 1.0): 76,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 97,
 ('call', 1.0): 37,
 ('contact', 1.0): 7,
 ('centr', 1.0): 2,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 8,
 ('assist', 1.0): 1,
 ('mani', 1.0): 33,
 ('thank', 1.0): 620,
 ('listen', 1.0): 16,
 ('last', 1.0): 47,
 ('night', 1.0): 68,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 51,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 21,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 69,
 ('blue', 1.0): 9,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 6,
 ('profil', 1.0): 2,
 ('15', 1.0): 5,
 ('day', 1.0): 246,
 ('one', 1.0): 129,
 ('irresist', 1.0): 2,
 ('flipkartfashionfriday', 1.0): 17,
 ('like', 1.0): 233,
 ('keep', 1.0): 68,
 ('love', 1.0): 400,


### Vectorization

we will create a vector of dimension 1x3 for each tweet<br>
where 1st element will be bias<br>
2nd element will be sum of frequencies of word in when label is positive<br>
3rd element will be sum of frequencies of word in when label is negative<br>
x(i) = [1, sum of frequencies of word in when label is positive, sum of frequencies of word in when label is negative]

In [40]:
tweet = tweets[2277]
processed = process_tweet(tweet)
processed

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']

In [44]:
# vectorizing this tweet
x = np.zeros(3)
x[0] = 1 # bias term
x

array([1., 0., 0.])

In [45]:
for word in processed:
    if (word, 1.0) in freqs:
        x[1] += freqs[(word, 1.0)]
    if (word, 0.0) in freqs:
        x[2] += freqs[(word, 0.0)]

x

array([1.000e+00, 4.223e+03, 1.190e+02])

> in this manner we will vectorize all the tweets using vectorize function

In [46]:
def vectorize(tweets, freqs):
    processed = process_tweet(tweets)
    x = np.zeros(3)
    x[0] = 1
    for word in processed:
        if (word, 1.0) in freqs:
            x[1] += freqs[(word, 1.0)]
        if (word, 0.0) in freqs:
            x[2] += freqs[(word, 0.0)]
    return x

# Training the model

> creating the traing features and labels

In [48]:
train_pos = positive_tweets[:4000]
train_neg = negative_tweets[:4000]
train_x = train_pos + train_neg
train_y =  np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)

test_pos = positive_tweets[4000:]
test_neg = negative_tweets[4000:]
test_x = test_pos + test_neg
test_y =  np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [50]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i] = vectorize(train_x[i], freqs)
X

array([[1.000e+00, 3.764e+03, 7.200e+01],
       [1.000e+00, 4.464e+03, 5.170e+02],
       [1.000e+00, 3.759e+03, 1.600e+02],
       ...,
       [1.000e+00, 1.840e+02, 9.890e+02],
       [1.000e+00, 2.560e+02, 4.855e+03],
       [1.000e+00, 2.400e+02, 4.967e+03]])

In [51]:
Y = train_y