# Logistic Regression sentiment analysis
This is just a chill and minimal sentiment analysis with numpy, NLTK and scikit-learn.<br>
Seriously, you can't go much simpler than that!

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression



In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataframe = pd.read_csv("sentiment_analysis.csv")
dataframe.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


Thanks to the person who collected this data, but we're only going to use the `text` and `sentiment` columns.<br>
Also, since week 1 of the DeepLearning.AI course introduces only negative/positive classification with logistic regression, we'll drop the rows with any other sentiment label

In [3]:
# Keep only the two polar classes and the two columns the model needs.
is_polar = dataframe["sentiment"].isin(["positive", "negative"])
data = dataframe.loc[is_polar, ["text", "sentiment"]]
data.head()

Unnamed: 0,text,sentiment
0,What a great day!!! Looks like dream.,positive
1,"I feel sorry, I miss you here in the sea beach",positive
2,Don't angry me,negative
3,We attend in the class just for listening teac...,negative
4,"Those who want to go, let them go",negative


In [4]:
# Check the class balance of the labels that remain after filtering.
data["sentiment"].value_counts()

sentiment
positive    166
negative    134
Name: count, dtype: int64

### Data Processing
Okay, this is the core part.<br>
Week 1 introduces **stop word removal** and **stemming**.
In Python we have the **nltk** module (the Natural Language Toolkit), which provides all these preprocessing steps.<br>
Doing all of them from scratch would be a lot of work (it requires implementing a stemming algorithm and building a stop-word list), so let's use this module

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [18]:
# NOTE(review): nothing visible in this notebook appears to use the punkt models
# (tokenization below goes through RegexpTokenizer) - confirm whether this download is still needed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/cheslaff/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# \w+ matches runs of letters/digits/underscore; every other character acts as a separator.
tokenizer = RegexpTokenizer(r"\w+")

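`RegexpTokenizer` with the pattern `\w+` splits a sentence into runs of word characters, dropping all punctuation (so a contraction like "Don't" becomes the two tokens `Don` and `t`)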

In [20]:
# Fetch the stop-word corpus and show the English list that will be filtered out.
nltk.download("stopwords")
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cheslaff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [157]:
# Pull the text and label columns out as NumPy arrays.
# Later cells overwrite the elements of X (raw string -> token list -> stemmed token list).
X = np.array(data["text"])
y = np.array(data["sentiment"])

In [158]:
# First sample, still a raw string at this point.
X[0]

'What a great day!!! Looks like dream.'

In [159]:
stopwords_set = set(stopwords.words("english"))

# Build the token lists from the untouched `data["text"]` column rather than
# rewriting X in place: the cell then gives the same result every time it runs,
# instead of failing on a second run when X already holds lists, not strings.
raw_texts = data["text"].to_numpy()
X = np.empty(len(raw_texts), dtype=object)  # object array: one token list per sample
for i, text in enumerate(raw_texts):
    # Stop-word membership is checked case-insensitively; surviving tokens keep their case.
    X[i] = [w for w in tokenizer.tokenize(text) if w.lower() not in stopwords_set]

In [160]:
# Same sample after tokenization and stop-word removal.
X[0]

['great', 'day', 'Looks', 'like', 'dream']

**Stemming Time!**

In [161]:
from nltk.stem import PorterStemmer

In [162]:
# Single stemmer instance, reused for the training text and inside predict().
ps = PorterStemmer()

In [163]:
# Replace each sample's token list with the list of its Porter stems.
for idx, tokens in enumerate(X):
    X[idx] = list(map(ps.stem, tokens))

In [164]:
# Same sample after stemming.
X[0]

['great', 'day', 'look', 'like', 'dream']

### Model Building!
Okay, in the course they introduced us to Logistic Regression based sentiment analysis.<br>
What data do we pass?<br>
For each word we save its frequency under each label: the number of positive sentences and the number of negative sentences it appears in.<br>
Then, for a given sentence, we sum the positive counts and the negative counts of its words - ta-da! We have 2 features in total

In [165]:
# Encode targets as 1 = positive, 0 = negative.
# Derived from the untouched `data["sentiment"]` column instead of from `y` itself:
# re-running `y = f(y)` on already-encoded labels would silently turn every label into 0.
y = (data["sentiment"].to_numpy() == "positive").astype(int)

In [166]:
# Inspect the encoded labels (1 = positive, 0 = negative).
y

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [167]:
# word -> count accumulators, one per class; filled by the counting loop in the next cell.
positive_scores = {}
negative_scores = {}

In [168]:
# Build the vocabulary and the per-class word counts in a single pass over the corpus.
#
# A word's score is the number of *sentences* of that class containing it
# (a word repeated inside one sentence still counts once), hence the set() below.
#
# The accumulators are reset here so that re-running this cell starts from zero
# instead of adding a second round of counts on top of the first.
vocab = set()
positive_scores = {}
negative_scores = {}

for sentence, label in zip(X, y):
    unique_words = set(sentence)
    vocab |= unique_words

    if label == 1:
        scores = positive_scores
    elif label == 0:
        scores = negative_scores
    else:
        continue  # neither class: contributes to the vocabulary only

    for word in unique_words:
        scores[word] = scores.get(word, 0) + 1

In [169]:
# Make sure every vocabulary word has an entry in BOTH dictionaries.
# The two defaults are set independently: with an if/elif chain, a word missing
# from positive_scores would never get its negative_scores default checked.
for word in vocab:
    positive_scores.setdefault(word, 0)
    negative_scores.setdefault(word, 0)

In [170]:
# Sanity check on an obviously negative word: prints (positive count, negative count).
print(positive_scores["hate"], negative_scores["hate"])  # Oh

0 3


In [171]:
# Two features per sample: the summed positive score and the summed negative
# score of its tokens (a token repeated in the sample is added each time).
X_inp = np.array([
    [
        float(sum(positive_scores[token] for token in tokens)),
        float(sum(negative_scores[token] for token in tokens)),
    ]
    for tokens in X
])
print(X_inp.shape)

(300, 2)


In [172]:
# First sample's [positive_sum, negative_sum] features next to its label.
X_inp[0], y[0]  # Well, kind of makes sense

(array([57., 17.]), 1)

In [173]:
# Fit scikit-learn's logistic regression with default settings on the two summed-score features.
# NOTE(review): the word counts behind X_inp were built from these same rows and labels,
# and no rows are held out - so nothing here measures accuracy on unseen data.
model = LogisticRegression()  # It's such a weird feeling after all the DL complex stuff
model.fit(X_inp, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [207]:
def predict(sentence):
    """Print the predicted sentiment ("Positive!" or "NEGATIVE!") for a raw sentence.

    The sentence goes through the same steps as the training text: regex
    tokenization, stop-word removal and Porter stemming. The per-word
    positive/negative scores are then summed into the two features the
    model was fitted on.

    Parameters
    ----------
    sentence : str
        Raw, unprocessed text.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    tokens = tokenizer.tokenize(sentence)
    kept = [w for w in tokens if w.lower() not in stopwords_set]
    stems = [ps.stem(w) for w in kept]

    # Words never seen in training contribute 0 to both sums. Look them up with
    # .get() so that predicting never inserts new keys into the shared score dicts.
    positive_sum = sum(positive_scores.get(word, 0) for word in stems)
    negative_sum = sum(negative_scores.get(word, 0) for word in stems)

    # One row, two columns - same layout (and float dtype) as the training matrix.
    features = np.array([[positive_sum, negative_sum]], dtype=float)
    prediction = model.predict(features)
    print("Positive!" if prediction[0] == 1 else "NEGATIVE!")

In [209]:
# Smoke test with a clearly negative, all-caps sentence.
predict("Hate HATE HATE FIGHT HATE!")

NEGATIVE!


In [210]:
# Smoke test with a clearly positive sentence.
predict("Love Love love!")

Positive!


In [212]:
# Mixed input: contains both a negative cue ("hated") and a positive one ("love").
predict("I hated everything, but now I love it!")

Positive!


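# Nice! It works!
A W E S O M E ✨

(At least on these three hand-picked sentences - there is no held-out test set yet, so the real accuracy is still unknown.)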