In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
("that was the best meal i've ever had in my life", "positive"),
("i feel so grateful for everything i have in my life", "positive"),
("i received a promotion at work and i couldn't be happier", "positive"),
("watching a beautiful sunset always fills me with joy", "positive"),
("my partner surprised me with a thoughtful gift and it made my day", "positive"),
("i am so proud of my daughter for graduating with honors", "positive"),
("listening to my favorite music always puts me in a good mood", "positive"),
("i love the feeling of accomplishment after completing a challenging task", "positive"),
("i am excited to go on vacation next week", "positive"),
("i feel so overwhelmed with work and responsibilities", "negative"),
("the traffic during my commute is always so frustrating", "negative"),
("i received a parking ticket and it ruined my day", "negative"),
("i got into an argument with my partner and we're not speaking", "negative"),
("i have a headache and i feel terrible", "negative"),
("i received a rejection letter for the job i really wanted", "negative"),
("my car broke down and it's going to be expensive to fix", "negative"),
("i'm feeling sad because i miss my friends who live far away", "negative"),
("i'm frustrated because i can't seem to make progress on my project", "negative"),
("i'm disappointed because my team lost the game", "negative")],
                    columns= ['text', 'sentiment'])

#### Shuffling our data and re-setting index

In [3]:
data = data.sample(frac = 1).reset_index(drop = True)

#### We now prepare the inputs for our algorithm

In [4]:
x = data['text']
y = data['sentiment']

#### Vectorizing our text
We use the BoW approach with the CountVectorizer function- converting text data into BoW

In [5]:
count_vec = CountVectorizer()

In [6]:
count_vec_fit = count_vec.fit_transform(x)

In [1]:
bag_of_words = pd.DataFrame(count_vec_fit.toarray(), columns = count_vec.get_feature_names_out())
bag_of_words

NameError: name 'pd' is not defined

#### Splitting our data into training and testing sets
We want to ensure we have the same train and test split each time we run the code hence the 'random_state()'

In [8]:
x_train, x_test, y_train, y_test = train_test_split(bag_of_words, y, test_size = 0.3, random_state = 7)

### Logistic Regression Model
We now train our model

In [9]:
lr = LogisticRegression(random_state = 1).fit(x_train, y_train)

#### We now generate prediction to see how the model generalizes with unseen data

In [10]:
y_pred_lr = lr.predict(x_test)

#### We now measure model performance- accuracy
We compare true labels with predicted label

In [11]:
accuracy_score(y_pred_lr, y_test)

0.16666666666666666

#### Classification report 
We want a more detailed information on the performance

In [12]:
print(classification_report(y_test, y_pred_lr, zero_division = 0))

              precision    recall  f1-score   support

    negative       0.17      1.00      0.29         1
    positive       0.00      0.00      0.00         5

    accuracy                           0.17         6
   macro avg       0.08      0.50      0.14         6
weighted avg       0.03      0.17      0.05         6



##### precision: out of all the sentences predicted by the model as + or -, which one was actually correct?, 
##### recall: out of all the sentences that are truly + or -, what proportion did the model correctly find?
##### f1-score: a number (from 0-1) that combines precision and recall into one measure.

### The Naive-Bayes classifier 
We want to improve the low accuracy score obtained. NB classifier works using probability 

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
nb = MultinomialNB().fit(x_train, y_train)

In [15]:
y_pred_nb = nb.predict(x_test)

In [16]:
accuracy_score(y_pred_nb, y_test)

0.3333333333333333

In [17]:
print(classification_report(y_test, y_pred_nb, zero_division = 0))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.67      0.40      0.50         5

    accuracy                           0.33         6
   macro avg       0.33      0.20      0.25         6
weighted avg       0.56      0.33      0.42         6



The Naive-Bayes model gives a better prediction but there's still room for improvement

### Linear Support Vector Machine
We want to improve the accuracy further. LSVM helps to find the best possible boundary that separates the boundaries (+&-)

In [18]:
from sklearn.linear_model import SGDClassifier

In [19]:
svm = SGDClassifier().fit(x_train, y_train)

In [20]:
y_pred_svm = svm.predict(x_test)

In [21]:
accuracy_score(y_pred_svm, y_test)

0.16666666666666666

In [22]:
print(classification_report(y_test, y_pred_svm, zero_division = 0))

              precision    recall  f1-score   support

    negative       0.17      1.00      0.29         1
    positive       0.00      0.00      0.00         5

    accuracy                           0.17         6
   macro avg       0.08      0.50      0.14         6
weighted avg       0.03      0.17      0.05         6



We haven't seen any much improvement either. It's either we clean our data further or add more