Logistic Regression

In [119]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [120]:
data = pd.DataFrame([
    ("I’m so proud of finishing my project on time!", "positive"),
    ("The weather is amazing, it makes me feel alive.", "positive"),
    ("I received great feedback from my manager today.", "positive"),
    ("Hanging out with friends always lifts my mood.", "positive"),
    ("I love how peaceful it feels in the morning.", "positive"),
    ("That compliment really made my day better.", "positive"),
    ("I feel energized after my workout session.", "positive"),
    ("I’m thankful for the little things in life.", "positive"),
    ("Learning new skills excites me a lot.", "positive"),
    ("I’m smiling because everything feels right today.", "positive"),
    ("I enjoy helping people whenever I can.", "positive"),
    ("Watching my favorite show relaxes me completely.", "positive"),
    ("My team’s hard work finally paid off.", "positive"),
    ("I’m happy to see my family after so long.", "positive"),
    ("This book I’m reading is so inspiring.", "positive"),
    ("I’m frustrated because my laptop crashed again.", "negative"),
    ("I feel tired and drained after a long day.", "negative"),
    ("The traffic today made me so annoyed.", "negative"),
    ("I got a really disappointing grade in my exam.", "negative"),
    ("I feel lonely when no one checks on me.", "negative"),
    ("I’m upset because I argued with my best friend.", "negative"),
    ("My internet connection is so unreliable.", "negative"),
    ("It’s exhausting dealing with constant stress.", "negative"),
    ("I feel angry when people don’t listen to me.", "negative"),
    ("I regret missing an important opportunity.", "negative"),
    ("I’m disappointed that I didn’t get the promotion.", "negative"),
    ("My head hurts and I can’t focus properly.", "negative"),
    ("I feel anxious about tomorrow’s presentation.", "negative"),
    ("I’m sad because my vacation got cancelled.", "negative"),
    ("Everything feels overwhelming right now.", "negative")
], columns=['text', 'sentiment'])


In [121]:
data = data.sample(frac=1).reset_index(drop = True) #It shuffles the data from our dataset

In [122]:
X = data['text']
Y = data['sentiment']

Using Bag of Words

In [123]:
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(X)
bag_of_words = pd.DataFrame(countvec_fit.toarray(), columns = countvec.get_feature_names_out())
bag_of_words

Unnamed: 0,about,after,again,alive,always,amazing,an,and,angry,annoyed,...,unreliable,upset,vacation,watching,weather,when,whenever,with,work,workout
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Using Tf-IDF

In [80]:
#tfidf_vec = TfidfVectorizer()
#tfidfvec_fit = tfidf_vec.fit_transform(X)
#tfidf_bag = pd.DataFrame(tfidfvec_fit.toarray(), columns = tfidf_vec.get_feature_names_out())
#tfidf_bag

In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words, Y, test_size = 0.3, random_state = 7)
lr = LogisticRegression(random_state = 1).fit(X_train,Y_train)
Y_pred_lr = lr.predict(X_test)

In [83]:
accuracy_score(Y_pred_lr, Y_test)

0.1111111111111111

In [84]:
print(classification_report(Y_test, Y_pred_lr))

              precision    recall  f1-score   support

    negative       0.12      0.50      0.20         2
    positive       0.00      0.00      0.00         7

    accuracy                           0.11         9
   macro avg       0.06      0.25      0.10         9
weighted avg       0.03      0.11      0.04         9



Precision = correct positive predictions / all predicted positives.

Recall = correct positive predictions / all actual positives.

F1-score = harmonic mean of precision & recall.

Support = number of true samples for each class.

Naive Bayes - Slightly better then LR

In [111]:
from sklearn.naive_bayes import MultinomialNB

In [112]:
X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words, Y, test_size = 0.3, random_state = 7)
nb = MultinomialNB().fit(X_train, Y_train)
Y_pred_nb = nb.predict(X_test)
accuracy_score(Y_pred_nb, Y_test)

0.2222222222222222

In [113]:
print(classification_report(Y_test, Y_pred_nb))

              precision    recall  f1-score   support

    negative       0.25      0.20      0.22         5
    positive       0.20      0.25      0.22         4

    accuracy                           0.22         9
   macro avg       0.23      0.23      0.22         9
weighted avg       0.23      0.22      0.22         9



Linear Support Vector Machine

In [124]:
from sklearn.linear_model import SGDClassifier
X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words, Y, test_size = 0.3, random_state = 7)
svm = SGDClassifier().fit(X_train, Y_train)
Y_pred_svm = svm.predict(X_test)
accuracy_score(Y_pred_svm, Y_test)

0.3333333333333333

In [125]:
print(classification_report(Y_test, Y_pred_svm))

              precision    recall  f1-score   support

    negative       0.40      0.40      0.40         5
    positive       0.25      0.25      0.25         4

    accuracy                           0.33         9
   macro avg       0.33      0.33      0.33         9
weighted avg       0.33      0.33      0.33         9

