In [1]:
# Building Text Classifiers
# - Using custom labels to classify text data. 
# - Type of supervised learning algorithm. 
# - Algorithms we'll use: Logistic Regression, Naive Bayes, Linear SVC


# Logistic Regression
# - Goal is to predict class
# - Good starting point for text classification.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report   

data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
("that was the best meal i've ever had in my life", "positive"),
("i feel so grateful for everything i have in my life", "positive"),
("i received a promotion at work and i couldn't be happier", "positive"),
("watching a beautiful sunset always fills me with joy", "positive"),
("my partner surprised me with a thoughtful gift and it made my day", "positive"),
("i am so proud of my daughter for graduating with honors", "positive"),
("listening to my favorite music always puts me in a good mood", "positive"),
("i love the feeling of accomplishment after completing a challenging task", "positive"),
("i am excited to go on vacation next week", "positive"),
("i feel so overwhelmed with work and responsibilities", "negative"),
("the traffic during my commute is always so frustrating", "negative"),
("i received a parking ticket and it ruined my day", "negative"),
("i got into an argument with my partner and we're not speaking", "negative"),
("i have a headache and i feel terrible", "negative"),
("i received a rejection letter for the job i really wanted", "negative"),
("my car broke down and it's going to be expensive to fix", "negative"),
("i'm feeling sad because i miss my friends who live far away", "negative"),
("i'm frustrated because i can't seem to make progress on my project", "negative"),
("i'm disappointed because my team lost the game", "negative")],
columns=["text", "sentiment"])

# Goal - build an algorithm that can learn from this data and predict the sentiment of new text. 


In [3]:
# Shuffle the data - frac = 1 means 100% of the data, reset index means reset the index of the dataframe
data = data.sample(frac=1).reset_index(drop=True) 


In [19]:
X = data["text"]
y = data["sentiment"]

count_vec = CountVectorizer()

In [44]:
count_vec_fit = count_vec.fit_transform(X)
bag_of_words = pd.DataFrame(count_vec_fit.toarray(), columns=count_vec.get_feature_names_out())

In [45]:
print(bag_of_words)

    accomplishment  after  always  am  an  and  argument  at  away  be  ...  \
0                0      0       0   0   0    1         0   0     0   0  ...   
1                0      0       0   1   0    0         0   0     0   0  ...   
2                0      0       0   0   0    1         0   0     0   0  ...   
3                0      0       0   0   0    0         0   0     0   0  ...   
4                0      0       1   0   0    0         0   0     0   0  ...   
5                0      0       0   0   0    0         0   0     1   0  ...   
6                0      0       1   0   0    0         0   0     0   0  ...   
7                0      0       0   0   0    0         0   0     0   0  ...   
8                0      0       0   0   0    0         0   0     0   0  ...   
9                0      0       1   0   0    0         0   0     0   0  ...   
10               0      0       0   1   0    0         0   0     0   0  ...   
11               0      0       0   0   0    1      

In [46]:
# Spilt the data into training and testing sets
# - Training set is used to train the model
# - Testing set is used to evaluate the model

# Features : input data (x) = bag of words sentences
# Labels : output data (y) = sentiment

X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, 
    y, 
    test_size=0.25, # 30% of the data is used for testing
    random_state=8) # random state is used to ensure the same split every time the code is run

lr = LogisticRegression(random_state=1).fit(X_train, y_train) # fit the model to the training data

y_pred_lr = lr.predict(X_test) # predict the sentiment of the testing data

In [47]:
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}")

Logistic Regression Accuracy: 0.40


In [48]:
print(classification_report(y_test, y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
    positive       0.33      0.50      0.40         2

    accuracy                           0.40         5
   macro avg       0.42      0.42      0.40         5
weighted avg       0.43      0.40      0.40         5



In [49]:
# Precision : out of all the positive predictions, how many were correct. 
# Recall : out of all the actual positive cases, how many did the model predict correctly. 
# F1 Score : harmonic mean of precision and recall. 

In [50]:
# Improving the accuracy of the model using Naive Bayes
# - Naive Bayes is a probabilistic classifier that uses Bayes' theorem to classify data. 
# - Naive Bayes combines word counts mathematically to calculate the probability of a class. 
# - It's called "naive" because it assumes that the words in the text are independent of each other.
 

In [51]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB() # Initialize the model
nb.fit(X_train, y_train) # Fit the model to the training data


In [52]:
y_pred_nb = nb.predict(X_test) # Predict the sentiment of the testing data

In [53]:
accuracy_score(y_pred_nb, y_test)

0.4

In [54]:
# Linear SVC
# - Finds the best possible boundary between classes. 
# - Uses support vectors to find the best boundary. 
# - For two data points, the boundary is a line. 
# - For three data points, the boundary is a plane. 
# - For more data points, the boundary is a hyperplane. 

In [55]:
from sklearn.linear_model import SGDClassifier 
svm = SGDClassifier().fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_score(y_pred_svm, y_test)



0.2

In [56]:
# The result from the model aren't good. In order to improve the accuracy we can:
# - Revist the data 
# - Clean the data further
# - Add more data