In [1]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sentence_transformers import SentenceTransformer

2024-11-13 11:06:43.578142: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
# Load data.
# The previous version read the file with header=None and then discarded the
# first row with [1:] on each column. Telling pandas that row 0 is the header
# (header=0) while supplying our own column names drops that same row at parse
# time, so no manual slicing is needed afterwards.
data = pd.read_csv(
    "../data_cleaned/train_data_cleaned.csv",
    header=0,
    names=["tweet", "sentiment"],
)

# Split data: 70% train / 30% hold-out, fixed seed for reproducibility
train_tweets, test_tweets, train_labels, test_labels = train_test_split(
    data["tweet"].to_numpy(),
    data["sentiment"].to_numpy().astype(int),
    test_size=0.3,
    random_state=42,
)

print("Train data shape:", train_tweets.shape)
print("Test data shape:", test_tweets.shape)

# Show one raw example (label is still in its original -1/0/1 encoding here)
print(train_tweets[0], train_labels[0])

# Adjust labels from -1, 0, 1 to 0, 1, 2
train_labels += 1
test_labels += 1

# Convert tweets to embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
train_embeddings = model.encode(train_tweets)
test_embeddings = model.encode(test_tweets)

# Every tweet must have exactly one embedding row and one label
assert len(train_embeddings) == len(train_labels)
assert len(test_embeddings) == len(test_labels)

# Embedding dimensionality, then number of training examples
print(len(train_embeddings[0]), len(train_embeddings))

Train data shape: (7782,)
Test data shape: (3336,)
which means Big Bird is a 1er Obama supports the wealthy and privileged Who knew -1
384 7782


In [35]:
# Carve the hold-out produced by train_test_split into two halves: the first
# half becomes the validation set, the second half remains the test set.
#
# test_tweets is never re-sliced, so its length is the size of the full
# hold-out. Only split while test_embeddings still has that full length:
# without this guard, re-running the cell would slice the already-halved
# test_embeddings again, leaving val_embeddings and val_labels with different
# lengths and shrinking the test set on every run.
holdout_half = len(test_tweets) // 2

if len(test_embeddings) == len(test_tweets):
    val_embeddings, test_embeddings = (
        test_embeddings[:holdout_half],
        test_embeddings[holdout_half:],
    )
    val_labels, test_labels = (
        test_labels[:holdout_half],
        test_labels[holdout_half:],
    )

# Features and labels must stay aligned in both halves
assert len(val_embeddings) == len(val_labels)
assert len(test_embeddings) == len(test_labels)

In [38]:
# Fit an RBF-kernel SVM on the training embeddings (fit() returns the
# estimator itself), then measure accuracy on the validation half.
svm_model = SVC(kernel='rbf', C=1, gamma='scale').fit(train_embeddings, train_labels)

svm_predictions = svm_model.predict(val_embeddings)
svm_accuracy = accuracy_score(y_true=val_labels, y_pred=svm_predictions)
print("SVM Val Accuracy:", svm_accuracy)

SVM Val Accuracy: 0.6474820143884892


In [39]:
# Score the already-fitted SVM on the test half.
# Note: this overwrites the validation predictions/accuracy held in the same names.
svm_predictions = svm_model.predict(test_embeddings)
svm_accuracy = accuracy_score(y_true=test_labels, y_pred=svm_predictions)
print("SVM Test Accuracy:", svm_accuracy)

SVM Test Accuracy: 0.644484412470024


In [23]:
# Fit a 7-nearest-neighbours classifier on the training embeddings, then
# measure accuracy on the validation half.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(train_embeddings, train_labels)

knn_predictions = knn_model.predict(val_embeddings)
knn_accuracy = accuracy_score(y_true=val_labels, y_pred=knn_predictions)
print("KNN Val Accuracy:", knn_accuracy)

KNN Val Accuracy: 0.5764388489208633


In [24]:
# Score the already-fitted KNN model on the test half.
# Note: this overwrites the validation predictions/accuracy held in the same names.
knn_predictions = knn_model.predict(test_embeddings)
knn_accuracy = accuracy_score(y_true=test_labels, y_pred=knn_predictions)
print("KNN Test Accuracy:", knn_accuracy)

KNN Test Accuracy: 0.6052158273381295


In [27]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (iteration cap raised to 1000) on the training
# embeddings, then measure accuracy on the validation half.
log_reg_model = LogisticRegression(max_iter=1000).fit(train_embeddings, train_labels)

log_reg_predictions = log_reg_model.predict(val_embeddings)
log_reg_accuracy = accuracy_score(y_true=val_labels, y_pred=log_reg_predictions)
print("Logistic Regression Val Accuracy:", log_reg_accuracy)

Logistic Regression Val Accuracy: 0.6196043165467626


In [28]:
# Score the already-fitted logistic regression on the test half.
# Note: this overwrites the validation predictions/accuracy held in the same names.
log_reg_predictions = log_reg_model.predict(test_embeddings)
log_reg_accuracy = accuracy_score(y_true=test_labels, y_pred=log_reg_predictions)
print("Logistic Regression Test Accuracy:", log_reg_accuracy)

Logistic Regression Test Accuracy: 0.6564748201438849


In [29]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (seeded so the run is repeatable) on the
# training embeddings, then measure accuracy on the validation half.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_embeddings, train_labels
)

rf_predictions = rf_model.predict(val_embeddings)
rf_accuracy = accuracy_score(y_true=val_labels, y_pred=rf_predictions)
print("Random Forest Val Accuracy:", rf_accuracy)

Random Forest Val Accuracy: 0.5881294964028777


In [30]:
# Score the already-fitted random forest on the test half.
# Note: this overwrites the validation predictions/accuracy held in the same names.
rf_predictions = rf_model.predict(test_embeddings)
rf_accuracy = accuracy_score(y_true=test_labels, y_pred=rf_predictions)
print("Random Forest Test Accuracy:", rf_accuracy)

Random Forest Test Accuracy: 0.6034172661870504


In [32]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes with default settings on the training embeddings,
# then measure accuracy on the validation half.
nb_model = GaussianNB().fit(train_embeddings, train_labels)

nb_predictions = nb_model.predict(val_embeddings)
nb_accuracy = accuracy_score(y_true=val_labels, y_pred=nb_predictions)
print("Naive Bayes Val Accuracy:", nb_accuracy)

Naive Bayes Val Accuracy: 0.5053956834532374


In [33]:
# Score the already-fitted Naive Bayes model on the test half.
# Note: this overwrites the validation predictions/accuracy held in the same names.
nb_predictions = nb_model.predict(test_embeddings)
nb_accuracy = accuracy_score(y_true=test_labels, y_pred=nb_predictions)
print("Naive Bayes Test Accuracy:", nb_accuracy)

Naive Bayes Test Accuracy: 0.5152877697841727
