In [1]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
import numpy as np

2024-12-12 17:46:52.823286: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the cleaned tweets. The file is parsed without a header row and the two
# columns are named explicitly.
data = pd.read_csv(
    "../data_cleaned/train_data_cleaned.csv",
    header=None,
    names=["tweet", "sentiment"],
)

# The first parsed row is dropped from both columns before use
# (presumably the CSV's own header line, read as data because of header=None;
# verify against the file).
all_tweets = data["tweet"].to_numpy()[1:]
all_labels = data["sentiment"].to_numpy()[1:].astype(int)

# Hold out 10% of the rows for testing; the seed keeps the split reproducible.
train_tweets, test_tweets, train_labels, test_labels = train_test_split(
    all_tweets,
    all_labels,
    test_size=0.1,
    random_state=21,
)

print("Train data shape:", train_tweets.shape)
print("Test data shape:", test_tweets.shape)

# One sample, shown with its original (unshifted) label.
print(train_tweets[0], train_labels[0])

# Shift labels from {-1, 0, 1} to {0, 1, 2}.
train_labels = train_labels + 1
test_labels = test_labels + 1

# Encode tweets into sentence embeddings.
# Alternative encoder: SentenceTransformer("all-MiniLM-L6-v2")
model = SentenceTransformer("all-mpnet-base-v2")
train_embeddings = model.encode(train_tweets)
test_embeddings = model.encode(test_tweets)

# Embedding width, then number of training rows.
print(len(train_embeddings[0]), len(train_embeddings))

Train data shape: (10006,)
Test data shape: (1112,)
Obama can say what he wants but weve all seen what hes done in the past 4 years Actions speak louder than words HofstraDebate -1
768 10006


In [None]:
# Disabled experiment, kept as a no-op string literal (the trailing ';'
# suppresses its echo). The quoted code would carve the first half of the test
# split off as a validation set and keep the second half as the test set.
'''val_embeddings = test_embeddings[:len(test_tweets)//2]
val_labels = test_labels[:len(test_labels)//2]

test_embeddings = test_embeddings[len(test_tweets)//2:]
test_labels = test_labels[len(test_labels)//2:]''';

In [3]:
# Share of each class in the training labels, in percent.
unique, counts = np.unique(train_labels, return_counts=True)
percentages = counts / len(train_labels) * 100

for label, percentage in zip(unique, percentages):
    print(f"Label {label}: {percentage:.2f}%")

# Inverse-frequency weights, normalised so they sum to 1.
weights = 1 / percentages
weights /= weights.sum()
weights

Label 0: 43.34%
Label 1: 32.14%
Label 2: 24.52%


array([0.2429184 , 0.32759238, 0.42948923])

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over RBF-kernel SVMs.
# probability=True makes the SVC expose predict_proba, which AdaBoost's
# SAMME.R variant relies on.
svm = SVC(probability=True, kernel="rbf")

# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling
# was removed in 1.4, so `base_estimator=svm` raises TypeError on current
# releases. It is the first positional parameter in every version.
boosting = AdaBoostClassifier(svm, n_estimators=10, random_state=42)

# Train on the training embeddings and report per-class metrics on the test split.
boosting.fit(train_embeddings, train_labels)
y_pred = boosting.predict(test_embeddings)
report = classification_report(test_labels, y_pred, target_names=['class -1', 'class 0', 'class 1'])
print(report)



In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging (not boosting) over RBF-kernel SVMs.
# probability=True makes the SVC expose predict_proba, so the bagging ensemble
# can average class probabilities instead of falling back to hard voting.
svm = SVC(probability=True, kernel="rbf")

# The base estimator is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling
# was removed in 1.4, so `base_estimator=svm` raises TypeError on current
# releases. It is the first positional parameter in every version.
# NOTE: the variable keeps its original name `boosting` so any cell that reads
# it still works, even though the model is a bagging ensemble.
boosting = BaggingClassifier(svm, n_estimators=10, random_state=42)

# Train on the training embeddings and report per-class metrics on the test split.
boosting.fit(train_embeddings, train_labels)
y_pred = boosting.predict(test_embeddings)
report = classification_report(test_labels, y_pred, target_names=['class -1', 'class 0', 'class 1'])
print(report)

In [4]:
# Baseline RBF SVM: every one of the three classes gets weight 1.
class_weights = {label: 1 for label in range(3)}
svm_model = SVC(class_weight=class_weights, kernel='rbf')
svm_model.fit(X=train_embeddings, y=train_labels)

In [5]:
# Per-class metrics for the current `svm_model` on the held-out test split.
svm_predictions = svm_model.predict(test_embeddings)
report = classification_report(
    y_true=test_labels,
    y_pred=svm_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

              precision    recall  f1-score   support

    class -1       0.68      0.83      0.75       478
     class 0       0.64      0.51      0.57       359
     class 1       0.72      0.62      0.67       275

    accuracy                           0.68      1112
   macro avg       0.68      0.65      0.66      1112
weighted avg       0.67      0.68      0.67      1112



In [6]:
# RBF SVM weighted by the normalised inverse class frequencies in `weights`.
# Position k of `weights` is used as the weight for label k.
class_weights = dict(enumerate(weights))
svm_model = SVC(class_weight=class_weights, kernel='rbf')
svm_model.fit(X=train_embeddings, y=train_labels)

In [7]:
# Per-class metrics for the current `svm_model` on the held-out test split.
svm_predictions = svm_model.predict(test_embeddings)
report = classification_report(
    y_true=test_labels,
    y_pred=svm_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

              precision    recall  f1-score   support

    class -1       0.71      0.72      0.72       478
     class 0       0.59      0.55      0.57       359
     class 1       0.62      0.67      0.65       275

    accuracy                           0.65      1112
   macro avg       0.64      0.65      0.64      1112
weighted avg       0.65      0.65      0.65      1112



In [None]:
# Hand-rolled ensemble: N RBF SVMs, each trained on a random 30% subsample of
# the training set (drawn without replacement), combined by soft voting.
#
# Class ids are categories, not magnitudes. Averaging predicted ids and
# rounding (the previous approach) turns a 0-vs-2 disagreement into class 1.
# Here the per-class probabilities are averaged and the most probable class
# is taken instead.
N = 10
subsample_size = 0.3
rng = np.random.default_rng(42)  # seeded so the subsamples are reproducible

classes = np.unique(train_labels)
# Running sum of predicted probabilities, one column per class in `classes`.
all_predictions = np.zeros((test_labels.shape[0], len(classes)))

for i in range(N):
    print(f"Training model {i+1}")
    sample_indices = rng.choice(
        len(train_embeddings),
        int(subsample_size * len(train_embeddings)),
        replace=False,
    )
    X_subsample, y_subsample = train_embeddings[sample_indices], train_labels[sample_indices]

    # random_state pins the internal shuffling used for probability calibration.
    svm = SVC(probability=True, kernel="rbf", random_state=42)
    svm.fit(X_subsample, y_subsample)

    predictions = svm.predict_proba(test_embeddings)
    # predict_proba columns follow svm.classes_; map them onto the global class
    # order so a subsample that happens to miss a class still lines up.
    columns = np.searchsorted(classes, svm.classes_)
    all_predictions[:, columns] += predictions

# Average the probabilities over the N models and pick the most probable class.
mean_predictions = classes[(all_predictions / N).argmax(axis=1)]
report = classification_report(test_labels, mean_predictions, target_names=['class -1', 'class 0', 'class 1'])
print(report)

In [None]:
# Disabled experiment, kept as a no-op string literal (the trailing ';'
# suppresses its echo). The quoted code would concatenate the validation and
# test embeddings/labels and print a classification report for `svm_model` on
# the combined set.
# NOTE(review): the report shown under this cell lists different class supports
# from the other reports in this notebook, so it looks like leftover output
# from an earlier run. Confirm and clear it.
'''combined_embeddings = np.concatenate((val_embeddings, test_embeddings), axis=0)
combined_labels = np.concatenate((val_labels, test_labels), axis=0)

svm_predictions = svm_model.predict(combined_embeddings)
report = classification_report(combined_labels, svm_predictions, target_names=['class -1', 'class 0', 'class 1'])
print(report)''';

              precision    recall  f1-score   support

    class -1       0.69      0.83      0.75       482
     class 0       0.64      0.53      0.58       366
     class 1       0.69      0.61      0.65       264

    accuracy                           0.68      1112
   macro avg       0.67      0.65      0.66      1112
weighted avg       0.67      0.68      0.67      1112



In [None]:
# Disabled experiment, kept as a no-op string literal (the trailing ';'
# suppresses its echo). The quoted code would print a classification report for
# `svm_model` on the validation embeddings.
'''svm_predictions = svm_model.predict(val_embeddings)
report = classification_report(val_labels, svm_predictions, target_names=['class -1', 'class 0', 'class 1'])
print(report)''';

In [4]:
# Fit a 7-nearest-neighbours classifier on the training embeddings.
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X=train_embeddings, y=train_labels)
# The string literal below is disabled validation-set scoring. As the cell's
# last expression it is only echoed, never executed.
'''knn_predictions = knn_model.predict(val_embeddings)
knn_accuracy = accuracy_score(val_labels, knn_predictions)
print("KNN Val Accuracy:", knn_accuracy)'''

'knn_predictions = knn_model.predict(val_embeddings)\nknn_accuracy = accuracy_score(val_labels, knn_predictions)\nprint("KNN Val Accuracy:", knn_accuracy)'

In [5]:
# Accuracy and per-class metrics for the KNN model on the held-out test split.
knn_predictions = knn_model.predict(test_embeddings)
knn_accuracy = accuracy_score(y_true=test_labels, y_pred=knn_predictions)
print("KNN Test Accuracy:", knn_accuracy)

report = classification_report(
    y_true=test_labels,
    y_pred=knn_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KNN Test Accuracy: 0.6025179856115108
              precision    recall  f1-score   support

    class -1       0.62      0.77      0.69       478
     class 0       0.56      0.38      0.46       359
     class 1       0.59      0.60      0.60       275

    accuracy                           0.60      1112
   macro avg       0.59      0.58      0.58      1112
weighted avg       0.60      0.60      0.59      1112



In [6]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training embeddings; the solver is allowed up
# to 1000 iterations.
log_reg_model = LogisticRegression(max_iter=1000).fit(X=train_embeddings, y=train_labels)
# The string literal below is disabled validation-set scoring. As the cell's
# last expression it is only echoed, never executed.
'''log_reg_predictions = log_reg_model.predict(val_embeddings)
log_reg_accuracy = accuracy_score(val_labels, log_reg_predictions)
print("Logistic Regression Val Accuracy:", log_reg_accuracy)'''

'log_reg_predictions = log_reg_model.predict(val_embeddings)\nlog_reg_accuracy = accuracy_score(val_labels, log_reg_predictions)\nprint("Logistic Regression Val Accuracy:", log_reg_accuracy)'

In [7]:
# Accuracy and per-class metrics for logistic regression on the held-out test split.
log_reg_predictions = log_reg_model.predict(test_embeddings)
log_reg_accuracy = accuracy_score(y_true=test_labels, y_pred=log_reg_predictions)
print("Logistic Regression Test Accuracy:", log_reg_accuracy)
report = classification_report(
    y_true=test_labels,
    y_pred=log_reg_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

Logistic Regression Test Accuracy: 0.6564748201438849
              precision    recall  f1-score   support

    class -1       0.67      0.79      0.72       478
     class 0       0.61      0.53      0.57       359
     class 1       0.68      0.60      0.63       275

    accuracy                           0.66      1112
   macro avg       0.65      0.64      0.64      1112
weighted avg       0.65      0.66      0.65      1112



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the training embeddings, with a fixed seed.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X=train_embeddings, y=train_labels)
# The string literal below is disabled validation-set scoring. As the cell's
# last expression it is only echoed, never executed.
'''rf_predictions = rf_model.predict(val_embeddings)
rf_accuracy = accuracy_score(val_labels, rf_predictions)
print("Random Forest Val Accuracy:", rf_accuracy)'''

'rf_predictions = rf_model.predict(val_embeddings)\nrf_accuracy = accuracy_score(val_labels, rf_predictions)\nprint("Random Forest Val Accuracy:", rf_accuracy)'

In [9]:
# Accuracy and per-class metrics for the random forest on the held-out test split.
rf_predictions = rf_model.predict(test_embeddings)
rf_accuracy = accuracy_score(y_true=test_labels, y_pred=rf_predictions)
print("Random Forest Test Accuracy:", rf_accuracy)
report = classification_report(
    y_true=test_labels,
    y_pred=rf_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

Random Forest Test Accuracy: 0.6079136690647482
              precision    recall  f1-score   support

    class -1       0.60      0.85      0.70       478
     class 0       0.58      0.39      0.47       359
     class 1       0.69      0.48      0.56       275

    accuracy                           0.61      1112
   macro avg       0.62      0.57      0.58      1112
weighted avg       0.61      0.61      0.59      1112



In [10]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training embeddings.
nb_model = GaussianNB().fit(X=train_embeddings, y=train_labels)
# The string literal below is disabled validation-set scoring. As the cell's
# last expression it is only echoed, never executed.
'''nb_predictions = nb_model.predict(val_embeddings)
nb_accuracy = accuracy_score(val_labels, nb_predictions)
print("Naive Bayes Val Accuracy:", nb_accuracy)'''

'nb_predictions = nb_model.predict(val_embeddings)\nnb_accuracy = accuracy_score(val_labels, nb_predictions)\nprint("Naive Bayes Val Accuracy:", nb_accuracy)'

In [11]:
# Accuracy and per-class metrics for naive Bayes on the held-out test split.
nb_predictions = nb_model.predict(test_embeddings)
nb_accuracy = accuracy_score(y_true=test_labels, y_pred=nb_predictions)
print("Naive Bayes Test Accuracy:", nb_accuracy)
report = classification_report(
    y_true=test_labels,
    y_pred=nb_predictions,
    target_names=['class -1', 'class 0', 'class 1'],
)
print(report)

Naive Bayes Test Accuracy: 0.5665467625899281
              precision    recall  f1-score   support

    class -1       0.64      0.62      0.63       478
     class 0       0.50      0.43      0.46       359
     class 1       0.54      0.65      0.59       275

    accuracy                           0.57      1112
   macro avg       0.56      0.57      0.56      1112
weighted avg       0.57      0.57      0.56      1112

