In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, hinge_loss
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')

absolute_path = "/content/gdrive/My Drive/Projects/Financial-Sentiment/"
dataset_path = absolute_path + "Datasets/"

# Load datasets
train_df = pd.read_csv(dataset_path + "train_set.csv")
val_df = pd.read_csv(dataset_path + "validation_set.csv")
test_df = pd.read_csv(dataset_path + "test_set.csv")

# Extract features and labels
X_train, y_train = train_df["Sentence"], train_df["SentimentNumerical"]
X_val, y_val = val_df["Sentence"], val_df["SentimentNumerical"]
X_test, y_test = test_df["Sentence"], test_df["SentimentNumerical"]

# Define SVM pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("svm", SVC(kernel="linear", probability=True, decision_function_shape="ovr"))
])

# Define hyperparameters for tuning
param_grid = {
    "tfidf__max_features": [5000, 10000],  # Feature selection
    "tfidf__ngram_range": [(1, 1), (1, 2)],  # Unigrams and bigrams
    "tfidf__stop_words": [None, "english"],  # Stopword removal
    "svm__C": [0.1, 1, 10],  # Regularization strength
}

# Track total training time
start_training_time = time.time()

# Perform Grid Search with Cross Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# End training time tracking
end_training_time = time.time()
total_training_time = end_training_time - start_training_time

# Print best parameters
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Total Training Time: {total_training_time:.2f} seconds")

# Train final model using best parameters
best_model = grid_search.best_estimator_

# Compute Training and Validation Loss (Hinge Loss)
train_decision_scores = best_model.decision_function(X_train)  # Shape: (num_samples, num_classes)
val_decision_scores = best_model.decision_function(X_val)  # Shape: (num_samples, num_classes)

# Convert labels to -1 and 1 (required for hinge loss)
y_train_binary = np.where(y_train == 0, -1, 1)
y_val_binary = np.where(y_val == 0, -1, 1)

# Select the decision function corresponding to the correct class
train_decision_values = np.choose(y_train, train_decision_scores.T)
val_decision_values = np.choose(y_val, val_decision_scores.T)

# Compute Hinge Loss
train_loss = hinge_loss(y_train_binary, train_decision_values)
val_loss = hinge_loss(y_val_binary, val_decision_values)

print(f"Training Loss (Hinge Loss): {train_loss:.4f}")
print(f"Validation Loss (Hinge Loss): {val_loss:.4f}")

# Validate the model
y_val_pred = best_model.predict(X_val)
print("Validation Performance:\n", classification_report(y_val, y_val_pred))

# Measure prediction time for each sample
svm_predictions = []
prediction_times = []

for sentence in X_test:
    start_time = time.time()
    prediction = best_model.predict([sentence])[0]  # Predict sentiment
    end_time = time.time()

    elapsed_time = end_time - start_time  # Calculate time taken

    svm_predictions.append(prediction)
    prediction_times.append(elapsed_time)

# Store predictions and time taken in the test DataFrame
test_df["svm_predictions"] = svm_predictions
test_df["time_svm"] = prediction_times

# Save the updated test set with predictions and time
test_df.to_csv(dataset_path + "test_set_with_predictions.csv", index=False)

# Display the first few rows of the updated test set
print(test_df.head())

Mounted at /content/gdrive
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Hyperparameters: {'svm__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
Total Training Time: 89.62 seconds
Training Loss (Hinge Loss): 1.0607
Validation Loss (Hinge Loss): 0.9988
Validation Performance:
               precision    recall  f1-score   support

           0       0.73      0.67      0.70       172
           1       0.66      0.69      0.67       172
           2       0.66      0.69      0.68       172

    accuracy                           0.68       516
   macro avg       0.69      0.68      0.68       516
weighted avg       0.69      0.68      0.68       516

     id                                           Sentence Sentiment  \
0   815  mr mika korpinen apac vice president for compt...   neutral   
1  1876  so far mr galvan he has been able to avoid lay...  positive   
2  1654  raute reported a loss per share of eur0 86 for...  