### **Imports**

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

### **Load Data**

In [2]:
# Paths to the preprocessed splits, relative to the notebook's location.
train_path = '../data/processed/train_data.csv'
test_path = '../data/processed/test_data.csv'

# Load the datasets. If either file is missing, print a hint and then re-raise:
# without the re-raise the cell would carry on and either crash below with a
# misleading NameError on `train_df`, or silently reuse a stale frame left in
# the kernel by an earlier run.
try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError:
    print("Error: Data files not found.")
    print(f"Make sure '{train_path}' and '{test_path}' exist.")
    print("Have you run 'python src/preprocessing.py' first?")
    raise
else:
    print("Data loaded successfully.")

# Split each frame into text features and labels. Missing text values are
# replaced with '' (presumably sentences that became empty in preprocessing)
# so the downstream vectorizer only ever receives strings.
X_train = train_df['processed_sentence'].fillna('')
y_train = train_df['sentiment']

X_test = test_df['processed_sentence'].fillna('')
y_test = test_df['sentiment']

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Data loaded successfully.
Training samples: 2180
Test samples: 546


### **Feature Engineering (TF-IDF)**

In [3]:
# Build the TF-IDF vocabulary and IDF weights from the training text only,
# so nothing about the test set leaks into the features.
# `fit` returns the fitted vectorizer itself, so it can be chained here.
print("Fitting TF-IDF vectorizer on training data...")
vectorizer = TfidfVectorizer().fit(X_train)

# Project both splits into the vocabulary learned above.
print("Transforming train and test data...")
X_train_tfidf, X_test_tfidf = (
    vectorizer.transform(text_split) for text_split in (X_train, X_test)
)

train_matrix_shape = X_train_tfidf.shape
print(f"TF-IDF feature matrix shape (Train): {train_matrix_shape}")

Fitting TF-IDF vectorizer on training data...
Transforming train and test data...
TF-IDF feature matrix shape (Train): (2180, 3571)


### **Train Baseline Model**

In [4]:
print("Training DummyClassifier...")

# Baseline: the 'stratified' strategy draws random predictions that follow the
# class distribution seen in the training labels; the fixed random_state makes
# those draws repeatable. `fit` returns the estimator, so construction and
# training are chained into one statement.
dummy_model = DummyClassifier(
    strategy='stratified',
    random_state=42,
).fit(X_train_tfidf, y_train)

print("Model training complete.")

Training DummyClassifier...
Model training complete.


### **Evaluate Baseline Model**

In [5]:
print("Evaluating model...")

# Predict labels for the held-out test features.
y_pred = dummy_model.predict(X_test_tfidf)

# Score the predictions with each metric in turn; every scorer is called with
# its default arguments as (y_true, y_pred).
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n--- Baseline Model Evaluation ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Per-class breakdown with human-readable class names.
# NOTE(review): the target_names order assumes the labels sort as 0 then 1 — confirm.
class_names = ['Negative (0)', 'Positive (1)']
report_text = classification_report(y_test, y_pred, target_names=class_names)
print("\n--- Classification Report ---")
print(report_text)

Evaluating model...

--- Baseline Model Evaluation ---
Accuracy:  0.5220
Precision: 0.5267
Recall:    0.5018
F1-Score:  0.5140

--- Classification Report ---
              precision    recall  f1-score   support

Negative (0)       0.52      0.54      0.53       271
Positive (1)       0.53      0.50      0.51       275

    accuracy                           0.52       546
   macro avg       0.52      0.52      0.52       546
weighted avg       0.52      0.52      0.52       546

