# ML Model

## 1. Prepare Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the processed data
file_path = '../data/processed/processedata.csv'
df = pd.read_csv(file_path)

# Define features and target
features = ['Weekday', 'Is Weekend', 'Time of Day', 'During Exam Period', 'During Academic Period']
target = 'Activity Type'

# Separate features (X) and target (y).
# Take an explicit copy of the feature columns: X is modified below, and
# writing into a slice of df raises SettingWithCopyWarning and makes it
# ambiguous whether df itself is being changed.
X = df[features].copy()
y = df[target]

# Encode the categorical target as integer codes.
# LabelEncoder numbers the labels in sorted order (so, for example,
# 'Academic' would become 0 and 'Non-Academic' would become 1); the actual
# mapping is available afterwards as le_target.classes_.
le_target = LabelEncoder()
y = le_target.fit_transform(y)

# Encode the 'Time of Day' feature.
# The encoder is kept so the integer codes can be mapped back to the original
# labels later. Plain column assignment replaces the column, so it takes the
# encoder's integer dtype (an in-place `.loc[:, col] = ...` write can leave
# the column as object dtype on recent pandas versions).
le_time_of_day = LabelEncoder()
X['Time of Day'] = le_time_of_day.fit_transform(X['Time of Day'])

# Split the data into training (80%) and testing (20%) sets; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output dataset sizes
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


## 2. Train the Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# joblib is used at the end of this cell to persist the fitted model
import joblib

# Oversample the minority class with SMOTE. Only the training split is
# resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Training set size after SMOTE: {X_resampled.shape}")

# Build the Random Forest and fit it on the resampled training data in one
# step (fit() returns the fitted estimator itself).
rf = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)

# Persist the trained model to disk so later steps can reload it if necessary
joblib.dump(rf, "random_forest_model_with_smote.pkl")


## 3. Evaluate the Model

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions on the test set.
# The trained classifier is bound to `rf` in the training cell; no name
# `rf_model` is ever defined, so using it here raised a NameError.
y_pred = rf.predict(X_test)

# Integer codes for every class known to the target encoder, in the same
# order as le_target.classes_. Passing them explicitly keeps the report rows
# and the confusion-matrix axes aligned with the class names even if some
# class does not occur in the test split.
class_ids = list(range(len(le_target.classes_)))

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report (per-class precision / recall / F1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le_target.classes_))

# Confusion matrix: rows are actual classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
print("\nConfusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix as an annotated heatmap with class names on both axes
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=le_target.classes_, yticklabels=le_target.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
