# DAISEE-Analysis: Exploratory Data Analysis, Model Training, and Evaluation

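This notebook analyzes the preprocessed DAISEE dataset and trains a baseline classifier on it. It covers the following steps:
1. Loading the preprocessed train, validation, and test splits.
2. Exploratory Data Analysis (EDA) and visualization of the label distribution and feature correlations.
3. Training a logistic regression model and evaluating it on the validation and test sets.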

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## 1. Load Preprocessed Data

In [None]:
# Read the three preprocessed splits in one pass; the tuple order below
# matches the unpacking order (train, validation, test).
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        'data/train_processed.csv',
        'data/val_processed.csv',
        'data/test_processed.csv',
    ),
)

In [None]:
# For each split, the 'label' column is the target and every remaining
# column is used as a feature.
X_train, y_train = train_df.drop(columns='label'), train_df['label']
X_val, y_val = val_df.drop(columns='label'), val_df['label']
X_test, y_test = test_df.drop(columns='label'), test_df['label']

## 2. EDA and Visualization

In [None]:
# Bar chart of how many training rows carry each value of 'label'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, x='label', ax=ax)
ax.set_title('Distribution of Engagement Levels in Training Data')
plt.show()

In [None]:
# Pairwise correlation between the training feature columns
# (pandas' default correlation method).
corr_matrix = X_train.corr()

# Large canvas so the per-cell annotations (two decimals) have room.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

## 3. Model Training and Evaluation

In [None]:
# Logistic regression with a fixed seed and an iteration cap of 1000.
model = LogisticRegression(max_iter=1000, random_state=42)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the validation split with the fitted model.
y_pred_val = model.predict(X_val)

# The heading ends with an explicit "\n" escape so a blank line follows it.
# A raw line break inside a single-quoted literal is a SyntaxError, so the
# escape must stay on the same physical line as the opening quote.
print('Validation Set Performance:\n')
print(f'Accuracy: {accuracy_score(y_val, y_pred_val):.2f}')
print(classification_report(y_val, y_pred_val))

# Confusion matrix drawn as an integer-annotated heatmap; the y-axis is
# labelled 'Actual' and the x-axis 'Predicted'.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_val, y_pred_val), annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Validation Set')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Predict on the test split with the fitted model.
y_pred_test = model.predict(X_test)

# The heading ends with an explicit "\n" escape so a blank line follows it.
# A raw line break inside a single-quoted literal is a SyntaxError, so the
# escape must stay on the same physical line as the opening quote.
print('Test Set Performance:\n')
print(f'Accuracy: {accuracy_score(y_test, y_pred_test):.2f}')
print(classification_report(y_test, y_pred_test))

# Confusion matrix drawn as an integer-annotated heatmap; the y-axis is
# labelled 'Actual' and the x-axis 'Predicted'.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred_test), annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Test Set')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()