<a href="https://colab.research.google.com/github/AnberAziz/BS-DS-Project/blob/main/Week6Anber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

print("--- Please upload your 'kaggle.json' file ---")
# This will prompt you to upload the kaggle.json file
files.upload()

--- Please upload your 'kaggle.json' file ---


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"anberaziz","key":"51b0632a958ba40eb6e93d29e8ea615c"}'}

In [2]:
# -----------------------------------------------------------------------
# Part 1: Environment Setup & Data Collection (from Week 1)
# -----------------------------------------------------------------------
print("--- Part 1: Setting up Kaggle and Downloading Dataset ---")

# 1. Install Kaggle library
!pip install kaggle scikit-learn -q

# 2. Set up the Kaggle API token
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle API token installed.")

# 3. Download the dataset
!kaggle datasets download -d aljarah/xAPI-Edu-Data -q

# 4. Unzip the dataset (with bug fix from last week)
!unzip -oq xAPI-Edu-Data.zip
print("--- Setup Complete. Dataset 'xAPI-Edu-Data.csv' is ready. ---")

--- Part 1: Setting up Kaggle and Downloading Dataset ---
Kaggle API token installed.
Dataset URL: https://www.kaggle.com/datasets/aljarah/xAPI-Edu-Data
License(s): CC-BY-SA-4.0
--- Setup Complete. Dataset 'xAPI-Edu-Data.csv' is ready. ---


In [3]:
# -----------------------------------------------------------------------
# Part 2: Load Libraries and Clean Data (from Week 2)
# -----------------------------------------------------------------------
print("\n--- Part 2: Loading Libraries and Cleaning Data ---")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# -- NEW LIBRARIES FOR SKLEARN (Week 6) --
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("All libraries imported.")

# 3. Load and Clean Data
df = pd.read_csv('xAPI-Edu-Data.csv')
df_cleaned = df.copy()

# Rename columns
df_cleaned.rename(columns={
    'gender': 'gender', 'NationalITy': 'nationality', 'PlaceofBirth': 'place_of_birth',
    'StageID': 'stage_id', 'GradeID': 'grade_id', 'SectionID': 'section_id',
    'Topic': 'topic', 'Semester': 'semester', 'Relation': 'relation',
    'raisedhands': 'raised_hands', 'VisITedResources': 'visited_resources',
    'AnnouncementsView': 'announcements_view', 'Discussion': 'discussion',
    'ParentAnsweringSurvey': 'parent_answering_survey',
    'ParentschoolSatisfaction': 'parent_school_satisfaction',
    'StudentAbsenceDays': 'student_absence_days', 'Class': 'class_label'
}, inplace=True)

# Drop duplicates
df_cleaned = df_cleaned.drop_duplicates()
print(f"Removed {df_cleaned.duplicated().sum()} duplicate rows.")

# Encode binary features (from Week 2)
df_cleaned['student_absence_days'] = df_cleaned['student_absence_days'].map({'Under-7': 0, 'Above-7': 1})
df_cleaned['parent_answering_survey'] = df_cleaned['parent_answering_survey'].map({'Yes': 1, 'No': 0})
df_cleaned['parent_school_satisfaction'] = df_cleaned['parent_school_satisfaction'].map({'Good': 1, 'Bad': 0})
df_cleaned['gender'] = df_cleaned['gender'].map({'M': 1, 'F': 0})
print("Encoded binary features (absences, parent survey, gender).")


--- Part 2: Loading Libraries and Cleaning Data ---
All libraries imported.
Removed 0 duplicate rows.
Encoded binary features (absences, parent survey, gender).


In [4]:
# -----------------------------------------------------------------------
# Part 3: Week 6 - Feature Engineering for Classification
# -----------------------------------------------------------------------
print("\n--- Part 3: Week 6 Feature Engineering ---")

# We must convert ALL text columns to numbers for the model.
# We use One-Hot Encoding ('get_dummies') for this.
categorical_features = [
    'nationality', 'place_of_birth', 'stage_id', 'grade_id',
    'section_id', 'topic', 'semester', 'relation'
]

print(f"Original shape before One-Hot Encoding: {df_cleaned.shape}")
df_processed = pd.get_dummies(df_cleaned, columns=categorical_features, drop_first=True)
print(f"New shape after One-Hot Encoding: {df_processed.shape}")
print(f"Categorical features converted to {df_processed.shape[1] - df_cleaned.shape[1]} new numerical columns.")


--- Part 3: Week 6 Feature Engineering ---
Original shape before One-Hot Encoding: (478, 17)
New shape after One-Hot Encoding: (478, 61)
Categorical features converted to 44 new numerical columns.


In [5]:
# -----------------------------------------------------------------------
# Part 4: Week 6 - Train Baseline Classification Models
# -----------------------------------------------------------------------
print("\n--- Part 4: Week 6 Model Training & Comparison ---")

# 1. Define our features (X) and target (y)
# X is everything EXCEPT our target variable.
X = df_processed.drop('class_label', axis=1)
# y IS our target variable.
y = df_processed['class_label']

# 2. Split the data
# We use 'stratify=y' to ensure all classes (L, M, H) are
# represented proportionally in both train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

# 3. Initialize models (Class Task & Assignment 6)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# 4. Train and Evaluate
results = {}
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    # Train (fit) the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

# --- Assignment 6: Compare Accuracy ---
print("\n--- Assignment 6: Model Accuracy Comparison ---")
print("This fulfills the Project Milestone: Baseline classification model built.")

# Sort results by accuracy
sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)

for model_name, accuracy in sorted_results:
    print(f"  {model_name}: {accuracy * 100:.2f}% Accuracy")

best_model_name = sorted_results[0][0]
print(f"\n🏆 Best Baseline Model: {best_model_name}")
print("\n--- Week 6 Complete! ---")


--- Part 4: Week 6 Model Training & Comparison ---
Data split into 382 training samples and 96 testing samples.

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Training Decision Tree...

Training Random Forest...

--- Assignment 6: Model Accuracy Comparison ---
This fulfills the Project Milestone: Baseline classification model built.
  Random Forest: 80.21% Accuracy
  Logistic Regression: 73.96% Accuracy
  Decision Tree: 68.75% Accuracy

🏆 Best Baseline Model: Random Forest

--- Week 6 Complete! ---
