# Loading Libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok
import uvicorn
import nest_asyncio

# Loading Data

In [2]:
import pandas as pd


# OR if you know the full path
df = pd.read_csv('C:/Users/USER/OneDrive/Documents/nsbm/third year/final project/updated_df/cleaned sets/df1_cleaned.csv')

# View the first 5 rows
df.head()

Unnamed: 0,Hall_id,proj,computers,Date,Start_time,End_time,students
0,4,1.0,0.0,2023-08-02,13:00:00,15:00:00,550.0
1,4,1.0,0.0,2023-08-02,15:00:00,17:00:00,550.0
2,4,1.0,0.0,2023-08-03,09:00:00,12:00:00,550.0
3,4,1.0,0.0,2023-08-03,12:00:00,14:00:00,550.0
4,4,1.0,0.0,2023-08-03,14:00:00,17:00:00,550.0


In [3]:
# Assuming df is your dataframe
df['Hall_id'] = df['Hall_id'].astype(str)  # Make sure all are strings

# Apply LabelEncoder
le = LabelEncoder()
df['Hall_id_encoded'] = le.fit_transform(df['Hall_id'])

# Get Mapping
mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Display mapping
for original, encoded in mapping.items():
    print(f"Original: {original} --> Encoded: {encoded}")

Original: 105 --> Encoded: 0
Original: 106 --> Encoded: 1
Original: 2 --> Encoded: 2
Original: 3 --> Encoded: 3
Original: 4 --> Encoded: 4
Original: 7 --> Encoded: 5
Original: 8 --> Encoded: 6
Original: 9 --> Encoded: 7
Original: L101 --> Encoded: 8
Original: L102 --> Encoded: 9
Original: L103 --> Encoded: 10
Original: L104 --> Encoded: 11
Original: L105 --> Encoded: 12
Original: L106 --> Encoded: 13
Original: L107 --> Encoded: 14
Original: L110 --> Encoded: 15
Original: L204 --> Encoded: 16
Original: L205 --> Encoded: 17


In [4]:
# 4. Drop old Hall_id
df = df.drop('Hall_id', axis=1)

# 4. Rename Hall_id_encoded back to Hall_id
df = df.rename(columns={'Hall_id_encoded': 'Hall_id'})
df.head()

Unnamed: 0,proj,computers,Date,Start_time,End_time,students,Hall_id
0,1.0,0.0,2023-08-02,13:00:00,15:00:00,550.0,4
1,1.0,0.0,2023-08-02,15:00:00,17:00:00,550.0,4
2,1.0,0.0,2023-08-03,09:00:00,12:00:00,550.0,4
3,1.0,0.0,2023-08-03,12:00:00,14:00:00,550.0,4
4,1.0,0.0,2023-08-03,14:00:00,17:00:00,550.0,4


In [5]:
df = df[['Hall_id', 'proj', 'computers', 'Date', 'Start_time', 'End_time', 'students']]
df.head()

Unnamed: 0,Hall_id,proj,computers,Date,Start_time,End_time,students
0,4,1.0,0.0,2023-08-02,13:00:00,15:00:00,550.0
1,4,1.0,0.0,2023-08-02,15:00:00,17:00:00,550.0
2,4,1.0,0.0,2023-08-03,09:00:00,12:00:00,550.0
3,4,1.0,0.0,2023-08-03,12:00:00,14:00:00,550.0
4,4,1.0,0.0,2023-08-03,14:00:00,17:00:00,550.0


# Feature Engineering

In [6]:
#Transform Date and Time columns
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [7]:
# Extract useful features from Date
df['month'] = df['Date'].dt.month
df['day_of_week'] = df['Date'].dt.dayofweek
df['day_of_month'] = df['Date'].dt.day

In [8]:
# Month Cyclic Encoding
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

In [9]:
# Day of Month Cyclic Encoding
df['days_in_month'] = df['Date'].dt.days_in_month
df['day_sin'] = np.sin(2 * np.pi * df['day_of_month'] / df['days_in_month'])
df['day_cos'] = np.cos(2 * np.pi * df['day_of_month'] / df['days_in_month'])

In [10]:
# Convert Start_time and End_time to datetime.time
df['Start_time'] = pd.to_datetime(df['Start_time'], format='%H:%M:%S', errors='coerce')
df['End_time'] = pd.to_datetime(df['End_time'], format='%H:%M:%S', errors='coerce')

In [11]:
# Extract hour from Start_time and End_time
df['start_hour'] = df['Start_time'].dt.hour
df['end_hour'] = df['End_time'].dt.hour

# Session Duration (in minutes)
df['duration_minutes'] = (df['End_time'] - df['Start_time']).dt.total_seconds() / 60

In [12]:
# Hour Cyclic Encoding
df['hour_sin'] = np.sin(2 * np.pi * df['start_hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['start_hour'] / 24)

In [13]:
#Drop unwanted columns
drop_cols = ['capacity', 'Date', 'Start_time', 'End_time','days_in_month']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

In [14]:
df['is_morning'] = (df['start_hour'] < 12).astype(int)
df['is_afternoon'] = (df['start_hour'] >= 12).astype(int)

In [15]:
# Categorize
df['session_length'] = pd.cut(
    df['duration_minutes'],
    bins=[0, 60, 120, 240],
    labels=['Short', 'Medium', 'Long']
)

In [16]:
# One-hot encode session length
session_dummies = pd.get_dummies(df['session_length'], prefix='session')

# Concatenate with original dataframe
df = pd.concat([df, session_dummies], axis=1)

# Drop original session_length
df = df.drop('session_length', axis=1)

In [17]:
# Convert "True"/"False" strings to 1/0
boolean_columns = ['session_Short', 'session_Medium', 'session_Long']

for col in boolean_columns:
    df[col] = df[col].astype(int)

In [18]:
df.head()

Unnamed: 0,Hall_id,proj,computers,students,month,day_of_week,day_of_month,month_sin,month_cos,day_sin,...,start_hour,end_hour,duration_minutes,hour_sin,hour_cos,is_morning,is_afternoon,session_Short,session_Medium,session_Long
0,4,1.0,0.0,550.0,8,2,2,-0.866025,-0.5,0.394356,...,13,15,120.0,-0.258819,-0.965926,0,1,0,1,0
1,4,1.0,0.0,550.0,8,2,2,-0.866025,-0.5,0.394356,...,15,17,120.0,-0.7071068,-0.707107,0,1,0,1,0
2,4,1.0,0.0,550.0,8,3,3,-0.866025,-0.5,0.571268,...,9,12,180.0,0.7071068,-0.707107,1,0,0,0,1
3,4,1.0,0.0,550.0,8,3,3,-0.866025,-0.5,0.571268,...,12,14,120.0,1.224647e-16,-1.0,0,1,0,1,0
4,4,1.0,0.0,550.0,8,3,3,-0.866025,-0.5,0.571268,...,14,17,180.0,-0.5,-0.866025,0,1,0,0,1


In [19]:
# Define X and y again
X = df.drop('Hall_id', axis=1)
y = df['Hall_id']

In [20]:
# Then continue
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
classes = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, class_weights))

# Map weights to each sample
sample_weights_train = y_train.map(class_weight_dict)

In [22]:
# 2. Safe XGBClassifier class (fixes 'feature_weights' error)
class SafeXGBClassifier(XGBClassifier):
    def get_params(self, deep=True):
        params = super().get_params(deep)
        params.pop("feature_weights", None)  # Remove problematic attribute
        return params

In [23]:
xgb_model = SafeXGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss"
)

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 300, 500, 700],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 2, 3],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2, 3]
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [24]:
random_search.fit(X_train, y_train, sample_weight=sample_weights_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [25]:
best_model = random_search.best_estimator_

In [26]:
best_model.fit(X_train, y_train, sample_weight=sample_weights_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [27]:
# Predict
y_val_pred = best_model.predict(X_val)

# Accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Detailed Report
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 79.23%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        66
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00       174
           3       1.00      1.00      1.00        81
           4       1.00      1.00      1.00        98
           5       0.50      0.91      0.65        33
           6       1.00      0.63      0.78        52
           7       1.00      1.00      1.00       181
           8       0.64      0.58      0.61       145
           9       0.58      0.63      0.60       131
          10       1.00      1.00      1.00        40
          11       1.00      1.00      1.00        13
          12       1.00      1.00      1.00         1
          13       1.00      0.07      0.13        42
          14       1.00      0.44      0.61        79
          15       1.00      1.00      1.00         3
          16       0.16      0.10      0.12        39

In [28]:
from sklearn.pipeline import Pipeline, FunctionTransformer

# Define a custom function to do all your feature engineering
def feature_engineering_fn(df):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['month'] = df['Date'].dt.month
    df['day_of_week'] = df['Date'].dt.dayofweek
    df['day_of_month'] = df['Date'].dt.day
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['days_in_month'] = df['Date'].dt.days_in_month
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_month'] / df['days_in_month'])
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_month'] / df['days_in_month'])

    df['Start_time'] = pd.to_datetime(df['Start_time'], format='%H:%M:%S', errors='coerce')
    df['End_time'] = pd.to_datetime(df['End_time'], format='%H:%M:%S', errors='coerce')
    df['start_hour'] = df['Start_time'].dt.hour
    df['end_hour'] = df['End_time'].dt.hour
    df['duration_minutes'] = (df['End_time'] - df['Start_time']).dt.total_seconds() / 60
    df['hour_sin'] = np.sin(2 * np.pi * df['start_hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['start_hour'] / 24)

    df['is_morning'] = (df['start_hour'] < 12).astype(int)
    df['is_afternoon'] = (df['start_hour'] >= 12).astype(int)

    df['session_length'] = pd.cut(df['duration_minutes'], bins=[0, 60, 120, 240], labels=['Short', 'Medium', 'Long'])
    session_dummies = pd.get_dummies(df['session_length'], prefix='session')
    df = pd.concat([df, session_dummies], axis=1)
    for col in ['session_Short', 'session_Medium', 'session_Long']:
        if col not in df.columns:
            df[col] = 0

    df = df.drop(columns=['Date', 'Start_time', 'End_time', 'days_in_month', 'session_length'])
    return df

feature_engineering = FunctionTransformer(feature_engineering_fn)

In [29]:
class FullModelWrapper:
    def __init__(self, pipeline, label_encoder):
        self.pipeline = pipeline
        self.label_encoder = label_encoder

    def predict(self, X):
        # Predict encoded labels
        encoded_preds = self.pipeline.predict(X)
        # Decode them
        decoded_preds = self.label_encoder.inverse_transform(encoded_preds)
        return decoded_preds

    def predict_proba(self, X):
        # Get access to the inner model inside the pipeline
        probs = self.pipeline.named_steps['model'].predict_proba(
            self.pipeline.named_steps['feature_engineering'].transform(X)
        )
        return probs

    @property
    def classes_(self):
        return self.label_encoder.classes_

In [30]:
# Create pipeline with feature engineering and model
pipeline = Pipeline([
    ('feature_engineering', feature_engineering),
    ('model', best_model)
])

# Wrap pipeline + label encoder into one object
full_model = FullModelWrapper(pipeline, le)

# Save everything into one file
import joblib
joblib.dump(full_model, 'hall_id_full_model.pkl')

['hall_id_full_model.pkl']

In [31]:
import json
import joblib
import pandas as pd

# Load your saved model with feature engineering and label decoding
model = joblib.load('hall_id_full_model.pkl')

def lambda_handler(event, context):
    try:
        # Parse the incoming request body (JSON format)
        body = json.loads(event['body'])

        # Convert JSON input to DataFrame
        input_df = pd.DataFrame([body])

        # Predict using the full pipeline
        prediction = model.predict(input_df)[0]

        return {
            'statusCode': 200,
            'headers': {"Access-Control-Allow-Origin": "*"},  # CORS
            'body': json.dumps({'prediction': prediction})
        }

    except Exception as e:
        return {
            'statusCode': 500,
            'headers': {"Access-Control-Allow-Origin": "*"},
            'body': json.dumps({'error': str(e)})
        }

In [37]:
import joblib
joblib.dump(full_model, 'hall_id_full_model.pkl')

['hall_id_full_model.pkl']

In [38]:
import joblib
import os
from IPython.display import FileLink

# Save the model (if not saved yet)
joblib.dump(full_model, 'hall_id_full_model.pkl')

# Create a clickable download link
FileLink('hall_id_full_model.pkl')