### Purpose: Set up environment, paths, and project overview

PROJECT OVERVIEW : Predicting Student Academic Performance using ML.
Goal: Build and evaluate models to predict student grades/performance.
Dataset: Student demographics, study habits, attendance, and grades. Everything will be implemented using object-oriented programming (OOP).

##### 1. Import libraries

In [13]:
!pip install shap

Collecting numba>=0.54 (from shap)
  Downloading numba-0.62.1-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.45.1-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading numba-0.62.1-cp312-cp312-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB 325.1 kB/s eta 0:00:09
    --------------------------------------- 0.0/2.7 MB 393.8 kB/s eta 0:00:07
   --- ------------------------------------ 0.2/2.7 MB 1.4 MB/s eta 0:00:02
   ------------ --------------------------- 0.8/2.7 MB 4.1 MB/s eta 0:00:01
   -------------------- ------------------- 1.4/2.7 MB 5.7 MB/s eta 0:00:01
   -------------------------- ------------- 1.8/2.7 MB 6.2 MB/s eta 0:00:01
   ----------------------------------- ---- 2.5/2.7 MB 7.1 MB/s eta 0:00:01
   --------------------

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from ucimlrepo import fetch_ucirepo 
from IPython.display import Markdown , HTML
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,IsolationForest
from sklearn.svm import SVC
from joblib import dump, load
import shap
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve, auc
import warnings
warnings.filterwarnings('ignore')   

ModuleNotFoundError: No module named 'numba'

##### 2. Set random seed

In [6]:
# Single seed value for the notebook; seeding NumPy's global RNG here makes
# NumPy-based randomness repeatable from run to run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

##### 3. Define file paths

In [7]:
# Input dataset location (relative path, one directory above the working directory).
DATA_PATH = "../data/data.csv"
# NOTE(review): this says "output/" while SAVE_PATH below uses "outputs" and
# draw_histograms writes under "../outputs/figures" — confirm which directory is intended.
OUTPUT_PATH = "output/"
# Target file for the correlation heatmap figure.
SAVE_PATH = os.path.join("outputs", "figures", "correlation_heatmap.png")

##### 4. Utility functions

In [8]:
def draw_histograms(df, save_path=None, n_cols=4):
    """
    Draw one histogram (with a KDE overlay) per column of ``df`` on a grid,
    save the figure to disk and display it.

    Parameters:
        df (pd.DataFrame): Data to plot; every column gets its own subplot.
        save_path (str, optional): Destination image file. Defaults to
            ``../outputs/figures/histograms.png``.
        n_cols (int, optional): Number of plots per row. Defaults to 4.

    Returns:
        None. Nothing is drawn or saved when ``df`` has no columns.
    """
    if save_path is None:
        save_path = os.path.join("..", "outputs", "figures", "histograms.png")

    cols = df.columns
    if len(cols) == 0:
        # A grid with zero rows cannot be created, so there is nothing to do.
        return

    n_rows = math.ceil(len(cols) / n_cols)

    # squeeze=False always returns a 2-D array of axes, so flatten() is valid
    # for every grid shape (including a 1x1 grid).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, cols):
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f'Histogram: {col}')

    # Remove empty axes if columns are fewer than grid cells
    for ax in axes[len(cols):]:
        fig.delaxes(ax)

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()


def boxplot(data):
    """
        Draw a box plot of ``data`` by calling its ``boxplot`` method
        with a 12 x 5 figure size. Returns nothing.
    """
    figure_size = (12, 5)
    data.boxplot(figsize=figure_size)


def correcting_outliers(data, colname):
    """
    Cap the outliers of one column at its IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and values
    below ``Q1 - 1.5 * IQR`` by that lower fence. Missing values are left as is.

    Parameters:
        data (pd.DataFrame): Frame holding the column. It is modified in place.
        colname: Label of the column to correct.

    Returns:
        pd.DataFrame: The same ``data`` object, with ``colname`` capped.
    """
    q1 = data[colname].quantile(0.25)
    q3 = data[colname].quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + (1.5 * iqr)
    lower_limit = q1 - (1.5 * iqr)

    # clip() builds a new Series and upcasts when needed (e.g. an integer column
    # with a fractional fence), unlike a masked .loc assignment, which would try
    # to write a float into an integer-typed Series.
    data[colname] = data[colname].clip(lower=lower_limit, upper=upper_limit)

    return data


def find_outliers_iqr(series):
    """
        Flag the values of ``series`` that fall outside the 1.5 * IQR fences.

        Returns a boolean mask that is True where a value is below
        Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    first_quartile, third_quartile = (series.quantile(p) for p in (0.25, 0.75))
    spread = third_quartile - first_quartile
    too_low = series < first_quartile - 1.5 * spread
    too_high = series > third_quartile + 1.5 * spread
    return too_low | too_high

def plot_histograms_and_box_plots(numeri_col):
    """
    Plot a histogram (top row) and a box plot (bottom row) for every column.

    Parameters:
        numeri_col (pd.DataFrame): Frame whose columns are plotted, one
            subplot column per DataFrame column.

    Returns:
        None. The figure is displayed with ``plt.show()``; nothing is drawn
        when the frame has no columns.
    """
    columns = numeri_col.columns
    num_cols = len(columns)
    if num_cols == 0:
        # A grid with zero columns cannot be created.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single column,
    # so the axes[row, col] indexing below always works.
    fig, axes = plt.subplots(nrows=2, ncols=num_cols, figsize=(20, 8), squeeze=False)
    fig.suptitle('Histograms and Box Plots')

    for i, col in enumerate(columns):
        # Histogram
        sns.histplot(numeri_col[col], bins=20, kde=True, ax=axes[0, i])
        axes[0, i].set_title(f'{col} - Histogram')

        # Box plot
        sns.boxplot(x=numeri_col[col], ax=axes[1, i])
        axes[1, i].set_title(f'{col} - Box Plot')

    # Leave room at the top of the figure for the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()
                
def cap_outliers_iqr(data):
    """
    Return a copy of ``data`` with outliers capped at the 1.5 * IQR fences.

    For a DataFrame the fences are computed per column and each column is
    clipped to its own fences; for a Series a single pair of fences is used.
    The input object is not modified.

    Parameters:
        data (pd.DataFrame or pd.Series): Values to cap.

    Returns:
        pd.DataFrame or pd.Series: Capped data of the same kind as the input.

    Raises:
        ValueError: If ``data`` is neither a DataFrame nor a Series.
    """
    if not isinstance(data, (pd.DataFrame, pd.Series)):
        raise ValueError("Input must be a DataFrame or a Series.")

    # quantile() yields scalars for a Series and one value per column for a DataFrame.
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    if isinstance(data, pd.DataFrame):
        # axis=1 aligns the per-column bounds with the column labels; clipping each
        # column separately against these Series would align them with the row index.
        return data.clip(lower=lower_bound, upper=upper_bound, axis=1)

    return data.clip(lower=lower_bound, upper=upper_bound)

def check_inconsistent_values(df, expected_types=None, expected_ranges=None, expected_categories=None):
    """
    Checks for inconsistent values in a DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame to check.
        expected_types (dict, optional): Dictionary mapping column names to expected data types
            (e.g., {'Age': int}). A tuple of types is also accepted (e.g., {'Score': (int, float)}).
        expected_ranges (dict, optional): Dictionary mapping column names to valid inclusive
            ranges (e.g., {'Age': (0, 100)}).
        expected_categories (dict, optional): Dictionary mapping column names to valid categories
            (e.g., {'Gender': [0, 1]}).

    Returns:
        inconsistent_report (dict): A dictionary with column names as keys and a list of messages
            describing the inconsistent values as values. Columns without problems are omitted;
            keys of the expectation dictionaries that are not columns of ``df`` are ignored.
    """
    inconsistent_report = {}

    for col in df.columns:
        column = df[col]
        inconsistencies = []

        # Check type consistency
        if expected_types and col in expected_types:
            expected_type = expected_types[col]
            matches = column.apply(lambda value: isinstance(value, expected_type)).astype(bool)
            n_wrong_type = int((~matches).sum())
            if n_wrong_type:
                # isinstance() accepts a tuple of types, which has no __name__ of its own.
                if isinstance(expected_type, tuple):
                    type_name = " or ".join(t.__name__ for t in expected_type)
                else:
                    type_name = expected_type.__name__
                inconsistencies.append(f"Type mismatch: {n_wrong_type} values not of type {type_name}")

        # Check range consistency (missing values compare False and are not counted)
        if expected_ranges and col in expected_ranges:
            min_val, max_val = expected_ranges[col]
            n_out_of_range = int(((column < min_val) | (column > max_val)).sum())
            if n_out_of_range:
                inconsistencies.append(f"Out of range: {n_out_of_range} values not in [{min_val}, {max_val}]")

        # Check category consistency
        if expected_categories and col in expected_categories:
            valid_cats = expected_categories[col]
            n_invalid = int((~column.isin(valid_cats)).sum())
            if n_invalid:
                inconsistencies.append(f"Invalid categories: {n_invalid} values not in {valid_cats}")

        if inconsistencies:
            inconsistent_report[col] = inconsistencies

    return inconsistent_report