# Import the necessary libraries

In [None]:
# import necessary libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from IPython.display import display, Markdown, Latex
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

We have just imported python libraries that will be necessary for this task

## Loading the dataset

In [None]:
# Load the student records from the CSV file expected in the working directory
df = pd.read_csv(Path('Student_Data.csv'))

# Show the loaded frame as the cell output
df

Here, we load our dataset and view the data frame.

## Data Preprocessing

In [None]:
# LabelEncoder is reached through sklearn's preprocessing module
from sklearn import preprocessing

# Learn the distinct labels of the 'Target' column, then replace each label
# with its integer code. The fitted encoder stays available as `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Target'])
df['Target'] = label_encoder.transform(df['Target'])

df

We applied LabelEncoder which is a practical tool for preprocessing categorical data for machine learning models, ensuring the data is in a suitable numeric format for processing.








## Checking the details of the Dataset

In [None]:
# dtype names treated as numerical
numerics_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# dtype names treated as categorical
categorical_dtypes = ['object', 'category', 'bool']

# Walk the columns once and bucket each one by the name of its dtype
has_numerical, has_categorical = [], []
for column_name in df.columns:
    dtype_name = df[column_name].dtype.name
    if dtype_name in numerics_dtypes:
        has_numerical.append(column_name)
    if dtype_name in categorical_dtypes:
        has_categorical.append(column_name)

# Report both groups
print('Columns with numerical values:', ', '.join(has_numerical))
print('\nColumns with categorical values:', ', '.join(has_categorical))

# Exploratory Data Analysis (EDA)

In [None]:
# Render the dataset dimensions as a markdown sentence
row_count, column_count = df.shape
display(Markdown(f'This dataset contains {row_count} rows and {column_count} columns. Out of which, 1 is the target variable and the remaining are the independent variables.'))

In [None]:
# checking the columns of the dataset
df.columns.tolist()

Here, we try to have a proper view of the columns in our dataset

In [None]:
# to check the data type of the columns
df.dtypes

In [None]:
# Distinct dtypes found across the columns, computed once and reused below
distinct_dtypes = df.dtypes.unique().tolist()
data_types = ', '.join(map(str, distinct_dtypes))
display(Markdown(f'There are {len(distinct_dtypes)} different types of data ({data_types}) present in the dataset.'))

## Checking Null or Missing Values in the Dataset

In [None]:
# to check for null values
df.isnull().sum()

The above result indicates that there are no missing values in our dataset. Let us visualize this through the heatmap below.

### Visualizing the null/missing values using heatmap

In [None]:
# to visualize the null values using heatmap
sns.heatmap(df.isnull(), cmap = "cool_r")

In [None]:
# Total number of missing cells over the whole frame
total_missing = int(df.isnull().sum().sum())

# Pick the sentence that matches the count and render it as markdown
if total_missing > 0:
    null_text = f'And we can clearly visualize that there are {total_missing} missing data present.'
else:
    null_text = 'And we can clearly visualize that there is no missing data present.'
display(Markdown(null_text))

In [None]:
df.info()

The above function (df.info()), gives a brief information about the dataset which includes indexing type, column type, null values and memory usage.



## Let's see how many unique values are present in each column of the dataset

In [None]:
# One row per column: its name, how many distinct values it holds, and its dtype name
unique_values = [
    [column, df[column].nunique(), df[column].dtype.name]
    for column in df.columns.tolist()
]

# `columns` must be a flat list of labels. The previous nested list ([[...]]) made
# pandas build a MultiIndex header instead of three ordinary column names.
unique_df = pd.DataFrame(unique_values, columns=['Column Name', 'Number of Unique Value', 'Column Data Type'])
unique_df

The above table shows the number of unique values present in each column in our dataset

## Let's check the Target Variable

In [None]:
# checking the list of the target variable
target_values = df["Target"].unique().tolist()
target_values

There are three (3) Categories in our Target variable column. These are Dropout = 0, Enrolled = 1, and Graduate = 2.

In [None]:
display(Markdown(f"These are {len(target_values)} categories present in the target column namely {' and '.join([str(e) for e in target_values])}."))

In [None]:
df["Target"].value_counts()

This is our target variable count; the totals above show that the classes are imbalanced.

## Dataset Description

In [None]:
# Module-level accumulators filled by find_outliers() and summary_table()
skewed_right = []
skewed_left = []
found_outliers = []
outliers_columns = []


def find_outliers(column, data=None):
    """Record `column` if it has values outside the 1.5 * IQR fences.

    Parameters
    ----------
    column : label
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame that holds `column`. Defaults to the notebook-level `df`.

    Returns
    -------
    None
        When at least one outlier is found, the outlier values are appended to
        the module-level `found_outliers` list and the column name to
        `outliers_columns`.
    """
    global found_outliers, outliers_columns

    frame = df if data is None else data
    values = frame[column]

    # quartiles and inter-quartile range
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # upper and lower whiskers
    upper_bound = q3 + (1.5 * iqr)
    lower_bound = q1 - (1.5 * iqr)

    # Strict comparisons: a value sitting exactly on a fence is not an outlier.
    # With `<=` / `>=`, a column whose IQR is 0 (both fences equal the quartile)
    # had every row equal to the quartile flagged as an outlier.
    outliers = values[(values < lower_bound) | (values > upper_bound)]

    if len(outliers) > 0:
        # remember the outlier values and the column they came from
        found_outliers.append(outliers.tolist())
        outliers_columns.append(column)

    return
            
def summary_table():
    """Build the markdown text that accompanies the `df.describe()` output.

    For every numeric column of the notebook-level `df`, the column is filed
    under `skewed_right` (mean > median) or `skewed_left` (mean < median), and
    `find_outliers` is called to record columns with outliers.

    Returns
    -------
    str
        Newline-separated summary with five numbered points.
    """
    global skewed_right, skewed_left, found_outliers, outliers_columns

    # Start from fresh lists so that re-running the cell does not list the same
    # column several times.
    skewed_right, skewed_left = [], []
    found_outliers, outliers_columns = [], []

    for column in df.columns:
        # mean/median are not defined for text columns; skip them instead of raising
        if not pd.api.types.is_numeric_dtype(df[column]):
            continue

        # calculate the mean value
        mean_value = df[column].mean(axis=0)

        # calculate the median value
        median_value = df[column].median(axis=0)

        # check if column is skewed to the right
        if mean_value > median_value:
            skewed_right.append(column)

        # check if column is skewed to the left
        elif mean_value < median_value:
            skewed_left.append(column)

        # call the function to check for outliers
        find_outliers(column)

    # store the summary description text
    result_str = "This gives the statistical information of the numerical columns. The summary of the dataset looks perfect since there is no negative/invalid values present." + "\n" + \
    "1. The counts of all the columns are the same which means there are no missing values in the dataset." + "\n" + \
    f"2. The mean value is greater than median (50%) in '{', '.join([str(e) for e in skewed_right])}' columns which means the data is skewed to right in these column." + "\n" + \
    f"3. The data in the '{', '.join([str(e) for e in skewed_left])}' columns have mean value less tha median value which means the data is skewed to the left."

    # `found_outliers[0]` raised IndexError when no column had outliers, so the
    # "no outliers" branch could never run; test the list itself instead.
    if outliers_columns:
        result_str += "\n" + f"4. By summarizing the data we can observe that there is a huge difference between 75% and max in '{', '.join([str(e) for e in outliers_columns])}' columns hence there are outliers present in the data."

    else:
        result_str += "\n" + '4. By summarizing the data we can observe that there is no huge difference between 75% and max hence there are no outliers present in the data.'

    # add the final text to description text
    result_str += "\n" + "5. We can also notice the standard deviation, min, 25% percentile values from this described method."

    return result_str

In [None]:
# statistical summary of numerical columns
df.describe()

In [None]:
# display the summary text using markdown
display(Markdown(summary_table()))

Let's visualize this here.

In [None]:
# Box-plot every numerical column and collect the columns that contain outliers

# Reset the list, then let find_outliers() do the detection. It appends to the
# module-level `found_outliers` / `outliers_columns`, so the IQR rule is defined
# in one place instead of being copied here.
outliers_columns = []
for column in has_numerical:
    find_outliers(column)

num_plots = len(has_numerical)

# Nothing to draw for an empty column list (a zero-sized figure/grid would raise)
if num_plots > 0:
    # Two plots per row, rounded up so an odd number of columns keeps its last plot
    n_rows = (num_plots + 1) // 2

    plt.figure(figsize = (num_plots + 4, num_plots * 2), facecolor = "white")

    # first colour of the palette the plots were previously styled with
    box_color = sns.color_palette("Set2_r")[0]

    for plot_number, column in enumerate(has_numerical, start = 1):
        ax = plt.subplot(n_rows, 2, plot_number)
        # pass the data by keyword so the plot does not depend on seaborn's
        # positional-argument order
        sns.boxplot(x = df[column], color = box_color, ax = ax)
        plt.xlabel(column, fontsize = 14)
        plt.yticks(rotation = 0, fontsize = 14)

    plt.tight_layout()

There are outliers in the dataset as we can see from the above plot and we are mindful they might have undue influence on our dataset.

In [None]:
# assign the columns to remove outliers if necessary
# columns_to_remove_outliers = ['column1', 'columns2','...']
# Take a copy: a plain assignment would alias `outliers_columns`, so any later
# append to that list would silently change the columns treated below.
columns_to_remove_outliers = list(outliers_columns)

## Hybrid outliers Handling Approach

In [None]:
# function to treat outliers
def treat_outliers(df, columns):
    """Treat IQR outliers in the given columns of `df`, modifying `df` in place.

    For each column, values outside the fences `q1 - 1.5 * iqr` and
    `q3 + 1.5 * iqr` are outliers. If the rounded share of outliers is at most
    5%, they are capped at the nearest fence; otherwise they are replaced by
    the column mean (computed before any replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    columns : iterable of labels
        Columns to treat.

    Returns
    -------
    None
        Errors are reported with a printed message rather than raised.
    """
    try:
        # loop through all column list
        for column in columns:
            values = df[column]

            # quartiles, mean and inter-quartile range of the untreated column
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            mean = values.mean()
            iqr = q3 - q1

            # upper and lower whiskers
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)

            # Strict comparisons, evaluated once before anything is overwritten.
            # With `<=` / `>=`, a column whose IQR is 0 had every row equal to
            # the quartile flagged, so the whole column was replaced by its mean.
            below = values < lower_bound
            above = values > upper_bound

            outlier_count = int((below | above).sum())
            if outlier_count == 0:
                continue

            # percentage of rows that are outliers
            percentage_value = round((outlier_count / len(values)) * 100, 2)

            if round(percentage_value) <= 5:
                # few outliers: cap them at the IQR fences
                low_fill, high_fill = lower_bound, upper_bound
            else:
                # many outliers: replace them with the column mean
                low_fill = high_fill = mean

            # Upcast integer columns explicitly before writing a fractional
            # value, rather than relying on pandas to upcast on assignment.
            needs_float = any(
                mask.any() and not float(fill).is_integer()
                for mask, fill in ((below, low_fill), (above, high_fill))
            )
            if needs_float and pd.api.types.is_integer_dtype(df[column]):
                df[column] = df[column].astype("float64")

            df.loc[below, column] = low_fill
            df.loc[above, column] = high_fill

        # return
        return

    # Exception (not BaseException) so Ctrl-C / SystemExit are not swallowed
    except Exception as error:
        print('\nPlease ensure the dataframe name is correct and the target column is entered correctly: {}'.format(error))

In [None]:
# treating outliers (modifies df in place)
treat_outliers(df, columns_to_remove_outliers)

# Re-draw the box plots of the treated columns
num_plots = len(columns_to_remove_outliers)

# Nothing to draw when no column needed treatment (a zero-sized figure/grid would raise)
if num_plots > 0:
    # Two plots per row, rounded up so an odd number of columns keeps its last plot
    n_rows = (num_plots + 1) // 2

    plt.figure(figsize = (num_plots + 4, num_plots * 2), facecolor = "white")

    # first colour of the palette the plots were previously styled with
    box_color = sns.color_palette("Set2_r")[0]

    for plot_number, column in enumerate(columns_to_remove_outliers, start = 1):
        ax = plt.subplot(n_rows, 2, plot_number)
        # pass the data by keyword so the plot does not depend on seaborn's
        # positional-argument order
        sns.boxplot(x = df[column], color = box_color, ax = ax)
        plt.xlabel(column, fontsize = 14)
        plt.yticks(rotation = 0, fontsize = 14)

    plt.tight_layout()

As we can see from the above plots, the columns with outliers have been treated.

### Encoding categorical columns

In [None]:
# Replace every object-typed column with the ordinal codes of its values
from sklearn.preprocessing import OrdinalEncoder
OE = OrdinalEncoder()

# object columns are identified first, then encoded one at a time
object_columns = [i for i in df.columns if df[i].dtypes == 'object']
for i in object_columns:
    # the encoder expects a 2-D input, hence the single-column reshape
    single_column = df[i].values.reshape(-1, 1)
    df[i] = OE.fit_transform(single_column)

df

In [None]:
df.info()

We have converted the categorical columns into numerical columns using the Ordinal Encoding method.

In [None]:
# statistical summary of numerial
df.describe()

After encoding the categorical columns we can see all column details here. The counts of all the columns are the same which means there are no null values in the data. This describe method offers insight into more details of the count, mean, std, min, IQR and max values of all the columns.

## Visualizing the correlation between label and features using bar plot

In [None]:
plt.figure(figsize = (22, 7))

# Correlation of every column with "Target", strongest positive first;
# the trivial self-correlation entry is removed before plotting
target_corr = df.corr()["Target"]
target_corr = target_corr.sort_values(ascending = False)
target_corr = target_corr.drop(["Target"])
target_corr.plot(kind='bar', color="m")

plt.xlabel('Feature', fontsize = 15)
plt.ylabel('Target', fontsize = 15)
plt.title('Correlation between label and features using barplot', fontsize=20)
plt.show()

The above barplot displays the correlation between the label and the features

### Separating features and the target variable

In [None]:
# y is the label column; x is every other column
y = df["Target"]
x = df.drop(columns = "Target")

Defining and separating the features and the target variable.

### Feature Scaling using Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column with StandardScaler
scaler = StandardScaler()

# Rebuild the frame with the original column names AND the original row index,
# so x stays index-aligned with y (omitting `index` replaced it with 0..n-1).
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence training — consider fitting on the training split only.
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns, index = x.index)
x

We have scaled the data using the standardization method (StandardScaler) to overcome the bias that features measured on larger scales would otherwise introduce.

In [None]:
y.value_counts()

In [None]:
# visualising the target variable unique value
ax = sns.countplot(x=y, data=df)

## Model building (without SMOTE) and Evaluation

### Logistic Regression

In [None]:
# split data into train and test sets
# This section is the "without SMOTE" baseline, so it must use the original x / y.
# x1 / y1 are the SMOTE-resampled data and are only created further down, so using
# them here raised NameError on a fresh run (or silently used resampled data).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 10)

# create the model
model = LogisticRegression()

# train the model
model.fit(x_train, y_train)

# evaluate the model performance on the held-out 30%
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Random Forest

In [None]:
# split data into train and test sets (original, non-resampled data)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 10)


# create the model; seeded so the reported scores are the same on every run
model = RandomForestClassifier(random_state = 42)

# train the model
model.fit(x_train, y_train)

# evaluate the model performance on the held-out 30%
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Gradient Boosting

In [None]:
# create the model; seeded so the reported scores are the same on every run
model = GradientBoostingClassifier(random_state = 42)

# train the model on the split produced by the preceding cell
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Decision Tree

In [None]:
# create the model; seeded so the reported scores are the same on every run
model = DecisionTreeClassifier(random_state = 42)

# train the model on the split produced earlier in this section
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## SVM

In [None]:
# Support-vector classifier with scikit-learn's default settings, fitted on
# the split currently held in x_train / y_train
model = SVC()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

## K-NN

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings, fitted
# on the split currently held in x_train / y_train
model = KNeighborsClassifier()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

## Model comparison (without SMOTE)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.datasets import load_iris  # Example dataset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize models; the stochastic estimators are seeded so the comparison
# table is the same on every run
models = [
    ("Logistic Regression", make_pipeline(StandardScaler(), LogisticRegression())),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("SVM", make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))),
    ("KNN", make_pipeline(StandardScaler(), KNeighborsClassifier())),
    ("Decision Tree", DecisionTreeClassifier(random_state=42))
]

# Prepare a list for storing results
results = []

# Train, predict, and evaluate each model on the split currently held in
# x_train / x_test / y_train / y_test (set by an earlier cell)
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Collecting metrics ('weighted' averages the per-class scores by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Appending results
    results.append([name, accuracy, f1, precision, recall])

# Creating a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

# Rich display renders the table as HTML instead of plain text
display(results_df)


## Oversampling Method (SMOTE)

In [None]:
# oversampling the data; seeded so the synthetic samples (and every score
# computed from them) are the same on every run
# NOTE(review): resampling happens before the train/test split, so synthetic rows
# derived from samples that later land in the test set can end up in training —
# consider resampling the training split only.
SM = SMOTE(random_state = 42)
x1, y1 = SM.fit_resample(x, y)

We applied SMOTE here. SMOTE which stands for Synthetic Minority Over-sampling Technique, is a powerful technique for dealing with unbalanced dataset by generating synthetic samples, thus facilitating more robust and accurate machine learning models, especially in the cases where the minority class is of great interest.

In [None]:
y1.value_counts()

In [None]:
# visualising the target variable unique value
ax = sns.countplot(x=y1, data=df)

We now have balanced data.

## Model Building with SMOTE and Evaluation

We aim to build six machine learning models, see the performance of each model and select the best performed model for the project

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Logistic Regression

In [None]:
# Hold out 30% of the SMOTE-resampled data for testing
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.30, random_state = 10)

# Logistic regression with scikit-learn's default settings
model = LogisticRegression()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

We divided our dataset into training (70%) and testing (30%) sets, then built and evaluated a Logistic Regression model with the metrics displayed above.

## Random Forest

In [None]:
# create the model; seeded so the reported scores are the same on every run
model = RandomForestClassifier(random_state = 42)

# train the model on the resampled split produced by the preceding cell
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Decision Tree

In [None]:
# create the model; seeded so the reported scores are the same on every run
model = DecisionTreeClassifier(random_state = 42)

# train the model on the resampled split produced earlier in this section
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Gradient Boosting

In [None]:
# create the model; seeded so the reported scores are the same on every run
model = GradientBoostingClassifier(random_state = 42)

# train the model on the resampled split produced earlier in this section
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Support Vector Machine (SVM)

In [None]:
# Support-vector classifier with scikit-learn's default settings, fitted on
# the split currently held in x_train / y_train
model = SVC()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

## K-Nearest Neighbors

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings, fitted
# on the split currently held in x_train / y_train
model = KNeighborsClassifier()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

We have trained and evaluated six machine learning models: Logistic Regression, Decision Tree, Random Forest, Gradient Boosting, Support Vector Machine and K-Nearest Neighbors. The performance of each model can be seen in the accuracy, precision, recall and F1-score values displayed above.

## Models Comparison

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.datasets import load_iris  # Example dataset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize models; the stochastic estimators are seeded so the comparison
# table is the same on every run
models = [
    ("Logistic Regression", make_pipeline(StandardScaler(), LogisticRegression())),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("SVM", make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))),
    ("KNN", make_pipeline(StandardScaler(), KNeighborsClassifier())),
    ("Decision Tree", DecisionTreeClassifier(random_state=42))
]

# Prepare a list for storing results
results = []

# Train, predict, and evaluate each model on the split currently held in
# x_train / x_test / y_train / y_test (set by an earlier cell)
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Collecting metrics ('weighted' averages the per-class scores by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Appending results
    results.append([name, accuracy, f1, precision, recall])

# Creating a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

# Rich display renders the table as HTML instead of plain text
display(results_df)


By comparing the performance of the six models, we can see that Random Forest ranked as the best-performing model with an accuracy of 83%. We can also see that all the models generalized well, as they maintain balanced scores across the other metrics such as F1-score, precision and recall.

In [None]:
# Positions, within the full scaled feature frame `x`, of the ten features kept
# for retraining
SELECTED_FEATURE_POSITIONS = [3, 6, 12, 13, 22, 24, 25, 28, 30, 31]

# Resolve the positions to column names on `x` (which always holds every feature)
# and select by name. Slicing x1 by position into itself raised IndexError when
# the cell was re-run, because x1 then only had ten columns left.
selected_features = x.columns[SELECTED_FEATURE_POSITIONS]
x1 = x1[selected_features]
x1

Here, we selected features that have strong influence or correlation with the target variable. The next is to retrain our models with the selected 10 features. 

# Retraining the Models on the Selected Features

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Logistic Regression

In [None]:
# Hold out 20% of the selected-feature data for testing
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# Logistic regression with scikit-learn's default settings
model = LogisticRegression()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

## Random Forest

In [None]:
# split data into train and test sets
# random_state = 0 matches the Logistic Regression / SVM / K-NN cells and the
# comparison cell of this section, so every model is scored on the same test rows
# (this cell previously used 10).
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# create the model; seeded so the reported scores are the same on every run
model = RandomForestClassifier(random_state = 42)

# train the model
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Decision Tree

In [None]:
# split data into train and test sets
# random_state = 0 matches the Logistic Regression / SVM / K-NN cells and the
# comparison cell of this section, so every model is scored on the same test rows
# (this cell previously used 10).
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# create the model; seeded so the reported scores are the same on every run
model = DecisionTreeClassifier(random_state = 42)

# train the model
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Gradient Boosting

In [None]:
# split data into train and test sets
# random_state = 0 matches the Logistic Regression / SVM / K-NN cells and the
# comparison cell of this section, so every model is scored on the same test rows
# (this cell previously used 10).
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# create the model; seeded so the reported scores are the same on every run
model = GradientBoostingClassifier(random_state = 42)

# train the model
model.fit(x_train, y_train)

# evaluate the model performance
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_test, prediction))

## Support Vector Machine (SVM)

In [None]:
# Hold out 20% of the selected-feature data for testing
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# Support-vector classifier with scikit-learn's default settings
model = SVC()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

## K-Nearest Neighbors

In [None]:
# Hold out 20% of the selected-feature data for testing
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# k-nearest-neighbours classifier with scikit-learn's default settings
model = KNeighborsClassifier()
model = model.fit(x_train, y_train)

# Score the held-out rows
prediction = model.predict(x_test)
accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
print("Accuracy: ", accuracy)
print(classification_report(y_true = y_test, y_pred = prediction))

We can see that after retraining the models with the selected features, Random Forest still emerged as the best-performing model. The next step is to tune our model.

## Models Comparison with the Selected Features

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.datasets import load_iris  # Example dataset
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# split data into train and test sets (selected features, 20% held out)
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size = 0.20, random_state = 0)

# Initialize models; the stochastic estimators are seeded so the comparison
# table is the same on every run
models = [
    ("Logistic Regression", make_pipeline(StandardScaler(), LogisticRegression())),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("SVM", make_pipeline(StandardScaler(), SVC(probability=True, random_state=42))),
    ("KNN", make_pipeline(StandardScaler(), KNeighborsClassifier())),
    ("Decision Tree", DecisionTreeClassifier(random_state=42))
]

# Prepare a list for storing results
results = []

# Train, predict, and evaluate each model on the same split
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Collecting metrics ('weighted' averages the per-class scores by class support)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Appending results
    results.append([name, accuracy, f1, precision, recall])

# Creating a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

# Rich display renders the table as HTML instead of plain text
display(results_df)

### K-Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Model definition
model = RandomForestClassifier(random_state=42)

# Setup KFold: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []
for train_index, test_index in kf.split(x1):
    # kf.split yields integer positions, so both the features and the labels are
    # sliced with .iloc. Plain y1[train_index] is label-based and only gives the
    # same rows while y1 carries a default 0..n-1 index.
    x_train, x_test = x1.iloc[train_index], x1.iloc[test_index]
    y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]

    # Train the model
    model.fit(x_train, y_train)

    # Make predictions and evaluate
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    scores.append(accuracy)

# NOTE: x_train / x_test / y_train / y_test now hold the LAST fold; later cells
# that reuse these names are evaluated on that fold.

# Convert scores to a numpy array for mean and std calculations
scores = np.array(scores)

# Print the accuracy for each fold
print(f"Accuracy for each fold: {scores}")

# Print the mean accuracy and standard deviation
print(f"Mean accuracy: {scores.mean()}, Standard deviation: {scores.std()}")


### AUC-ROC Scores and Multi-class ROC Curve

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

# Define and fit the model on the split currently held in x_train / y_train
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Predict class labels for the test set
y_pred = model.predict(x_test)

# Predict probabilities for all classes (one column per entry of model.classes_)
y_probs = model.predict_proba(x_test)

# Calculate AUC-ROC Score
# For multiclass, one-vs-rest: one ROC AUC per class, then averaged
roc_auc = roc_auc_score(y_test, y_probs, multi_class="ovr")

# Calculate metrics using 'macro' average for multiclass data
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {roc_auc}")

# Plotting ROC Curve for Multiclass  one-vs-rest
fpr = {}
tpr = {}
thresh ={}

# Take the classes from the fitted model rather than from the labels that happen
# to occur in y_test: probability column i belongs to model.classes_[i], which is
# not guaranteed to be the integer i.
n_class = len(model.classes_)

for i, class_label in enumerate(model.classes_):
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_probs[:,i], pos_label=class_label)

plt.figure(figsize=(6, 4))
for i, class_label in enumerate(model.classes_):
    plt.plot(fpr[i], tpr[i], linestyle='--', label=f'Class {class_label} vs Rest')
# diagonal = chance level
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC curve')
plt.legend(loc="lower right")
plt.show()


## ROC for Multi-class

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the test labels into one indicator column per class. The class list comes
# from the fitted model so the columns line up with predict_proba's columns instead of
# relying on a hard-coded [0, 1, 2].
# NOTE: label_binarize returns a single column for two-class problems, so this cell
# assumes three or more classes.
y_test_binarized = label_binarize(y_test, classes=model.classes_)
n_classes = y_test_binarized.shape[1]

# Predicted probability of each class for every test row.
y_scores = model.predict_proba(x_test)

# ROC curve and area under it, class by class (one-vs-rest).
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting: iterate over the classes (not over the colour list) and cycle the palette,
# so the figure still works when the number of classes differs from the number of colours.
plt.figure(figsize=(8, 6))
colors = ['aqua', 'darkorange', 'cornflowerblue']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i % len(colors)], lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(model.classes_[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multi-Class')
plt.legend(loc="lower right")
plt.show()


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Train the seeded random forest on the training split.
model = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = model.predict(x_test)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap and label it through the returned axes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues').set(
    title='Confusion Matrix',
    ylabel='True Label',
    xlabel='Predicted Label',
)
plt.show()


In [None]:
# Train the seeded random forest on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Importances are read from the fitted forest and paired with the training column names.
feature_importances = model.feature_importances_
feature_names = x_train.columns

# A Series keyed by feature name makes ranking and plotting straightforward.
importances = pd.Series(feature_importances, index=feature_names)

# Plot the 10 most important features (adjust the number as needed); sorting ascending
# puts the most important feature at the top of the horizontal bar chart.
plt.figure(figsize=(12, 8))
importances.nlargest(10).sort_values().plot(kind='barh')
plt.title('Feature Importance')
# Random-forest importances are impurity-based, not an "F score".
plt.xlabel('Importance (mean decrease in impurity)')
plt.ylabel('Features')
plt.show()



We tuned the best-performing model with GridSearchCV to reduce model bias.

# Explainable (XAI) Machine Learning Model
Explainable Artificial Intelligence (XAI) refers to methods and techniques in the field of artificial intelligence (AI) that make the results and operations of AI systems understandable and interpretable to humans. Therefore, we deploy SHAP to achieve this.

SHAP (SHapley Additive exPlanations) is a game theory-based approach for explaining the output of any machine learning model. It offers insights into how each feature in your dataset contributes to the prediction for each individual observation. SHAP values provide a measure of the impact of each feature on the prediction, compared to the average prediction for the dataset. This approach is grounded in the principles of cooperative game theory and offers a consistent and fair method to distribute the "payout" (i.e., the prediction) among the "players" (i.e., the features)
## Why SHAP?
### Individual Prediction Explanation:
SHAP can explain the output of the model for individual predictions, which is especially useful in applications where understanding why a model made a specific prediction is important.
### Global Understanding:
SHAP also provides global insights by aggregating individual explanations, helping to understand how features generally impact predictions across the dataset.
### Consistency and Fairness:
SHAP values have desirable properties like consistency and local accuracy, which support fairness and transparency in model explanations.

In [None]:
# Retrain the seeded random forest on the training split; `model` is the estimator
# that the SHAP cells below explain.
model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
model


In [None]:
pip install shap

In [None]:
import shap

# Explain the trained RandomForestClassifier on the test-set features.
explainer = shap.TreeExplainer(model)
shap_raw = explainer.shap_values(x_test)

# The cells below index `shap_values[k]` to get the values of class k. Older shap
# releases return a list with one (n_samples, n_features) array per class, while newer
# releases return a single (n_samples, n_features, n_classes) array, for which
# `shap_values[k]` would select sample k instead. Normalise to the per-class list form
# so the downstream cells behave the same on either version.
if isinstance(shap_raw, list):
    shap_values = shap_raw
elif np.ndim(shap_raw) == 3:
    shap_values = [shap_raw[:, :, k] for k in range(shap_raw.shape[2])]
else:
    shap_values = shap_raw



## Shap Summary Plot for All Classes

In [None]:
# Bar summary across every class: SHAP values for all classes, plotted against the test features.
shap.summary_plot(shap_values, features=x_test, plot_type="bar")


The summary plot above shows the average impact (mean absolute SHAP value) of each feature on the model's prediction, broken down by class. This gives more insight into which features contribute most to the model's decisions. Because the bars show magnitudes, they indicate how strongly, not in which direction, a feature influences the prediction.

## Shap Summary Plot for Class 2

In [None]:
# Bar summary restricted to the SHAP values stored at index 2 (class 2).
shap.summary_plot(shap_values[2], features=x_test, plot_type="bar")


## Shap Summary Plot for Class 1

In [None]:
# Bar summary restricted to the SHAP values stored at index 1 (class 1), drawn in green.
shap.summary_plot(shap_values[1], features=x_test, plot_type="bar", color='green')


## Shap Summary Plot for Class 0

In [None]:
# Bar summary restricted to the SHAP values stored at index 0 (class 0), drawn in pink.
shap.summary_plot(shap_values[0], features=x_test, plot_type="bar", color='pink')

The three bar plots above show the SHAP summary for each of the three classes, making clear how much each feature contributes to the prediction of each class.


In [None]:
# Initialize SHAP's JavaScript visualization support in the notebook.
# NOTE(review): the force plots further down are drawn with matplotlib=True, so this
# call is presumably only needed for interactive (JS) plots; confirm before removing.
shap.initjs()

In [None]:
# Create a TreeExplainer for the fitted `model`. This rebinds `explainer`, which the
# earlier SHAP cell already built from the same `model`.
explainer = shap.TreeExplainer(model)

In [None]:
# Calculate SHAP values for x_test
shap_raw = explainer.shap_values(x_test)

# Older shap releases return a list with one (n_samples, n_features) array per class;
# newer releases return a single (n_samples, n_features, n_classes) array. Normalise to
# the per-class list form so `shap_values[class_idx]` means "values for that class" on
# either version (the dependence plots below rely on the same indexing).
if isinstance(shap_raw, list):
    shap_values = shap_raw
elif np.ndim(shap_raw) == 3:
    shap_values = [shap_raw[:, :, k] for k in range(shap_raw.shape[2])]
else:
    shap_values = shap_raw

# Define class names for readability
class_names = ['DROPOUT', 'ENROLLED', 'GRADUATE']

# Feature names are taken from the same frame the explained row comes from, so names
# and values cannot drift apart.
test_feature_names = x_test.columns.tolist()

# Generate and display one force plot per class for the first instance in x_test
for class_idx in range(model.n_classes_):
    # Say which class the following figure belongs to; the plot itself carries no class label.
    print(f"Force plot for class {class_idx} ({class_names[class_idx]})")
    shap.force_plot(
        base_value=explainer.expected_value[class_idx],  # average model output for this class
        shap_values=shap_values[class_idx][0],           # contributions for the first test row
        features=x_test.iloc[0, :],
        feature_names=test_feature_names,
        matplotlib=True                                  # static figure instead of the JS widget
    )


The SHAP force plots above, one per class, show the impact of each feature on the prediction for the first test instance. Features shown in red push the prediction higher, while features shown in blue push it lower.

In [None]:
# Dependence of the class-2 SHAP values on the number of approved 2nd-semester units.
shap.dependence_plot(
    'Curricular units 2nd sem (approved)',
    shap_values=shap_values[2],
    features=x_test,
)


In [None]:
# Dependence of the class-2 SHAP values on the number of approved 1st-semester units.
shap.dependence_plot(
    'Curricular units 1st sem (approved)',
    shap_values=shap_values[2],
    features=x_test,
)
