In [136]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [137]:
def is_binary(df, column_name):
    #Check if a column has only two unique values.
    return len(df[column_name].unique()) == 2


def is_string(df, column_name):
    #Check if a column contains non-numeric values.
    for i in df[column_name]:
        if not str(i).isdigit():
            return True
    return False


def convert_binary_to_numeric(df, column_name):
    #Convert binary categorical columns to numeric values (0/1).
    unique_values = df[column_name].unique()
    if set(unique_values) == set(['Yes', 'No']):
        value_map = {"No": 0, "Yes": 1}
    else:
        value_map = {val: idx for idx, val in enumerate(unique_values)}
    
    df[column_name] = df[column_name].apply(lambda x: value_map[x])
    print(f"The binary column \"{column_name}\" is mapped based on: {value_map}")
    return df

In [138]:
def encode_categorical_columns(df):
    #Label encode non-numeric and binary categorical columns.
    label_encoder = LabelEncoder()
    df_encoded = df.copy()

    for column in df.columns:
        if is_string(df, column):
            if is_binary(df, column):
                df_encoded = convert_binary_to_numeric(df_encoded, column)
            else:
                df_encoded[column] = label_encoder.fit_transform(df[column])
                # Print the mapping between original categories and encoded labels
                mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
                print(f"Mapping for column {column}: {mapping}")
        else:
            df_encoded[column] = df[column]
    
    return df_encoded

In [139]:
# Model Training and Evaluation
def evaluate_model(classifier, X, y, k_splits=5):
    #Evaluate a classifier using KFold cross-validation and print confusion matrices.
    kf = KFold(n_splits=k_splits, shuffle=True)
    conf_matrices = []
    reports = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Fit the classifier
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        
        # Compute confusion matrix and classification report
        conf_matrix = confusion_matrix(y_test, y_pred)
        report_matrix = classification_report(y_test, y_pred)
        
        conf_matrices.append(conf_matrix)
        reports.append(report_matrix)
    
    # Display confusion matrices and reports for each fold
    for i, conf_matrix in enumerate(conf_matrices):
        print(f"Fold {i+1}")
        print(f"Classification Report:\n{reports[i]}")
        print(f"Confusion Matrix:\n{conf_matrix}\n")

In [140]:
def holdout_evaluation(classifier, X_train, X_test, y_train, y_test):
    #Evaluate a classifier on holdout test data.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # Print classification report and confusion matrix
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    print(f"Classification Report:\n{report}")
    print(f"Confusion Matrix:\n{conf_matrix}\n")

In [141]:
def oversample_data(X, y):
    #Perform SMOTE oversampling on the dataset.
    oversampler = SMOTE(random_state=42)
    X_over, y_over = oversampler.fit_resample(X, y)
    
    print(f"Original class distribution: {Counter(y)}")
    print(f"Oversampled class distribution: {Counter(y_over)}")
    
    return X_over, y_over


def undersample_data(X, y):
    #Perform Random undersampling on the dataset.
    undersampler = RandomUnderSampler(random_state=42)
    X_under, y_under = undersampler.fit_resample(X, y)
    
    print(f"Original class distribution: {Counter(y)}")
    print(f"Undersampled class distribution: {Counter(y_under)}")
    
    return X_under, y_under

In [142]:
# Visualization Functions
def plot_histogram(df, column, title):
    #Plot a histogram for a categorical column.
    plt.figure(figsize=(12, 8))
    plt.hist(df[column], bins=len(df[column].unique()), color='skyblue', edgecolor='black')
    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.grid(axis='y')
    plt.show()



In [143]:
df = pd.read_csv('Responses.csv')
    
df = df.drop(['Family_Issues', "University_Activities", "Teams","Timestamp"], axis=1)

In [144]:
df_encoded = encode_categorical_columns(df)
X = df_encoded.drop(columns=['Absence'])
y = df_encoded['Absence']

The binary column "Sex" is mapped based on: {'M': 0, 'F': 1}
Mapping for column University: {'AAB': 0, 'AAU': 1, 'AHU': 2, 'AMAU': 3, 'ASU': 4, 'AUM': 5, 'BAUM': 6, 'GJU': 7, 'HTU': 8, 'HU': 9, 'INU': 10, 'IU': 11, 'JU': 12, 'JUST': 13, 'MEU': 14, 'MU': 15, 'PSUT': 16, 'TTU': 17, 'UOP': 18, 'WISE': 19, 'YU': 20, 'ZU': 21, 'ZUJ': 22, 'esrs.unrwa': 23, 'jadara': 24}
Mapping for column collage: {'Allied Medical Sciences': 0, 'Arts and Design': 1, 'Business': 2, 'Childhood': 3, 'Dentistry': 4, 'Educational Sciences': 5, 'Engineering': 6, 'Foreign Languages': 7, 'Information Technology': 8, 'Literature': 9, 'Mathematics': 10, 'Medicine': 11, 'Natural Resources': 12, 'Nursing': 13, 'Pharmacy': 14, 'Rights': 15, 'Science': 16, 'Sharia': 17, 'Tourism': 18}
Mapping for column Hour_fees: {'(10 - 40) JD': 0, '(45 - 80) JD': 1, '(80+) JD': 2}
Mapping for column Grade: {'A': 0, 'B': 1, 'C': 2, 'D': 3}
The binary column "Absence" is mapped based on: {'No': 0, 'Yes': 1}
The binary column "teacher_int

In [145]:
# Split into train-test sets for holdout validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [146]:
# Initialize classifiers
dt_classifier = DecisionTreeClassifier()
nb_classifier = GaussianNB()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
svm_classifier = SVC()
    

# Evaluate models with KFold cross-validation
print("Evaluating Decision Tree Classifier:")
evaluate_model(dt_classifier, X.to_numpy(), y.to_numpy())
    
print("Evaluating Naive Bayes Classifier:")
evaluate_model(nb_classifier, X.to_numpy(), y.to_numpy())
    
print("Evaluating KNN Classifier:")
evaluate_model(knn_classifier, X.to_numpy(), y.to_numpy())
    
print("Evaluating SVM Classifier:")
evaluate_model(svm_classifier, X.to_numpy(), y.to_numpy())
    
    
# Holdout evaluation
print("Holdout Evaluation for Decision Tree:")
holdout_evaluation(dt_classifier, X_train, X_test, y_train, y_test)    

Evaluating Decision Tree Classifier:
Fold 1
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.56      0.57       183
           1       0.70      0.72      0.71       262

    accuracy                           0.66       445
   macro avg       0.64      0.64      0.64       445
weighted avg       0.65      0.66      0.66       445

Confusion Matrix:
[[103  80]
 [ 73 189]]

Fold 2
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.68      0.65       186
           1       0.75      0.71      0.73       258

    accuracy                           0.70       444
   macro avg       0.69      0.69      0.69       444
weighted avg       0.70      0.70      0.70       444

Confusion Matrix:
[[126  60]
 [ 75 183]]

Fold 3
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.61      0.59       176
           1       0.73   

In [147]:
# Perform SMOTE oversampling
X_over, y_over = oversample_data(X.to_numpy(), y.to_numpy())
    
print("Evaluating SVM Classifier with Oversampled Data:")
evaluate_model(svm_classifier, X_over, y_over)

Original class distribution: Counter({1: 1329, 0: 892})
Oversampled class distribution: Counter({1: 1329, 0: 1329})
Evaluating SVM Classifier with Oversampled Data:
Fold 1
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       260
           1       0.79      0.74      0.76       272

    accuracy                           0.76       532
   macro avg       0.76      0.76      0.76       532
weighted avg       0.76      0.76      0.76       532

Confusion Matrix:
[[205  55]
 [ 71 201]]

Fold 2
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75       269
           1       0.76      0.69      0.72       263

    accuracy                           0.74       532
   macro avg       0.74      0.74      0.74       532
weighted avg       0.74      0.74      0.74       532

Confusion Matrix:
[[210  59]
 [ 81 182]]

Fold 3
Classification Report:
       

In [148]:
# Perform undersampling
X_under, y_under = undersample_data(X.to_numpy(), y.to_numpy())
    
print("Evaluating SVM Classifier with Undersampled Data:")
evaluate_model(svm_classifier, X_under, y_under)

Original class distribution: Counter({1: 1329, 0: 892})
Undersampled class distribution: Counter({0: 892, 1: 892})
Evaluating SVM Classifier with Undersampled Data:
Fold 1
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.79      0.74       170
           1       0.78      0.67      0.72       187

    accuracy                           0.73       357
   macro avg       0.73      0.73      0.73       357
weighted avg       0.74      0.73      0.73       357

Confusion Matrix:
[[135  35]
 [ 62 125]]

Fold 2
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74       176
           1       0.75      0.73      0.74       181

    accuracy                           0.74       357
   macro avg       0.74      0.74      0.74       357
weighted avg       0.74      0.74      0.74       357

Confusion Matrix:
[[132  44]
 [ 49 132]]

Fold 3
Classification Report:
       