In [None]:
                        #### All Features Code ####
    
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from category_encoders import BinaryEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ---- Data loading and preparation (all-features experiment) ----
# Locations of the two CSV exports consumed by this cell.
stu_csv_path_all = r"C:\Users\ah76\Documents\StudentData.csv"
vle_csv_path_all = r"C:\Users\ah76\Documents\VLEData.csv"

# Both files are comma-separated and decoded as latin1.
stu_df_all = pd.read_csv(stu_csv_path_all, sep=",", encoding="latin1")
vle_df_all = pd.read_csv(vle_csv_path_all, sep=",", encoding="latin1")

# The id column of the VLE file arrives with a three-character prefix
# (looks like a UTF-8 byte-order mark decoded as latin1); restore its plain name
# so it can serve as the join key.
vle_df_all = vle_df_all.rename(columns={'ï»¿id_student': 'id_student'})

# Left join: every student row is kept, VLE columns are attached where available.
data_all = pd.merge(stu_df_all, vle_df_all, how="left", on="id_student")

# Columns excluded from modelling.
columns_to_remove_all = ['imd_band', 'Lookup2', 'Lookup', 'id_student', 'date_registration', 'date_unregistration']
data_all = data_all.drop(columns_to_remove_all, axis=1)

# Module / presentation codes are binary-encoded by category_encoders.
binary_cols_all = ['code_module_x', 'code_presentation_x']
encoder_all = BinaryEncoder(cols=binary_cols_all)
data_encoded_all = encoder_all.fit_transform(data_all)

# Binary target: 1 for 'Withdrawn', 0 for every other final_result value.
data_encoded_all['final_result'] = data_encoded_all['final_result'].map(
    lambda outcome: int(outcome == 'Withdrawn')
)

# Target vector (y) and feature matrix (X).
y_all = data_encoded_all['final_result']
X_all = data_encoded_all.drop(columns=['final_result'])

# Fill gaps in the listed numeric columns with each column's median.
median_fill_columns_all = ['module_presentation_length_x']
X_all[median_fill_columns_all] = X_all[median_fill_columns_all].fillna(
    X_all[median_fill_columns_all].median()
)

# ---- Column-wise preprocessing used inside every model pipeline ----
# Columns are routed by dtype: int64/float64 are treated as numeric,
# object columns as categorical.
numeric_features = X_all.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_all.select_dtypes(include=['object']).columns

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the literal category 'missing',
# then one-hot encoding; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches into a single transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# ---- Models evaluated with 5-fold cross-validation on all features ----
# random_state is pinned on the estimators that use randomness so that the
# saved reports are reproducible between runs.
lr_model_all = LogisticRegression(C=1.0, max_iter=10000)
rf_model_all = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42)
dt_model_all = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42)
gb_model_all = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
knn_model_all = KNeighborsClassifier(n_neighbors=5)

# Display name -> unfitted estimator. The names are later used as Excel sheet names.
models_all = {
    'Logistic Regression': lr_model_all,
    'Random Forest': rf_model_all,
    'Decision Tree': dt_model_all,
    'Gradient Boosting': gb_model_all,
    'K-Nearest Neighbors': knn_model_all
}

# Display name -> out-of-fold predictions for every row of X_all.
# This dictionary must exist before the loop assigns into it; previously it
# was never created and the first assignment raised NameError.
model_predictions_all = {}

for model_name_all, model_all in models_all.items():
    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training part of each fold only.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model_all)])
    y_pred_all_cv = cross_val_predict(pipeline, X_all, y_all, cv=5)
    model_predictions_all[model_name_all] = y_pred_all_cv

# ---- Export one classification report per model to an Excel workbook ----
output_path_all = "C:/Users/ah76/Documents/AllFeaturesAll_CV.xlsx"

# The writer is used as a context manager: the workbook is written and closed
# when the block exits, even if a report fails. (ExcelWriter.save() no longer
# exists in pandas 2.x, so the explicit save call cannot be used.)
with pd.ExcelWriter(output_path_all, engine='xlsxwriter') as excel_writer_all:
    for model_name_all, y_pred_all in model_predictions_all.items():
        # Dictionary form of the report: per-class metrics plus summary rows.
        report_all = classification_report(y_all, y_pred_all, output_dict=True)

        # Per-class entries are keyed by the string form of the label.
        class_0_metrics_all = report_all['0']
        class_1_metrics_all = report_all['1']
        accuracy_all = report_all['accuracy']
        macro_avg_all = report_all['macro avg']
        weighted_avg_all = report_all['weighted avg']

        # One row per class followed by the three summary rows. Accuracy is a
        # single number, so it is repeated across the three metric columns.
        # The Support column carries the support reported by sklearn for each
        # row; the accuracy row uses the overall count (previously it wrongly
        # repeated the accuracy value there).
        report_df_all = pd.DataFrame({
            'Class': ['0', '1', 'accuracy', 'macro avg', 'weighted avg'],
            'Precision': [class_0_metrics_all['precision'], class_1_metrics_all['precision'],
                          accuracy_all, macro_avg_all['precision'], weighted_avg_all['precision']],
            'Recall': [class_0_metrics_all['recall'], class_1_metrics_all['recall'],
                       accuracy_all, macro_avg_all['recall'], weighted_avg_all['recall']],
            'F1-Score': [class_0_metrics_all['f1-score'], class_1_metrics_all['f1-score'],
                         accuracy_all, macro_avg_all['f1-score'], weighted_avg_all['f1-score']],
            'Support': [class_0_metrics_all['support'], class_1_metrics_all['support'],
                        macro_avg_all['support'], macro_avg_all['support'], weighted_avg_all['support']],
        })

        # One sheet per model, named after the model.
        report_df_all.to_excel(excel_writer_all, sheet_name=model_name_all, index=False)


In [None]:
                #### Top 10 Features Code ####

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from category_encoders import BinaryEncoder
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# ---- Data loading and preparation (top-10-features experiment) ----
# Locations of the two CSV exports consumed by this cell.
stu_csv_path_10 = r"C:\Users\ah76\Documents\StudentData.csv"
vle_csv_path_10 = r"C:\Users\ah76\Documents\VLEData.csv"

# Both files are comma-separated and decoded as latin1.
stu_df_10 = pd.read_csv(stu_csv_path_10, sep=",", encoding="latin1")
vle_df_10 = pd.read_csv(vle_csv_path_10, sep=",", encoding="latin1")

# Restore the plain name of the VLE id column so it can be used as the join key.
vle_df_10 = vle_df_10.rename(columns={'ï»¿id_student': 'id_student'})

# Left join keeps every student row.
data_10 = pd.merge(stu_df_10, vle_df_10, how="left", on="id_student")

# Columns excluded from modelling.
columns_to_remove_10 = ['imd_band', 'Lookup2', 'Lookup', 'id_student', 'date_registration', 'date_unregistration']
data_10 = data_10.drop(columns_to_remove_10, axis=1)

# Module / presentation codes are binary-encoded.
binary_cols_10 = ['code_module_x', 'code_presentation_x']
encoder_10 = BinaryEncoder(cols=binary_cols_10)
data_encoded_10 = encoder_10.fit_transform(data_10)

# Binary target: 1 for 'Withdrawn', 0 for every other final_result value.
data_encoded_10['final_result'] = data_encoded_10['final_result'].map(
    lambda outcome: int(outcome == 'Withdrawn')
)

# Target vector (y_10) and feature matrix (X_10).
y_10 = data_encoded_10['final_result']
X_10 = data_encoded_10.drop(columns=['final_result'])

# Fill gaps in the listed numeric columns with each column's median.
median_fill_columns_10 = ['module_presentation_length_x']
X_10[median_fill_columns_10] = X_10[median_fill_columns_10].fillna(
    X_10[median_fill_columns_10].median()
)

# Expand the categorical columns into indicator columns, then replace any
# remaining missing values with 0.
categorical_cols_10 = ['gender', 'region', 'highest_education', 'age_band', 'disability', 'activity_type']
X_10 = pd.get_dummies(X_10, columns=categorical_cols_10).fillna(0)

# ---- Top-10 feature selection, cross-validated evaluation and final fit ----
# Pipeline is imported here so this cell does not rely on another cell's imports.
from sklearn.pipeline import Pipeline

# random_state is pinned on the estimators that use randomness so that the
# saved reports are reproducible between runs.
lr_model_10 = LogisticRegression(C=1.0, max_iter=10000)
rf_model_10 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42)
dt_model_10 = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42)
gb_model_10 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Display name -> estimator. The names are later used as Excel sheet names.
models_10 = {
    'Logistic Regression': lr_model_10,
    'Random Forest': rf_model_10,
    'Decision Tree': dt_model_10,
    'Gradient Boosting': gb_model_10,
}

# The univariate selection (ANOVA F-test, k=10) does not depend on the model,
# so the full-data selection is computed once rather than once per model.
# These names are used for the final fit below and recorded per model.
selector_10 = SelectKBest(score_func=f_classif, k=10)
selector_10.fit(X_10, y_10)
selected_feature_indices_10 = selector_10.get_support(indices=True)
selected_features_10 = X_10.columns[selected_feature_indices_10]

# Display name -> names of the top 10 features (identical for every model).
top__10_features_10 = {model_name_10: selected_features_10 for model_name_10 in models_10}

# Display name -> classification report built from out-of-fold predictions.
classification_reports_10 = {}

for model_name_10, model_10 in models_10.items():
    # Selection is a pipeline step so that, inside cross-validation, the top
    # features are chosen from each fold's training part only. Selecting on the
    # full data before splitting lets the held-out rows influence the choice
    # and inflates the scores.
    pipeline_10 = Pipeline(steps=[('selector', SelectKBest(score_func=f_classif, k=10)),
                                  ('classifier', model_10)])

    # Per-fold accuracy.
    cv_scores = cross_val_score(pipeline_10, X_10, y_10, cv=5, scoring='accuracy')
    print(f"Cross-validation scores for {model_name_10}: {cv_scores}")

    # Out-of-fold predictions (same cv=5 setting): every row is predicted by a
    # model that did not see it during training, so the report is not computed
    # on the training rows themselves.
    y_pred_10 = cross_val_predict(pipeline_10, X_10, y_10, cv=5)
    report_10 = classification_report(y_10, y_pred_10, output_dict=True)
    classification_reports_10[model_name_10] = report_10

    # Final model: fitted on all rows using only the top 10 features, so that
    # models_10 still holds fitted estimators after this cell.
    X_selected_10 = X_10[selected_features_10]
    model_10.fit(X_selected_10, y_10)

# ---- Export one classification report per model to an Excel workbook ----
output_path_10 = "C:/Users/ah76/Documents/Top_10_cv_T.xlsx"

# The writer is used as a context manager: the workbook is written and closed
# when the block exits, even if a sheet fails. (ExcelWriter.save() no longer
# exists in pandas 2.x, so the explicit save call cannot be used.)
with pd.ExcelWriter(output_path_10, engine='xlsxwriter') as excel_writer_10:
    for model_name_10, report_10 in classification_reports_10.items():
        # Transpose so each class / summary entry is a row and each metric a column.
        report_df_10 = pd.DataFrame(report_10).transpose()
        # One sheet per model, named after the model.
        report_df_10.to_excel(excel_writer_10, sheet_name=model_name_10)


In [None]:
                #### Top 10 Hyperparameter Tuning Code ####
    
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV  
from sklearn.metrics import classification_report

# ---- Random Forest hyperparameter tuning with grid search ----
# Candidate values; every combination is evaluated (3 x 3 x 3 = 27 settings).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
# NOTE(review): X_10 here is the full encoded feature matrix built in the
# top-10 cell, not a 10-column subset - confirm whether tuning was meant to run
# on the selected top 10 features only.
X_train, X_test, y_train, y_test = train_test_split(X_10, y_10, test_size=0.2, random_state=42)

# Base estimator; the grid search clones it for every candidate setting.
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search scored by accuracy, run on the training split only.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best setting found by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True the search has already refitted the best setting
# on the whole training split, so that estimator is reused instead of training
# an identical forest (same seed, same parameters, same data) a second time.
best_rf_model = grid_search.best_estimator_

# Evaluate on the held-out test split.
y_pred = best_rf_model.predict(X_test)

# Text report for the console.
tuned_rf_report = classification_report(y_test, y_pred)
print("Classification Report for Tuned Random Forest:")
print(tuned_rf_report)

# Dictionary report, transposed so each class / summary entry is a row.
tuned_rf_report_dict = classification_report(y_test, y_pred, output_dict=True)
tuned_rf_report_df = pd.DataFrame(tuned_rf_report_dict).T

# Save the report to an Excel file, keeping the row labels.
output_path = "C:/Users/ah76/Documents/retunedCVT.xlsx"
tuned_rf_report_df.to_excel(output_path, index=True)



In [None]:
#### Top 5 Features Code ####

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from category_encoders import BinaryEncoder
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# ---- Data loading and preparation (top-5-features experiment) ----
# Locations of the two CSV exports consumed by this cell.
stu_csv_path_5 = r"C:\Users\ah76\Documents\StudentData.csv"
vle_csv_path_5 = r"C:\Users\ah76\Documents\VLEData.csv"

# Both files are comma-separated and decoded as latin1.
stu_df_5 = pd.read_csv(stu_csv_path_5, sep=",", encoding="latin1")
vle_df_5 = pd.read_csv(vle_csv_path_5, sep=",", encoding="latin1")

# Restore the plain name of the VLE id column so it can be used as the join key.
vle_df_5 = vle_df_5.rename(columns={'ï»¿id_student': 'id_student'})

# Left join keeps every student row.
data_5 = pd.merge(stu_df_5, vle_df_5, how="left", on="id_student")

# Columns excluded from modelling.
columns_to_remove_5 = ['imd_band', 'Lookup2', 'Lookup', 'id_student', 'date_registration', 'date_unregistration']
data_5 = data_5.drop(columns_to_remove_5, axis=1)

# Module / presentation codes are binary-encoded.
binary_cols_5 = ['code_module_x', 'code_presentation_x']
encoder_5 = BinaryEncoder(cols=binary_cols_5)
data_encoded_5 = encoder_5.fit_transform(data_5)

# Binary target: 1 for 'Withdrawn', 0 for every other final_result value.
data_encoded_5['final_result'] = data_encoded_5['final_result'].map(
    lambda outcome: int(outcome == 'Withdrawn')
)

# Target vector (y_5) and feature matrix (X_5).
y_5 = data_encoded_5['final_result']
X_5 = data_encoded_5.drop(columns=['final_result'])

# Fill gaps in the listed numeric columns with each column's median.
median_fill_columns_5 = ['module_presentation_length_x']
X_5[median_fill_columns_5] = X_5[median_fill_columns_5].fillna(
    X_5[median_fill_columns_5].median()
)

# Expand the categorical columns into indicator columns, then replace any
# remaining missing values with 0.
categorical_cols_5 = ['gender', 'region', 'highest_education', 'age_band', 'disability', 'activity_type']
X_5 = pd.get_dummies(X_5, columns=categorical_cols_5).fillna(0)

# ---- Top-5 feature selection, cross-validated evaluation and final fit ----
# Pipeline is imported here so this cell does not rely on another cell's imports.
from sklearn.pipeline import Pipeline

# random_state is pinned on the estimators that use randomness so that the
# saved reports are reproducible between runs.
lr_model_5 = LogisticRegression(C=1.0, max_iter=10000)
rf_model_5 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42)
dt_model_5 = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42)
gb_model_5 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Display name -> estimator. The names are later used as Excel sheet names.
models_5 = {
    'Logistic Regression': lr_model_5,
    'Random Forest': rf_model_5,
    'Decision Tree': dt_model_5,
    'Gradient Boosting': gb_model_5,
}

# The univariate selection (ANOVA F-test, k=5) does not depend on the model,
# so the full-data selection is computed once rather than once per model.
# These names are used for the final fit below and recorded per model.
selector_5 = SelectKBest(score_func=f_classif, k=5)
selector_5.fit(X_5, y_5)
selected_feature_indices_5 = selector_5.get_support(indices=True)
selected_features_5 = X_5.columns[selected_feature_indices_5]

# Display name -> names of the top 5 features (identical for every model).
top_5_features_5 = {model_name_5: selected_features_5 for model_name_5 in models_5}

# Display name -> classification report built from out-of-fold predictions.
classification_reports_5 = {}

for model_name_5, model_5 in models_5.items():
    # Selection is a pipeline step so that, inside cross-validation, the top
    # features are chosen from each fold's training part only. Selecting on the
    # full data before splitting lets the held-out rows influence the choice
    # and inflates the scores.
    pipeline_5 = Pipeline(steps=[('selector', SelectKBest(score_func=f_classif, k=5)),
                                 ('classifier', model_5)])

    # Per-fold accuracy.
    cv_scores = cross_val_score(pipeline_5, X_5, y_5, cv=5, scoring='accuracy')
    print(f"Cross-validation scores for {model_name_5}: {cv_scores}")

    # Out-of-fold predictions (same cv=5 setting): every row is predicted by a
    # model that did not see it during training, so the report is not computed
    # on the training rows themselves.
    y_pred_5 = cross_val_predict(pipeline_5, X_5, y_5, cv=5)
    report_5 = classification_report(y_5, y_pred_5, output_dict=True)
    classification_reports_5[model_name_5] = report_5

    # Final model: fitted on all rows using only the top 5 features, so that
    # models_5 still holds fitted estimators after this cell.
    X_selected_5 = X_5[selected_features_5]
    model_5.fit(X_selected_5, y_5)

# ---- Export one classification report per model to an Excel workbook ----
output_path_5 = "C:/Users/ah76/Documents/Top_5_cv_T.xlsx"

# The writer is used as a context manager: the workbook is written and closed
# when the block exits, even if a sheet fails. (ExcelWriter.save() no longer
# exists in pandas 2.x, so the explicit save call cannot be used.)
with pd.ExcelWriter(output_path_5, engine='xlsxwriter') as excel_writer_5:
    for model_name_5, report_5 in classification_reports_5.items():
        # Transpose so each class / summary entry is a row and each metric a column.
        report_df_5 = pd.DataFrame(report_5).transpose()
        # One sheet per model, named after the model.
        report_df_5.to_excel(excel_writer_5, sheet_name=model_name_5)


In [None]:
#### Top 15 Features Code ####

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from category_encoders import BinaryEncoder
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# ---- Data loading and preparation (top-15-features experiment) ----
# Locations of the two CSV exports consumed by this cell.
stu_csv_path_15 = r"C:\Users\ah76\Documents\StudentData.csv"
vle_csv_path_15 = r"C:\Users\ah76\Documents\VLEData.csv"

# Both files are comma-separated and decoded as latin1.
stu_df_15 = pd.read_csv(stu_csv_path_15, sep=",", encoding="latin1")
vle_df_15 = pd.read_csv(vle_csv_path_15, sep=",", encoding="latin1")

# Restore the plain name of the VLE id column so it can be used as the join key.
vle_df_15 = vle_df_15.rename(columns={'ï»¿id_student': 'id_student'})

# Left join keeps every student row.
data_15 = pd.merge(stu_df_15, vle_df_15, how="left", on="id_student")

# Columns excluded from modelling.
columns_to_remove_15 = ['imd_band', 'Lookup2', 'Lookup', 'id_student', 'date_registration', 'date_unregistration']
data_15 = data_15.drop(columns_to_remove_15, axis=1)

# Module / presentation codes are binary-encoded.
binary_cols_15 = ['code_module_x', 'code_presentation_x']
encoder_15 = BinaryEncoder(cols=binary_cols_15)
data_encoded_15 = encoder_15.fit_transform(data_15)

# Binary target: 1 for 'Withdrawn', 0 for every other final_result value.
data_encoded_15['final_result'] = data_encoded_15['final_result'].map(
    lambda outcome: int(outcome == 'Withdrawn')
)

# Target vector (y_15) and feature matrix (X_15).
y_15 = data_encoded_15['final_result']
X_15 = data_encoded_15.drop(columns=['final_result'])

# Fill gaps in the listed numeric columns with each column's median.
median_fill_columns_15 = ['module_presentation_length_x']
X_15[median_fill_columns_15] = X_15[median_fill_columns_15].fillna(
    X_15[median_fill_columns_15].median()
)

# Expand the categorical columns into indicator columns, then replace any
# remaining missing values with 0.
categorical_cols_15 = ['gender', 'region', 'highest_education', 'age_band', 'disability', 'activity_type']
X_15 = pd.get_dummies(X_15, columns=categorical_cols_15).fillna(0)

# ---- Top-15 feature selection, cross-validated evaluation and final fit ----
# Pipeline is imported here so this cell does not rely on another cell's imports.
from sklearn.pipeline import Pipeline

# random_state is pinned on the estimators that use randomness so that the
# saved reports are reproducible between runs.
lr_model_15 = LogisticRegression(C=1.0, max_iter=10000)
rf_model_15 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42)
dt_model_15 = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=42)
gb_model_15 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Display name -> estimator. The names are later used as Excel sheet names.
models_15 = {
    'Logistic Regression': lr_model_15,
    'Random Forest': rf_model_15,
    'Decision Tree': dt_model_15,
    'Gradient Boosting': gb_model_15,
}

# The univariate selection (ANOVA F-test, k=15) does not depend on the model,
# so the full-data selection is computed once rather than once per model.
# These names are used for the final fit below and recorded per model.
selector_15 = SelectKBest(score_func=f_classif, k=15)
selector_15.fit(X_15, y_15)
selected_feature_indices_15 = selector_15.get_support(indices=True)
selected_features_15 = X_15.columns[selected_feature_indices_15]

# Display name -> names of the top 15 features (identical for every model).
top_15_features_15 = {model_name_15: selected_features_15 for model_name_15 in models_15}

# Display name -> classification report built from out-of-fold predictions.
classification_reports_15 = {}

for model_name_15, model_15 in models_15.items():
    # Selection is a pipeline step so that, inside cross-validation, the top
    # features are chosen from each fold's training part only. Selecting on the
    # full data before splitting lets the held-out rows influence the choice
    # and inflates the scores.
    pipeline_15 = Pipeline(steps=[('selector', SelectKBest(score_func=f_classif, k=15)),
                                  ('classifier', model_15)])

    # Per-fold accuracy.
    cv_scores = cross_val_score(pipeline_15, X_15, y_15, cv=5, scoring='accuracy')
    print(f"Cross-validation scores for {model_name_15}: {cv_scores}")

    # Out-of-fold predictions (same cv=5 setting): every row is predicted by a
    # model that did not see it during training, so the report is not computed
    # on the training rows themselves.
    y_pred_15 = cross_val_predict(pipeline_15, X_15, y_15, cv=5)
    report_15 = classification_report(y_15, y_pred_15, output_dict=True)
    classification_reports_15[model_name_15] = report_15

    # Final model: fitted on all rows using only the top 15 features, so that
    # models_15 still holds fitted estimators after this cell.
    X_selected_15 = X_15[selected_features_15]
    model_15.fit(X_selected_15, y_15)

# ---- Export one classification report per model to an Excel workbook ----
output_path_15 = "C:/Users/ah76/Documents/Top_15_cv_T.xlsx"

# The writer is used as a context manager: the workbook is written and closed
# when the block exits, even if a sheet fails. (ExcelWriter.save() no longer
# exists in pandas 2.x, so the explicit save call cannot be used.)
with pd.ExcelWriter(output_path_15, engine='xlsxwriter') as excel_writer_15:
    for model_name_15, report_15 in classification_reports_15.items():
        # Transpose so each class / summary entry is a row and each metric a column.
        report_df_15 = pd.DataFrame(report_15).transpose()
        # One sheet per model, named after the model.
        report_df_15.to_excel(excel_writer_15, sheet_name=model_name_15)
