In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
df=pd.read_csv("data.csv",sep=";")

# Display the first few rows of the dataset to confirm its structure
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
# Filtering the dataset to include only "Dropout" and "Graduate" students
filtered_df = df[df['Target'].isin(['Dropout', 'Graduate'])]

# Display the filtered dataset
filtered_df

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [4]:
# Display the unique values in the 'Target' column to confirm we have only "Dropout" and "Graduate"
filtered_df['Target'].unique()

array(['Dropout', 'Graduate'], dtype=object)

In [5]:
# Extracting the specified columns of interest for the analysis
columns_of_interest = [
    'Marital status', 
    'Mother\'s qualification', 
    'Father\'s qualification', 
    'Mother\'s occupation', 
    'Father\'s occupation', 
    'Target'
]

extracted_df = filtered_df[columns_of_interest]

# Display the first few rows of the extracted dataset to confirm the extraction
extracted_df.head()

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Target
0,1,19,12,5,9,Dropout
1,1,1,3,3,3,Graduate
2,1,37,37,9,9,Dropout
3,1,38,37,5,3,Graduate
4,2,37,38,9,9,Graduate


In [6]:
# Show the general information of the filtered data

extracted_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3630 entries, 0 to 4423
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Marital status          3630 non-null   int64 
 1   Mother's qualification  3630 non-null   int64 
 2   Father's qualification  3630 non-null   int64 
 3   Mother's occupation     3630 non-null   int64 
 4   Father's occupation     3630 non-null   int64 
 5   Target                  3630 non-null   object
dtypes: int64(5), object(1)
memory usage: 198.5+ KB


In [7]:
# General statistics of the filtered data

extracted_df.describe()

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation
count,3630.0,3630.0,3630.0,3630.0,3630.0
mean,1.184298,19.986226,22.571625,10.138567,10.28292
std,0.613009,15.585278,15.275453,23.315697,22.40269
min,1.0,1.0,1.0,0.0,0.0
25%,1.0,2.0,3.0,4.0,4.0
50%,1.0,19.0,19.0,5.0,7.0
75%,1.0,37.0,37.0,9.0,9.0
max,6.0,44.0,44.0,194.0,195.0


In [8]:
# Encoding the categorical variables
encoder = LabelEncoder()

# Creating a copy of the extracted dataset to apply encoding
encoded_df = extracted_df.copy()

# Encoding the 'Target' column
encoded_df['Target'] = encoder.fit_transform(extracted_df['Target'])

# Since other columns are categorical but represented as integers, we don't need to encode them
# We'll just ensure they are of integer type
encoded_df = encoded_df.astype(int)

# Display the first few rows of the encoded dataset to confirm encoding
encoded_df

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Target
0,1,19,12,5,9,0
1,1,1,3,3,3,1
2,1,37,37,9,9,0
3,1,38,37,5,3,1
4,2,37,38,9,9,1
...,...,...,...,...,...,...
4419,1,1,1,5,4,1
4420,1,1,1,9,9,0
4421,1,37,37,9,9,0
4422,1,37,37,7,4,1


In [9]:
# Handling outliers using the IQR method
def handle_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Replace outliers with the median value
        df[col] = np.where((df[col] < lower_bound) | (df[col] > upper_bound), df[col].median(), df[col])
    return df

# Define the numerical columns to handle outliers
numerical_columns = ['Marital status', 'Mother\'s qualification', 'Father\'s qualification', 'Mother\'s occupation', 'Father\'s occupation']

# Handle outliers in the numerical columns
encoded_df = handle_outliers(encoded_df, numerical_columns)
encoded_df[numerical_columns] = encoded_df[numerical_columns].astype(int)

encoded_df

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Target
0,1,19,12,5,9,0
1,1,1,3,3,3,1
2,1,37,37,9,9,0
3,1,38,37,5,3,1
4,1,37,38,9,9,1
...,...,...,...,...,...,...
4419,1,1,1,5,4,1
4420,1,1,1,9,9,0
4421,1,37,37,9,9,0
4422,1,37,37,7,4,1


In [10]:
# Splitting the data into training and testing sets

# Features (X) and Target (y)
X = encoded_df.drop(columns=['Target'])
y = encoded_df['Target']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shapes of the training and testing sets to confirm the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2541, 5), (1089, 5), (2541,), (1089,))

In [43]:
# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, 
                           scoring='accuracy', cv=5, n_jobs=-1)

# Fit Grid Search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train a Random Forest model with the best hyperparameters
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train, y_train)

# Evaluate the model on the test data
test_accuracy = best_rf_model.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Test Accuracy with Best Model:", test_accuracy)

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
222 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\walla\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\walla\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\walla\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\walla\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_paramete

Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Test Accuracy with Best Model: 0.6345270890725436


In [11]:
# Training the Random Forest Classifier model

# Initializing the model
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt'
)

# Training the model
rf_model.fit(X_train, y_train)

# Predicting the target values for the testing set
y_pred = rf_model.predict(X_test)

# Display the first few predictions to confirm the model's operation
y_pred[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
# Evaluating the model's performance

# Calculating the classification report which includes precision, recall, f1-score, and support
classification_rep = classification_report(y_test, y_pred, target_names=['Dropout', 'Graduate'])

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Creating a formatted output
output = f"""
### Predictors:
- Marital Status
- Mother's Qualification
- Father's Qualification
- Mother's Occupation
- Father's Occupation

### Classification Report:
{classification_rep}

### Accuracy:
- The model's accuracy is {accuracy*100:.2f}%. This means that the model correctly predicted whether students graduated or dropped out in
{accuracy*100:.2f}% of the cases in the testing set.

### Analysis:
- Precision for Dropout: Indicates the proportion of students predicted to dropout that actually dropped out.
- Recall for Dropout: Indicates the proportion of actual dropouts that were correctly identified by the model.
- Precision for Graduate: Indicates the proportion of students predicted to graduate that actually graduated.
- Recall for Graduate: Indicates the proportion of actual graduates that were correctly identified by the model.
"""

# Printing the formatted output
print(output)


### Predictors:
- Marital Status
- Mother's Qualification
- Father's Qualification
- Mother's Occupation
- Father's Occupation

### Classification Report:
              precision    recall  f1-score   support

     Dropout       0.56      0.17      0.26       414
    Graduate       0.64      0.92      0.76       675

    accuracy                           0.63      1089
   macro avg       0.60      0.54      0.51      1089
weighted avg       0.61      0.63      0.57      1089


### Accuracy:
- The model's accuracy is 63.27%. This means that the model correctly predicted whether students graduated or dropped out in
63.27% of the cases in the testing set.

### Analysis:
- Precision for Dropout: Indicates the proportion of students predicted to dropout that actually dropped out.
- Recall for Dropout: Indicates the proportion of actual dropouts that were correctly identified by the model.
- Precision for Graduate: Indicates the proportion of students predicted to graduate that actually gr

In [13]:
# Evaluating feature importance using the trained Random Forest model

# Getting feature importances
importances = rf_model.feature_importances_

# Creating a DataFrame to display features and their importance scores
features_importance = pd.DataFrame({'Feature': X.columns, 'Importance (%)': importances})

# Converting importance scores to percentages
features_importance['Importance (%)'] = features_importance['Importance (%)'] * 100

# Sorting the features by importance
features_importance = features_importance.sort_values(by='Importance (%)', ascending=False)

# Displaying the features and their importance scores
features_importance

Unnamed: 0,Feature,Importance (%)
3,Mother's occupation,28.438085
4,Father's occupation,27.63312
1,Mother's qualification,22.863441
2,Father's qualification,21.065353
0,Marital status,0.0


### Analysis:

- **Father's Occupation** and **Mother's Occupatio**n have the highest importance, indicating that they are significant predictors in determining whether a student will graduate or drop out.
- **Marital Status** has the least importance among the features.

In [47]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Create GridSearchCV
logistic_regression_grid_search = GridSearchCV(
    logistic_regression,
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',  # Use accuracy as the evaluation metric
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

# Fit the model
logistic_regression_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and test accuracy
best_logistic_regression = logistic_regression_grid_search.best_estimator_
test_accuracy_lr = best_logistic_regression.score(X_test, y_test)

print("Best Logistic Regression Hyperparameters:", logistic_regression_grid_search.best_params_)
print("Test Accuracy with Best Logistic Regression Model:", test_accuracy_lr)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Logistic Regression Hyperparameters: {'C': 0.01}
Test Accuracy with Best Logistic Regression Model: 0.6189164370982553


In [48]:
from sklearn.svm import SVC

# Create an SVM classifier
svm = SVC(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}

# Create GridSearchCV
svm_grid_search = GridSearchCV(
    svm,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the model
svm_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and test accuracy
best_svm = svm_grid_search.best_estimator_
test_accuracy_svm = best_svm.score(X_test, y_test)

print("Best SVM Hyperparameters:", svm_grid_search.best_params_)
print("Test Accuracy with Best SVM Model:", test_accuracy_svm)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best SVM Hyperparameters: {'C': 10, 'kernel': 'rbf'}
Test Accuracy with Best SVM Model: 0.6391184573002755


In [15]:
import xgboost as xgb

# Create an XGBoost classifier
xgboost = xgb.XGBClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
}

# Create GridSearchCV
xgboost_grid_search = GridSearchCV(
    xgboost,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the model
xgboost_grid_search.fit(X_train, y_train)

# Get the best hyperparameters and test accuracy
best_xgboost = xgboost_grid_search.best_estimator_
test_accuracy_xgboost = best_xgboost.score(X_test, y_test)

print("Best XGBoost Hyperparameters:", xgboost_grid_search.best_params_)
print("Test Accuracy with Best XGBoost Model:", test_accuracy_xgboost)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best XGBoost Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
Test Accuracy with Best XGBoost Model: 0.6326905417814509


In [16]:
from sklearn.neural_network import MLPClassifier

# Initialize and train the MLPClassifier
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(100, ),  # Single hidden layer with 100 neurons
    activation='relu',           # Rectified Linear Unit activation function
    solver='adam',               # Optimization algorithm
    alpha=0.0001,                # L2 regularization parameter
    random_state=42,
    max_iter=200,                # Maximum number of iterations
    verbose=True                 # Enable verbose mode for training progress
)

mlp_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
accuracy

Iteration 1, loss = 2.17623988
Iteration 2, loss = 1.00082169
Iteration 3, loss = 0.80110599
Iteration 4, loss = 0.72584200
Iteration 5, loss = 0.70314882
Iteration 6, loss = 0.69132793
Iteration 7, loss = 0.68481565
Iteration 8, loss = 0.67945200
Iteration 9, loss = 0.67596298
Iteration 10, loss = 0.67288908
Iteration 11, loss = 0.66955906
Iteration 12, loss = 0.66725267
Iteration 13, loss = 0.66454043
Iteration 14, loss = 0.66368330
Iteration 15, loss = 0.66540374
Iteration 16, loss = 0.66162509
Iteration 17, loss = 0.66066285
Iteration 18, loss = 0.66058725
Iteration 19, loss = 0.65971218
Iteration 20, loss = 0.66042760
Iteration 21, loss = 0.65989603
Iteration 22, loss = 0.65971396
Iteration 23, loss = 0.65862284
Iteration 24, loss = 0.66023100
Iteration 25, loss = 0.65808554
Iteration 26, loss = 0.65807314
Iteration 27, loss = 0.65773705
Iteration 28, loss = 0.65780088
Iteration 29, loss = 0.65829257
Iteration 30, loss = 0.65873286
Iteration 31, loss = 0.65756287
Iteration 32, los

0.6170798898071626