In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [17]:
# Read the raw data; the file uses ';' as its field separator
df = pd.read_csv("data.csv", sep=";")

# Preview the top rows to check that the columns were parsed as expected.
# NOTE(review): the rendered header for 'Daytime/evening attendance' appears to
# carry a trailing tab character - keep that in mind when selecting it by name.
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [23]:
# Keep only the rows labelled "Dropout" or "Graduate"; rows with any other
# 'Target' value are excluded from the analysis
has_final_outcome = df['Target'].isin(['Dropout', 'Graduate'])
filtered_df = df[has_final_outcome]

# Show the result of the filter
filtered_df

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [24]:
# Sanity check: after filtering, 'Target' should contain exactly the two kept classes
filtered_df['Target'].unique()

array(['Dropout', 'Graduate'], dtype=object)

In [28]:
# Columns retained for the analysis; 'Target' is the label, the rest are used
# as model inputs further down
columns_of_interest = [
    "Marital status",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Target",
]

# Select all rows and just the columns listed above
extracted_df = filtered_df.loc[:, columns_of_interest]

# Preview the narrowed frame to confirm the selection
extracted_df.head()

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Target
0,1,19,12,5,9,Dropout
1,1,1,3,3,3,Graduate
2,1,37,37,9,9,Dropout
3,1,38,37,5,3,Graduate
4,2,37,38,9,9,Graduate


In [31]:
# Column dtypes and non-null counts for the selected columns
# (a quick check for missing values before modelling)

extracted_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3630 entries, 0 to 4423
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Marital status          3630 non-null   int64 
 1   Mother's qualification  3630 non-null   int64 
 2   Father's qualification  3630 non-null   int64 
 3   Mother's occupation     3630 non-null   int64 
 4   Father's occupation     3630 non-null   int64 
 5   Target                  3630 non-null   object
dtypes: int64(5), object(1)
memory usage: 198.5+ KB


In [34]:
# Summary statistics for the numeric columns.
# NOTE(review): this notebook treats these integer columns as category codes, so
# min/max and the quartiles show which codes occur; mean/std are less informative.

extracted_df.describe()

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation
count,3630.0,3630.0,3630.0,3630.0,3630.0
mean,1.184298,19.986226,22.571625,10.138567,10.28292
std,0.613009,15.585278,15.275453,23.315697,22.40269
min,1.0,1.0,1.0,0.0,0.0
25%,1.0,2.0,3.0,4.0,4.0
50%,1.0,19.0,19.0,5.0,7.0
75%,1.0,37.0,37.0,9.0,9.0
max,6.0,44.0,44.0,194.0,195.0


In [35]:
# Turn the string labels in 'Target' into integer class codes
encoder = LabelEncoder()
target_codes = encoder.fit_transform(extracted_df['Target'])

# assign() returns a new frame, so extracted_df itself is left unchanged.
# The remaining columns are already integer-coded categories, so no further
# encoding is applied; the final cast only guarantees a uniform integer dtype.
encoded_df = extracted_df.assign(Target=target_codes).astype(int)

# Preview the encoded frame to confirm that 'Target' is now numeric
encoded_df.head()

Unnamed: 0,Marital status,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Target
0,1,19,12,5,9,0
1,1,1,3,3,3,1
2,1,37,37,9,9,0
3,1,38,37,5,3,1
4,2,37,38,9,9,1


In [36]:
# Hold out part of the data for evaluation

# Label (y) and predictors (X: every column except the label)
y = encoded_df['Target']
X = encoded_df.drop(columns=['Target'])

# 70% train / 30% test; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Report the four shapes to confirm the split sizes
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2541, 5), (1089, 5), (2541,), (1089,))

In [37]:
# Fit a Random Forest on the training split and predict the held-out rows

# fit() returns the estimator itself, so construction and training can be chained;
# the fixed seed keeps the forest reproducible
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class codes for the test split
y_pred = rf_model.predict(X_test)

# Peek at the first ten predictions as a quick smoke test
y_pred[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1])

In [41]:
# Evaluating the model's performance

# Overall accuracy on the held-out test set. It is computed in this cell, right
# next to the report that formats it, so the cell does not depend on a variable
# left over from another (possibly deleted) cell; without this line a fresh
# "Restart & Run All" raises NameError on `accuracy`.
accuracy = accuracy_score(y_test, y_pred)

# Classification report: per-class precision, recall, f1-score, and support.
# The display names follow the integer codes produced when 'Target' was
# label-encoded earlier in the notebook (0 = Dropout, 1 = Graduate).
classification_rep = classification_report(y_test, y_pred, target_names=['Dropout', 'Graduate'])

# Creating a formatted output
output = f"""
### Predictors:
- Marital Status
- Mother's Qualification
- Father's Qualification
- Mother's Occupation
- Father's Occupation

### Classification Report:
{classification_rep}

### Accuracy:
- The model's accuracy is {accuracy*100:.2f}%. This means that the model correctly predicted whether students graduated or dropped out in {accuracy*100:.2f}% of the cases in the testing set.

### Analysis:
- Precision for Dropout: Indicates the proportion of students predicted to dropout that actually dropped out.
- Recall for Dropout: Indicates the proportion of actual dropouts that were correctly identified by the model.
- Precision for Graduate: Indicates the proportion of students predicted to graduate that actually graduated.
- Recall for Graduate: Indicates the proportion of actual graduates that were correctly identified by the model.
"""

# Printing the formatted output
print(output)


### Predictors:
- Marital Status
- Mother's Qualification
- Father's Qualification
- Mother's Occupation
- Father's Occupation

### Classification Report:
              precision    recall  f1-score   support

     Dropout       0.48      0.37      0.42       414
    Graduate       0.66      0.75      0.70       675

    accuracy                           0.61      1089
   macro avg       0.57      0.56      0.56      1089
weighted avg       0.59      0.61      0.59      1089


### Accuracy:
- The model's accuracy is 60.61%. This means that the model correctly predicted whether students graduated or dropped out in 60.61% of the cases in the testing set.

### Analysis:
- Precision for Dropout: Indicates the proportion of students predicted to dropout that actually dropped out.
- Recall for Dropout: Indicates the proportion of actual dropouts that were correctly identified by the model.
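- Precision for Graduate: Indicates the proportion of students predicted to graduate that actually graduated.
- Recall for Graduate: Indicates the proportion of actual graduates that were correctly identified by the model.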

In [44]:
# Rank the predictors by the importance the fitted Random Forest assigns to them

# One importance score per input column, in the same order as X.columns
importances = rf_model.feature_importances_

# Build the table with scores already expressed as percentages, then order it
# from most to least important
features_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance (%)': importances * 100}
).sort_values(by='Importance (%)', ascending=False)

# Show the ranked table
features_importance

Unnamed: 0,Feature,Importance (%)
4,Father's occupation,29.361396
3,Mother's occupation,26.57303
2,Father's qualification,19.652906
1,Mother's qualification,19.441523
0,Marital status,4.971144


### Analysis:

- **Father's Occupation** and **Mother's Occupation** have the highest importance scores, indicating that they are the most influential of these predictors in determining whether a student will graduate or drop out.
- **Marital Status** has the least importance among the features.