In [1]:
# Install necessary packages
!pip install pandas scikit-learn joblib




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set display options for pandas.
# The full option name is used on purpose: pandas resolves partial names by
# pattern matching, which can become ambiguous if a similarly named option is
# ever added.
pd.set_option('display.max_columns', 50)

# Load the persisted model and scaler.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
best_rf_model = joblib.load('models/random_forest_model.pkl')
scaler = joblib.load('models/scaler.pkl')

# Load the new test data
test_data_path = 'dataset/test_data.csv'
data = pd.read_csv(test_data_path)

# Feature columns to keep (intended to be the same columns, in the same order,
# as the training data).
columns_to_keep = [
    'Application mode', 'Application order', 'Course',
    'Previous qualification (grade)', "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation", 'Admission grade', 'Debtor',
    'Tuition fees up to date', 'Scholarship holder', 'Age at enrollment',
    'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)',
    'Unemployment rate', 'Inflation rate', 'GDP'
]

# Keep only the specified columns. The explicit .copy() yields an independent
# frame, so the column assignments made in later cells do not trigger
# SettingWithCopyWarning or depend on view-vs-copy behaviour.
data = data[columns_to_keep].copy()

In [3]:
# Step 1: Integer-encode the columns that are treated as categorical.
# NOTE(review): every encoder below is fitted on this inference data, not on
# the training data. The resulting integer codes therefore only match the
# training-time codes if both datasets contain the same set of values in each
# column -- confirm against the training notebook, or persist and reuse the
# encoders that were fitted there.
categorical_columns = [
    'Application mode',
    'Course',
    'Previous qualification (grade)',
    'Mother\'s qualification',
    'Father\'s qualification',
    'Mother\'s occupation',
    'Father\'s occupation',
    'Debtor',
    'Tuition fees up to date',
    'Scholarship holder',
]

# One fresh LabelEncoder per column, kept in a dict so it can be reused later.
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    # Values are cast to str before fitting, so each column is encoded from
    # its string representation.
    data[column] = encoder.fit_transform(data[column].astype(str))

In [4]:
# Step 2: Scale the feature matrix with the scaler loaded from disk
# (expected to be the one fitted during model training).
# The feature columns are selected explicitly so the scaler always receives
# exactly `columns_to_keep`, in that order. This also keeps the cell
# re-runnable after the string 'Prediction' column has been added to `data`
# further down, which would otherwise make the transform fail.
data_scaled = scaler.transform(data[columns_to_keep])

In [5]:
# Step 3: Make predictions using the trained model
# `predictions` holds the model's output for the scaled feature matrix; the
# values are translated into readable labels in the following step.
predictions = best_rf_model.predict(data_scaled)

In [6]:
# Step 4: Map predicted class codes to readable labels
# (assuming 0 = dropout, 1 = graduate -- NOTE(review): confirm this matches the
# label encoding used when the model was trained).
prediction_labels = ['Dropout', 'Graduate']
label_by_code = dict(enumerate(prediction_labels))

# Fail loudly on any class code that has no label. Plain list indexing would
# raise an uninformative IndexError for codes beyond the list, and would
# silently wrap a negative code around to the last label.
unexpected_codes = set(predictions).difference(label_by_code)
if unexpected_codes:
    raise ValueError(
        "Model returned class codes with no label mapping: "
        f"{sorted(unexpected_codes, key=str)}"
    )

predicted_outcomes = [label_by_code[pred] for pred in predictions]

# Add predictions to the data
data['Prediction'] = predicted_outcomes

In [7]:
# Persist the feature frame together with its 'Prediction' column as CSV,
# leaving the index out of the file.
# Note that the categorical columns were overwritten with their integer codes
# in Step 1, so the file contains those codes rather than the raw values.
output_path = 'dataset/prediction.csv'
data.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file was written.
print(f"Predictions saved to {output_path}")

Predictions saved to dataset/prediction.csv


In [8]:
# Report how many rows were predicted as each outcome.
prediction_counts = data['Prediction'].value_counts()

# .get(..., 0) covers the case where a label never occurs in the predictions.
graduate_total = prediction_counts.get('Graduate', 0)
dropout_total = prediction_counts.get('Dropout', 0)

print(f"Total Graduate: {graduate_total}")
print(f"Total Dropout: {dropout_total}")

Total Graduate: 2730
Total Dropout: 1694


In [9]:
# Summary statistics for the numeric columns. The string-valued 'Prediction'
# column is not part of the output because describe() summarises only numeric
# columns by default on a mixed-type frame. The categorical columns appear as
# the integer codes assigned in Step 1, not as their raw values.
data.describe()

Unnamed: 0,Application mode,Application order,Course,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Debtor,Tuition fees up to date,Scholarship holder,Age at enrollment,Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Unemployment rate,Inflation rate,GDP
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,4.992315,1.727848,8.8533,38.951175,9.83906,13.466546,24.516501,35.336347,126.978119,0.113698,0.880651,0.248418,23.265145,6.27057,8.299051,4.7066,10.640822,6.232143,8.063291,4.435805,10.230206,11.566139,1.228029,0.001969
std,4.981624,1.313793,4.419649,17.354827,7.260213,9.353207,6.692848,13.063382,14.482001,0.31748,0.324235,0.432144,7.587816,2.480178,4.179106,3.094238,4.843663,2.195951,3.947951,3.014764,5.210808,2.66385,1.382711,2.269935
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,0.0,1.0,5.0,29.0,3.0,7.0,24.0,37.0,117.9,0.0,1.0,0.0,19.0,5.0,6.0,3.0,11.0,5.0,6.0,2.0,10.75,9.4,0.3,-1.7
50%,4.0,1.0,9.0,39.0,12.0,15.0,25.0,40.0,126.1,0.0,1.0,0.0,20.0,6.0,8.0,5.0,12.285714,6.0,8.0,5.0,12.2,11.1,1.4,0.32
75%,9.0,2.0,12.0,50.0,17.0,22.0,29.0,42.0,134.8,0.0,1.0,0.0,25.0,7.0,10.0,6.0,13.4,7.0,10.0,6.0,13.333333,13.9,2.6,1.79
max,17.0,9.0,16.0,100.0,28.0,33.0,31.0,45.0,190.0,1.0,1.0,1.0,70.0,26.0,45.0,26.0,18.875,23.0,33.0,20.0,18.571429,16.2,3.7,3.51
