In [1]:
#import libraries
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

In [2]:
#importing and resding data
ds=pd.read_csv('dropout_risk_cleaned.csv')
ds=ds.loc[:,~ds.columns.str.contains('^unnamed')]
ds

Unnamed: 0.1,Unnamed: 0,current_gpa,attendance_rate,study_hours_per_week,financial_stress,mental_health_score,part_time_job,parental_support,dropout_risk
0,0,3.85,81.6,15.9,6,6,1,low,0.39
1,1,4.00,58.1,18.5,5,5,1,high,0.33
2,2,2.45,78.7,18.0,6,2,0,high,0.39
3,3,1.59,66.3,27.2,1,7,0,high,0.30
4,4,3.23,64.9,26.7,9,10,0,medium,0.37
...,...,...,...,...,...,...,...,...,...
495,495,2.69,87.5,2.1,9,2,1,medium,0.51
496,496,3.38,41.8,15.9,10,5,0,low,0.56
497,497,3.14,82.3,4.0,10,8,0,medium,0.38
498,498,3.67,62.7,10.4,2,9,0,medium,0.22


In [3]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            500 non-null    int64  
 1   current_gpa           500 non-null    float64
 2   attendance_rate       500 non-null    float64
 3   study_hours_per_week  500 non-null    float64
 4   financial_stress      500 non-null    int64  
 5   mental_health_score   500 non-null    int64  
 6   part_time_job         500 non-null    int64  
 7   parental_support      500 non-null    object 
 8   dropout_risk          500 non-null    float64
dtypes: float64(4), int64(4), object(1)
memory usage: 35.3+ KB


In [4]:
#splitting data into independent and dependent variables

x=ds[['current_gpa', 'attendance_rate', 'study_hours_per_week','financial_stress', 'mental_health_score', 'part_time_job',
       'parental_support']]
y=ds[['dropout_risk']]

In [5]:
x

Unnamed: 0,current_gpa,attendance_rate,study_hours_per_week,financial_stress,mental_health_score,part_time_job,parental_support
0,3.85,81.6,15.9,6,6,1,low
1,4.00,58.1,18.5,5,5,1,high
2,2.45,78.7,18.0,6,2,0,high
3,1.59,66.3,27.2,1,7,0,high
4,3.23,64.9,26.7,9,10,0,medium
...,...,...,...,...,...,...,...
495,2.69,87.5,2.1,9,2,1,medium
496,3.38,41.8,15.9,10,5,0,low
497,3.14,82.3,4.0,10,8,0,medium
498,3.67,62.7,10.4,2,9,0,medium


In [6]:
y

Unnamed: 0,dropout_risk
0,0.39
1,0.33
2,0.39
3,0.30
4,0.37
...,...
495,0.51
496,0.56
497,0.38
498,0.22


In [7]:
#implement OneHotEncoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
ohe=OneHotEncoder()
ohe.fit(x[["parental_support"]])

In [8]:
#column transformer

ct=make_column_transformer((OneHotEncoder(handle_unknown='ignore',categories=ohe.categories_),["parental_support"]),remainder='passthrough',force_int_remainder_cols=False,sparse_threshold=0)
ct

In [9]:
#fitting random forest regfession model
from sklearn.ensemble import RandomForestRegressor
reg=RandomForestRegressor(n_estimators=10,random_state=0)

In [10]:
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(ct,reg)
pipe

In [11]:
#splitting dataset into training and testing


from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
scores=[]
for i in range(0,101):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.10,random_state=i)
    pipe.fit(x_train,y_train)
    result=pipe.predict(x_test)
    score=r2_score(y_test,result)
    scores.append(score)

In [12]:
#finding best value

bestindex=np.argmax(scores)
scores[bestindex]

0.9109731957702788

In [13]:
ds.columns

Index(['Unnamed: 0', 'current_gpa', 'attendance_rate', 'study_hours_per_week',
       'financial_stress', 'mental_health_score', 'part_time_job',
       'parental_support', 'dropout_risk'],
      dtype='object')

In [17]:
ds.head(5)

Unnamed: 0.1,Unnamed: 0,current_gpa,attendance_rate,study_hours_per_week,financial_stress,mental_health_score,part_time_job,parental_support,dropout_risk
0,0,3.85,81.6,15.9,6,6,1,low,0.39
1,1,4.0,58.1,18.5,5,5,1,high,0.33
2,2,2.45,78.7,18.0,6,2,0,high,0.39
3,3,1.59,66.3,27.2,1,7,0,high,0.3
4,4,3.23,64.9,26.7,9,10,0,medium,0.37


In [18]:
#for user input
current_gpa = float(input("Enter current GPA (0.0–4.0): "))
attendance_rate = float(input("Enter attendance rate (0–100): "))
study_hours_per_week = float(input("Enter study hours per week: "))
financial_stress = int(input("Enter financial stress (1–10): "))
mental_health_score = int(input("Enter mental health score (1–10): "))
part_time_job = int(input("Do you have a part-time job? (Yes=1, No=0): "))
parental_support = input("Enter parental support (low/medium/high): ").strip().lower()


columns = ['current_gpa', 'attendance_rate', 'study_hours_per_week',
           'financial_stress', 'mental_health_score', 'part_time_job',
           'parental_support']

myinput = pd.DataFrame([[current_gpa, attendance_rate, study_hours_per_week,
                         financial_stress, mental_health_score, part_time_job,
                         parental_support]], columns=columns)

result = pipe.predict(myinput)

print("Predicted dropout risk (0–1 scale):", round(result[0], 3))

Enter current GPA (0.0–4.0):  4
Enter attendance rate (0–100):  60
Enter study hours per week:  20
Enter financial stress (1–10):  6
Enter mental health score (1–10):  5
Do you have a part-time job? (Yes=1, No=0):  0
Enter parental support (low/medium/high):  medium


Predicted dropout risk (0–1 scale): 0.351


In [19]:
import pickle as pkl

In [21]:
pkl.dump(pipe,open("dropout_risk.pkl","wb"))