In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingRegressor,AdaBoostClassifier,AdaBoostRegressor
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from xgboost import XGBClassifier,XGBRFRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

import tensorflow
from tensorflow import keras
from keras.layers import Dense,BatchNormalization,Dropout
from keras import Sequential

In [3]:
dataset = pd.read_csv(r"k-12.csv")

In [4]:
dataset.columns

Index(['Name', 'Age', 'Gender', 'Country', 'State', 'City',
       'Parental_Education_Level', 'Earning Class', 'Level', 'Course Name',
       'Course Level', 'Material Level', 'Previous_Scores', 'Assesment Score',
       'IQ', 'Attendance', 'Material Name', 'recommended_material',
       'Study Time', 'promoted'],
      dtype='object')

In [5]:
dataset = dataset.drop(columns=['Name','Country','State','City','Course Name','promoted','Material Name','recommended_material'])

In [6]:
dataset.sample()

Unnamed: 0,Age,Gender,Parental_Education_Level,Earning Class,Level,Course Level,Material Level,Previous_Scores,Assesment Score,IQ,Attendance,Study Time
1890,12,Male,High School,Low,Middle School,Medium,Medium,60,67,87.3,88,4.02


In [7]:
dataset['Course Level'].value_counts()

Course Level
Medium    3872
High      1920
Low        651
Name: count, dtype: int64

In [8]:
x = dataset.drop(columns=['Assesment Score'])
y = dataset['Assesment Score']

In [9]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [10]:
x_train

Unnamed: 0,Age,Gender,Parental_Education_Level,Earning Class,Level,Course Level,Material Level,Previous_Scores,IQ,Attendance,Study Time
2596,6,Female,Postgraduate,Low,Kindergarten,Medium,Medium,89,102.15,86,4.14
2337,9,Female,High School,High,Elementary,High,Medium,89,99.45,69,3.96
4883,16,Female,College,Low,High School,Medium,Low,52,84.60,96,4.08
6300,13,Female,College,High,Middle School,Medium,High,75,95.85,93,4.14
151,14,Male,High School,Low,High School,Medium,High,85,99.45,92,4.08
...,...,...,...,...,...,...,...,...,...,...,...
3772,13,Male,Postgraduate,Low,Middle School,Medium,High,61,92.25,91,4.32
5191,12,Female,Postgraduate,Low,Middle School,Medium,Medium,53,86.85,91,4.20
5226,8,Female,High School,Medium,Elementary,High,Medium,75,90.45,63,3.78
5390,6,Male,College,Medium,Kindergarten,Low,High,78,98.10,97,4.20


In [11]:
# Define the ColumnTransformer
transformer = ColumnTransformer(
    transformers=[
        ('tf1', OneHotEncoder(drop='first', sparse_output=False), ['Gender']),
        ('tf2', OrdinalEncoder(categories=[
            ['High School', 'College', 'Postgraduate'],  # Parental Education Level
            ['Low', 'Medium', 'High'],  # Earning Class
            ['Kindergarten', 'Elementary', 'Middle School', 'High School'],  # Level
            ['Low', 'Medium', 'High'],  # Course Level
            ['Low', 'Medium', 'High']  # Material Level
        ]), ['Parental_Education_Level', 'Earning Class', 'Level', 'Course Level', 'Material Level'])
    ],
    remainder='passthrough'  # Keep other columns as is
)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('scaler', StandardScaler()),  
    ('Regressor', RandomForestRegressor(n_estimators=100, random_state=42)) 
])



In [12]:
pipeline.fit(x_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
y_pred = pipeline.predict(x_test)
r2_score(y_test,y_pred)

0.9999447572760779

In [14]:
mean_squared_error(y_test,y_pred)

0.0008088440651667923

In [15]:
# Predict Assesment score of student
input_data = pd.DataFrame([[
    6, 'Female', 'Postgraduate', 'Low', 'Kindergarten', 
    'Medium', 'Medium', 89, 102.15, 86, 4.10
]], columns=x_train.columns)

# Make prediction
prediction = pipeline.predict(input_data)
print(f" Assesment score of the student will be {prediction[0]}")

 Assesment score of the student will be 68.0


In [None]:
# import pickle
# pickle.dump(pipeline,open('Assesment score.pkl','wb'))