In [15]:
# Loading Data and checking the data sets

from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

# Read the training CSV; the `id` column becomes the DataFrame index.
train_data = pd.read_csv(
    "../data/raw/train.csv",
    index_col="id",
)

# Show the first five rows (the default for `head`) as the cell output.
train_data.head()



Unnamed: 0_level_0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [11]:
# Separate the regression target from the features and build column groups.
y = train_data["exam_score"]
X = train_data.drop(columns="exam_score")

# Non-numeric (categorical) columns.
# `select_dtypes` replaces the literal comparison against 'object' so that
# string columns stored under a non-object dtype are still picked up.
cat_col = X.select_dtypes(exclude="number").columns.tolist()

# Numeric columns.
# `select_dtypes` replaces the literal comparison against 'int64'/'float64'
# so that other numeric widths (e.g. int32, float32) are not silently dropped
# from the numeric preprocessing branch.
num_col = X.select_dtypes(include="number").columns.tolist()

# Hand-assigned encoding groups for the categorical features: `ordinal_col`
# is ordinal-encoded and `nominal_col` is one-hot encoded by the
# preprocessing cell.
# NOTE(review): `sleep_quality` shows the values poor/average/good in the data
# preview, which look ordered, yet it is listed as nominal — confirm intent.
ordinal_col = ["internet_access", "facility_rating", "exam_difficulty"]
nominal_col = ["gender", "course", "sleep_quality", "study_method"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [12]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Preprocessing: one sub-pipeline per column group, combined column-wise.

# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputing_num", SimpleImputer(strategy="median")),
    ("Scaling_values", StandardScaler()),
])

# Ordered categoricals: fill gaps with the most frequent value, then encode
# each level by its position in the explicit category list. The lists are
# positional and must stay aligned with `ordinal_col`
# (internet_access, facility_rating, exam_difficulty).
ordinal_pipeline = Pipeline([
    ("impute_ordinal", SimpleImputer(strategy="most_frequent")),
    ("encoding_ordinal", OrdinalEncoder(categories=[
        ["no", "yes"],
        ["low", "medium", "high"],
        ["easy", "moderate", "hard"],
    ])),
])

# Unordered categoricals: fill gaps with the most frequent value, then one-hot
# encode while dropping the first level of each feature.
# NOTE(review): with `drop="first"` and `handle_unknown="ignore"` together, an
# unseen category and the dropped first category may both encode as all
# zeros — confirm that is acceptable.
nominal_pipeline = Pipeline([
    ("impute_nominal", SimpleImputer(strategy="most_frequent")),
    ("encoding_nominal", OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Route each column group through its own sub-pipeline.
preprocess_trf = ColumnTransformer([
    ("numerical_process", num_pipeline, num_col),
    ("ordinal_process", ordinal_pipeline, ordinal_col),
    ("nominal_process", nominal_pipeline, nominal_col),
])




In [13]:
# Ridge regression baseline

# Ridge estimator with regularisation strength alpha = 0.05.
ridge_reg = Ridge(alpha=0.05)

# Bundle preprocessing and the estimator so both are fitted on the same rows.
ridge_pipe = Pipeline([
    ("trf", preprocess_trf),
    ("ridge_model", ridge_reg),
])

# Fit on the training split, then predict the held-out validation rows.
# `fit` returns the pipeline itself, so the two calls can be chained.
pred = ridge_pipe.fit(X_train, y_train).predict(X_valid)

In [16]:
# Root-mean-squared error of the validation predictions.
rse = root_mean_squared_error(y_true=y_valid, y_pred=pred)

print(rse)

8.899652508963147


In [18]:
# Load the test set, using the `id` column as the row index.
test = pd.read_csv("../data/raw/test.csv", index_col="id")

# Refit the same pipeline on every labelled row (X, y) before predicting.
# This replaces the fit made on the training split in the earlier cell.
final_pred = ridge_pipe.fit(X, y).predict(test)

# Submission frame: one row per test id with its predicted exam score.
df = pd.DataFrame(
    {
        "id": test.index,
        "exam_score": final_pred,
    }
)

# Preview the first five rows (the default for `head`).
df.head()


Unnamed: 0,id,exam_score
0,630000,71.795324
1,630001,69.495807
2,630002,87.401828
3,630003,54.891568
4,630004,47.320667


In [21]:
# Write the submission CSV without the DataFrame index.
# The output folder is created first so the cell also works when the folder
# does not exist yet (e.g. on a fresh checkout); `to_csv` does not create
# missing directories itself.
# NOTE(review): "sumissions" looks like a typo for "submissions"; the path is
# left unchanged in case the directory really has that name — confirm.
submission_path = Path("../sumissions/ridge_base.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(submission_path, index=False)