## Student Annual Percentage Predictor

In [1]:
import pandas as pd

# Load the preprocessed marks and discard the CLASS column before modelling.
data = pd.read_csv("final_processed_data.csv").drop(columns=["CLASS"])
# Preview goes last: a notebook cell only renders its final expression, so a
# head() call in the middle of the cell would be silently discarded.
data.head()

In [2]:
# Optional exploratory plot (disabled): histograms of the DataFrame's columns, 50 bins each.
# %matplotlib inline
# import matplotlib.pyplot as plt
# data.hist(bins=50, figsize=(20, 15))

# Train Test Splitting 

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train_set, test_set  = train_test_split(data, test_size=0.2, random_state=42)
# Report the size of each partition.
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

Rows in train set: 464
Rows in test set: 117



In [4]:
# The test set is left untouched from here on; keep working on a copy of the
# training split so the exploration below cannot modify train_set itself.
data = train_set.copy()
data.shape

(464, 6)

# Looking for correlation

In [5]:
# Pairwise correlations between all columns of the training copy.
corr_matrix = data.corr()
# Rank every column by its correlation with the target, strongest first.
corr_matrix.loc[:, 'PERCENTAGE'].sort_values(ascending=False)

PERCENTAGE          1.000000
SOCIAL_SCIENCE_P    0.862979
HINDI_P             0.852231
ENGLISH_P           0.840517
SCIENCE_P           0.818593
MATHEMATICS_P       0.783532
Name: PERCENTAGE, dtype: float64

###### Since CLASS is only weakly related to the target, we drop the CLASS column from our original data

# Splitting train set into features and labels

In [6]:
# Separate the predictors from the target column of the training split.
# The "before" shape is read from train_set rather than from `data`, so the
# cell prints the same two shapes no matter how often it is re-run.
print(train_set.shape)
data = train_set.drop("PERCENTAGE", axis=1)  # features only
print(data.shape)
data_labels = train_set["PERCENTAGE"].copy()  # target values, same row order as `data`


(464, 6)
(464, 5)


In [7]:
# Preview the feature frame after the target column has been removed.
data.head()

Unnamed: 0,ENGLISH_P,HINDI_P,MATHEMATICS_P,SCIENCE_P,SOCIAL_SCIENCE_P
431,3.6,4.4,4.6,3.2,3.6
208,8.8,9.6,8.4,7.4,8.6
184,6.6,7.2,8.4,7.0,5.6
177,4.2,4.4,4.4,2.6,2.6
192,7.2,6.4,8.8,6.6,5.6


# Creating Pipelines

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline. It currently holds a single standardisation step;
# further (name, transformer) pairs can be added to the list as needed.
my_pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

In [9]:
# Fit the pipeline on the training features and keep the transformed result.
data_num_tr = my_pipeline.fit_transform(data)

In [10]:
# Sanity check on the dimensions of the transformed training data.
data_num_tr.shape

(464, 5)

# Selecting a desired model for our data

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Estimator used for the rest of the notebook. Alternatives that were tried
# (swap one in to repeat the comparison):
#   model = DecisionTreeRegressor()
#   model = RandomForestRegressor()
model = LinearRegression()
print(data_num_tr)
# Train on the transformed features against the PERCENTAGE labels.
model.fit(X=data_num_tr, y=data_labels)

[[-1.87199891 -1.21494845 -1.10466754 -1.59893248 -1.24876928]
 [ 0.79071344  1.35580278  0.7459261   0.35124878  0.92561133]
 [-0.3358187   0.16930221  0.7459261   0.16551723 -0.37901703]
 ...
 [ 0.4834774  -0.62169817  0.35632744 -0.20594587 -0.20506659]
 [ 1.30277351  0.76255249  1.13552476  1.0013092   1.27351223]
 [-0.02858266  0.46592735 -1.00726787 -0.57740896  0.75166088]]


LinearRegression()

# The check below is not a reliable evaluation: it predicts on rows the model was trained on, so it cannot reveal overfitting


In [12]:
# Disabled spot check: predict five rows of the training data (positions 85-89)
# and print their true labels for comparison.
# some_data = data.iloc[85:90]
# some_labels = data_labels.iloc[85:90]
# prepared_data = my_pipeline.transform(some_data)
# model.predict(prepared_data)
# print(list(some_labels))


### So we will use another evaluation technique: cross-validation

In [13]:
# 1 2 3 4 5 6 7 8 9 10
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training data. The scorer returns the
# negated mean squared error, so the sign is flipped before taking the root.
scores = cross_val_score(
    estimator=model,
    X=data_num_tr,
    y=data_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
rmse_scores = np.sqrt(np.negative(scores))

In [14]:
# One RMSE value per cross-validation fold.
rmse_scores

array([6.53924372, 6.95295413, 6.75950967, 6.26834527, 7.01427917,
       6.39611325, 5.29748675, 7.25245294, 5.41677802, 6.43096713])

In [15]:
def print_scores(scores):
    """Print the given scores, then their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (for example a NumPy array).
    Nothing is returned; the three lines are written to standard output.
    """
    print("Scores:", scores)
    # Each summary line is a label followed by the result of the named method.
    for label, statistic in (("Mean: ", "mean"), ("Standard deviation: ", "std")):
        print(label, getattr(scores, statistic)())

# Summarise the per-fold RMSE values from cross-validation.
print_scores(rmse_scores)

Scores: [6.53924372 6.95295413 6.75950967 6.26834527 7.01427917 6.39611325
 5.29748675 7.25245294 5.41677802 6.43096713]
Mean:  6.432813004691022
Standard deviation:  0.6122262842335163


# So our linear regression model performs best
### linear regressor > random forest regressor > decision tree

# Saving the model

In [16]:
from joblib import dump, load
# Persist the fitted estimator to disk. NOTE(review): only `model` is written;
# the fitted scaler in `my_pipeline` is not part of this file, so anything that
# loads 'student.joblib' must apply the same fitted pipeline before predicting.
dump(model, 'student.joblib')

['student.joblib']

## Testing the model on test data

In [21]:
from sklearn.metrics import mean_squared_error

# Split the held-out set into predictors and target, mirroring the training split.
X_test = test_set.drop(columns="PERCENTAGE")
Y_test = test_set.loc[:, "PERCENTAGE"].copy()

# Reuse the already-fitted pipeline (transform only, no refit) and predict.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

# Root-mean-squared error of the predictions on the test set.
final_mse = mean_squared_error(y_true=Y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
# print(final_predictions, list(Y_test))

In [22]:
# Root-mean-squared error measured on the held-out test set.
final_rmse

6.390959411947278

## Using the model

In [19]:
from joblib import dump, load
import numpy as np
# Reload the persisted estimator. Only the regressor is stored in the file,
# so the fitted `my_pipeline` from this session still performs the scaling.
model = load('student.joblib')
# One sample student, one mark per subject. It is built as a DataFrame with
# the training feature columns (same names, same order) because the scaler
# was fitted on a DataFrame: a bare ndarray loses the feature names, makes
# recent scikit-learn versions warn "X does not have valid feature names",
# and would hide a column-order mistake.
periodic_marks = pd.DataFrame([[10.0, 8.0, 1.0, 1.0, 1.0]], columns=data.columns)
feature = my_pipeline.transform(periodic_marks)



In [20]:
# Predicted PERCENTAGE for the scaled sample marks.
model.predict(feature)

array([46.07243497])