In [1]:
import pandas as pd                                                                             # type: ignore
from sklearn.model_selection import train_test_split                                            # type: ignore
from sklearn.preprocessing import PolynomialFeatures                                            # type: ignore
from sklearn.metrics import mean_absolute_error , mean_squared_error , median_absolute_error    # type: ignore
from sklearn.linear_model import LinearRegression                                               # type: ignore

# Load the CSV into a DataFrame; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='../datasets/stag.csv')
# Keep the preview as the cell's last expression so the first ten rows are rendered.
dataset.head(n=10)

Unnamed: 0,high_GPA,math_SAT,verb_SAT,comp_GPA,univ_GPA
0,3.45,643,589,3.76,3.52
1,2.78,558,512,2.87,2.91
2,2.52,583,503,2.54,2.4
3,3.67,685,602,3.83,3.47
4,3.24,592,538,3.29,3.47
5,2.1,562,486,2.64,2.37
6,2.82,573,548,2.86,2.4
7,2.36,559,536,2.03,2.24
8,2.42,552,583,2.81,3.02
9,3.51,617,591,3.41,3.32


In [2]:
# Positional split: every column except the last is a feature, the last column is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Show the first ten rows of each so the split can be checked by eye.
print(f"X >> \n{X.head(10)}\n")
print(f"y >> \n{y.head(10)}")

X >> 
   high_GPA  math_SAT  verb_SAT  comp_GPA
0      3.45       643       589      3.76
1      2.78       558       512      2.87
2      2.52       583       503      2.54
3      3.67       685       602      3.83
4      3.24       592       538      3.29
5      2.10       562       486      2.64
6      2.82       573       548      2.86
7      2.36       559       536      2.03
8      2.42       552       583      2.81
9      3.51       617       591      3.41

y >> 
0    3.52
1    2.91
2    2.40
3    3.47
4    3.47
5    2.37
6    2.40
7    2.24
8    3.02
9    3.32
Name: univ_GPA, dtype: float64


In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shape and contents of each of the four partitions.
print(f"X_train shape = {X_train.shape} , X_train >> \n{X_train}\n========")
print(f"X_test  shape = {X_test.shape}  , X_test  >> \n{X_test}\n========")
print(f"y_train shape = {y_train.shape} , y_train >> \n{y_train}\n========")
print(f"y_test  shape = {y_test.shape}  , y_test  >> \n{y_test}")

X_train shape = (84, 4) , X_train >> 
     high_GPA  math_SAT  verb_SAT  comp_GPA
68       3.29       692       563      3.17
3        3.67       685       602      3.83
74       3.48       692       698      3.54
82       3.72       621       589      4.00
75       3.62       684       609      3.48
..        ...       ...       ...       ...
101      3.49       692       683      3.27
67       2.24       554       542      2.07
64       3.94       691       645      3.98
47       2.66       607       528      2.94
44       2.65       604       617      3.31

[84 rows x 4 columns]
X_test  shape = (21, 4)  , X_test  >> 
     high_GPA  math_SAT  verb_SAT  comp_GPA
26       2.64       580       538      2.51
61       3.24       643       607      3.24
2        2.52       583       503      2.54
62       3.29       608       649      3.53
85       3.28       651       640      3.32
48       3.21       619       573      2.84
16       3.91       703       684      3.84
100      3.76       

In [4]:
# Expand the features into degree-2 polynomial terms
# (bias column, the original features, their squares and pairwise products).
poly_reg = PolynomialFeatures(degree = 2)

# Fit the expansion on the training features only, then transform them.
X_train2 = poly_reg.fit_transform(X_train)

# Apply the already-fitted transformer to the test features. The test set must
# never be used to (re)fit a preprocessing step, so this is `transform`, not
# `fit_transform`.
X_test2  = poly_reg.transform(X_test)

# Show the expanded shapes and the first two rows of each matrix.
print(f"X_train2.shape >> {X_train2.shape} , X_train2 : \n{X_train2[:2]}\n")
print(f"X_test2.shape  >> {X_test2.shape}  , X_test2  : \n{X_test2[:2]}")

X_train2.shape >> (84, 15) , X_train2 : 
[[1.00000e+00 3.29000e+00 6.92000e+02 5.63000e+02 3.17000e+00 1.08241e+01
  2.27668e+03 1.85227e+03 1.04293e+01 4.78864e+05 3.89596e+05 2.19364e+03
  3.16969e+05 1.78471e+03 1.00489e+01]
 [1.00000e+00 3.67000e+00 6.85000e+02 6.02000e+02 3.83000e+00 1.34689e+01
  2.51395e+03 2.20934e+03 1.40561e+01 4.69225e+05 4.12370e+05 2.62355e+03
  3.62404e+05 2.30566e+03 1.46689e+01]]

X_test2.shape  >> (21, 15)  , X_test2  : 
[[1.00000e+00 2.64000e+00 5.80000e+02 5.38000e+02 2.51000e+00 6.96960e+00
  1.53120e+03 1.42032e+03 6.62640e+00 3.36400e+05 3.12040e+05 1.45580e+03
  2.89444e+05 1.35038e+03 6.30010e+00]
 [1.00000e+00 3.24000e+00 6.43000e+02 6.07000e+02 3.24000e+00 1.04976e+01
  2.08332e+03 1.96668e+03 1.04976e+01 4.13449e+05 3.90301e+05 2.08332e+03
  3.68449e+05 1.96668e+03 1.04976e+01]]


In [5]:
# Ordinary least-squares fit on the polynomial-expanded training features.
# Only the features were expanded; the target y_train is used unchanged.
lin_reg_2 = LinearRegression().fit(X_train2, y_train)

# Predict the target for the held-out rows from their expanded features.
y_pred2 = lin_reg_2.predict(X_test2)

In [7]:
# Evaluate the test-set predictions with three error metrics.

# Mean Absolute Error:
MAEValue = mean_absolute_error(y_test, y_pred2, multioutput='uniform_average')   # it can be raw_values
print(f'Mean   Absolute Error Value : {MAEValue}')
##########################################################
# Mean Squared Error:
MSEValue = mean_squared_error(y_test, y_pred2, multioutput='uniform_average')    # it can be raw_values
print(f'Mean   Squared  Error Value : {MSEValue}')
##########################################################
# Median Absolute Error:
# The value comes from median_absolute_error, so it is labelled "Absolute"
# (it was previously mislabelled "Squared"). The variable name MdSEValue is
# kept unchanged in case other cells read it.
MdSEValue = median_absolute_error(y_test, y_pred2)
print(f'Median Absolute Error Value : {MdSEValue}')

Mean   Absolute Error Value : 0.10675162816413909
Mean   Squared  Error Value : 0.017286193500888718
Median Squared  Error Value : 0.08555882165605366
