# Multiple Linear Regression

## Importing the libraries

In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer

import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from math import sqrt

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.preprocessing import OneHotEncoder

## START THE PROCESS OF FEATURE SELECTION HERE

### Import the Data

In [4]:


# Load dataset

Math = pd.read_csv('C:/Tejinder/SDS/Student Performance Analysis/student+performance/student/student-mat.csv', sep=';', header=0)  # Header is in the first row
Portug = pd.read_csv('C:/Tejinder/SDS/Student Performance Analysis/student+performance/student//student-por.csv', sep=';', header=0)  # Header is in the first row

print (Math.dtypes)
print(Portug.dtypes)




school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object
school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64

### Identify the Categorical and Numerical Columns to enable Label enconding

In [5]:
# data = pd.read_csv('student-mat.csv', sep=';')  # Replace with your file path
categorical_features_indices = Math.select_dtypes(include=['object', 'category'])

# Drop G3 (target) from numerical features
numerical_features_indices = [Math.columns[i] for i in range(Math.shape[1]) 
                              if i not in categorical_features_indices and Math.columns[i] != 'G3']

all_feature_names = list (categorical_features_indices ) + list( numerical_features_indices )
print(all_feature_names)

['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']


### Label Encoding

In [6]:
# Preprocessing: Encode categorical features

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoders = {}

# Apply LabelEncoder to each categorical column
for col in categorical_features_indices:
    le = LabelEncoder()
    Math[col] = le.fit_transform(Math[col])
    label_encoders[col] = le  # Store the encoder for future use if needed

print("\nDataFrame after Label Encoding:")
print(Math)



DataFrame after Label Encoding:
     school  sex  age  address  famsize  Pstatus  Medu  Fedu  Mjob  Fjob  ...  \
0         0    0   18        1        0        0     4     4     0     4  ...   
1         0    0   17        1        0        1     1     1     0     2  ...   
2         0    0   15        1        1        1     1     1     0     2  ...   
3         0    0   15        1        0        1     4     2     1     3  ...   
4         0    0   16        1        0        1     3     3     2     2  ...   
..      ...  ...  ...      ...      ...      ...   ...   ...   ...   ...  ...   
390       1    1   20        1        1        0     2     2     3     3  ...   
391       1    1   17        1        1        1     3     1     3     3  ...   
392       1    1   21        0        0        1     1     1     2     2  ...   
393       1    1   18        0        1        1     3     2     3     2  ...   
394       1    1   19        1        1        1     1     1     2     0  ..

### Dividing the Dataframe into X and y as well as Test and Train

In [25]:
# Importing the dataset
X = Math.iloc[:, :-1]
y = Math.iloc[:, -1]


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
#print(X_train)
#print(X_test)
#print(y_train)
#print(y_test)
columns_to_keep = ['absences','higher','famrel','G1','G2']
X_train = X_train_all[columns_to_keep]
X_test=X_test_all[columns_to_keep]
print(X_train)
print(X_test)

     absences  higher  famrel  G1  G2
23          0       1       5  13  13
296         0       1       2  10   9
13          2       1       5  10  10
249         0       1       4  13  15
61          6       1       5  10   8
..        ...     ...     ...  ..  ..
203        18       1       5   7   6
255         2       1       4   7   9
72          2       1       3   8   6
235        10       1       5  11   9
37          7       1       2  15  16

[316 rows x 5 columns]
     absences  higher  famrel  G1  G2
146         0       1       3   6   7
379        17       1       4  10  10
247        16       0       5   6   8
197         8       1       3   9   9
368         0       1       5  11  10
..        ...     ...     ...  ..  ..
41          8       1       5  12  12
204         6       1       5  10  10
362         0       1       4  11  11
233         2       1       4  14  13
275         6       1       4  12  12

[79 rows x 5 columns]


## XGBOOST and model performance


In [26]:
# Training XGBoost on the Training set
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

xgboostmodel = XGBRegressor(n_estimators=50, learning_rate=0.1)
xgboostmodel.fit(X_train, y_train)
y_pred = xgboostmodel.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"Test Mean Squared Error: {mse}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape * 100:.2f}%")
# Save the trained model
#joblib.dump(xgboostmodel, 'xgboost_model.pkl')

R-squared: 0.84555983543396
Test Mean Squared Error: 2.111481695476064
MAPE: 67982752951148240.00%


In [27]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import KFold, cross_val_score,GridSearchCV 
from sklearn.metrics import make_scorer, r2_score, mean_absolute_percentage_error

# Define custom MAPE scoring function
def mape_scorer(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)
# Set up K-Fold Cross Validation
kf = KFold(n_splits=10, shuffle=True, random_state=0)
r2_scores = cross_val_score(xgboostmodel, X, y, cv=kf, scoring='r2')

# Print out the results
print(f"R-squared scores for each fold: {r2_scores}")
print(f"Average R-squared: {r2_scores.mean():.4f}")


R-squared scores for each fold: [0.8982318  0.83618879 0.81873196 0.93465918 0.76103693 0.85769647
 0.68523324 0.87543714 0.92748076 0.90814197]
Average R-squared: 0.8503


In [28]:
# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Set up GridSearchCV with KFold cross-validation
grid_search = GridSearchCV(estimator=xgboostmodel,
                           param_grid=param_grid,
                           scoring='r2',  # R-squared as the scoring metric
                           cv=kf,
                           verbose=1,  # Print progress
                           n_jobs=-1)  # Use all available cores

# Fit the model using GridSearchCV
grid_search.fit(X, y)

# Print the best hyperparameters found by GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Best model after tuning
best_model = grid_search.best_estimator_

# Evaluate the model using K-Fold Cross Validation (R-squared and MAPE)
r2_scores = cross_val_score(best_model, X, y, cv=kf, scoring='r2')
mape_scores = cross_val_score(best_model, X, y, cv=kf, scoring=make_scorer(mape_scorer))

# Print out the results
print(f"R-squared scores for each fold: {r2_scores}")
print(f"Average R-squared: {r2_scores.mean():.4f}")

print(f"MAPE scores for each fold: {mape_scores}")
print(f"Average MAPE: {mape_scores.mean() * 100:.2f}%")

Fitting 10 folds for each of 243 candidates, totalling 2430 fits
Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
R-squared scores for each fold: [0.85629171 0.85545176 0.85195452 0.92278898 0.83223325 0.9000237
 0.79977494 0.87236077 0.96696502 0.91651487]
Average R-squared: 0.8774
MAPE scores for each fold: [1.67497025e+15 2.01631251e+15 1.77098587e+15 2.90484275e+14
 7.28889290e+14 3.89982032e+14 9.30767208e+14 8.22539353e+14
 2.48223412e+14 1.16249954e+15]
Average MAPE: 100356537405165232.00%
