# Data preprocessing

## Import the libraries

In [97]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [90]:

# Load the student-performance CSV and split it column-wise:
# every column except the last becomes the feature matrix X,
# and the last column becomes the target vector y.
dataset = pd.read_csv('./data/KIT_Student_Performance.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

# Sanity check on the resulting array dimensions.
print(X.shape, y.shape)

(814, 4) (814,)


In [91]:

# One-hot encode the four categorical feature columns so that the text
# categories become numerical 0/1 indicator columns.
# handle_unknown='ignore' makes a category that was not seen during fit
# encode as all zeros at transform time instead of raising an error.
ct = ColumnTransformer(
	transformers=[('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,1,2,3])],
	remainder='passthrough'
)
# Encode from the raw feature columns of `dataset` rather than from X itself:
# `X = ct.fit_transform(X)` overwrites its own input, so running this cell a
# second time would re-encode the already-encoded matrix and silently change X.
X = ct.fit_transform(dataset.iloc[:, :-1].values)

print(X.shape)

(814, 28)


In [92]:

# Split into a training set and a test set: 20% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [93]:
# Create a linear regression model and fit it on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [94]:
# Predict on the held-out rows, then print predictions beside the actual
# values as a two-column array: [predicted, actual], rounded to 2 decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.hstack((y_pred.reshape(-1, 1), y_test.reshape(-1, 1))))

[[78.75 81.  ]
 [70.23 68.  ]
 [78.75 66.  ]
 [76.29 84.  ]
 [91.58 92.  ]
 [73.   90.  ]
 [93.5  93.  ]
 [76.29 90.  ]
 [92.97 78.  ]
 [91.38 93.  ]
 [93.76 99.  ]
 [78.75 74.  ]
 [92.97 88.  ]
 [73.   85.  ]
 [76.29 65.  ]
 [69.14 63.  ]
 [92.97 95.  ]
 [86.5  90.  ]
 [82.2  95.  ]
 [91.58 83.  ]
 [91.06 91.  ]
 [93.76 99.  ]
 [78.21 81.  ]
 [76.06 84.  ]
 [93.76 88.  ]
 [92.31 94.  ]
 [74.34 74.  ]
 [93.5  95.  ]
 [93.5  93.  ]
 [86.5  86.  ]
 [82.27 92.88]
 [82.27 78.48]
 [69.14 69.  ]
 [74.34 89.  ]
 [74.34 61.  ]
 [92.31 91.  ]
 [76.06 79.  ]
 [91.58 97.  ]
 [93.76 97.  ]
 [93.5  93.  ]
 [93.76 98.  ]
 [74.34 74.  ]
 [74.34 71.  ]
 [91.06 89.  ]
 [92.97 99.  ]
 [93.5  93.  ]
 [91.58 91.  ]
 [73.   72.  ]
 [72.1  78.  ]
 [70.23 79.  ]
 [65.32 64.  ]
 [91.38 94.  ]
 [78.21 72.  ]
 [65.32 67.  ]
 [70.23 74.  ]
 [91.38 93.  ]
 [82.27 81.28]
 [86.5  81.  ]
 [70.23 69.  ]
 [91.58 94.  ]
 [65.32 60.  ]
 [78.21 61.  ]
 [91.06 92.  ]
 [89.53 89.  ]
 [69.14 73.  ]
 [89.53 87.  ]
 [76.29 75

In [None]:

# Compare the predictions (y_pred) with the actual scores (y_test).
# For MAE, MSE and RMSE lower is better; for R2 closer to 1 is better.

# MAE: average absolute difference between predicted and actual.
mae = mean_absolute_error(y_test, y_pred)
# MSE: average squared difference, so larger errors are penalized more.
mse = mean_squared_error(y_test, y_pred)
# RMSE: square root of MSE, which puts it back in the same units as the scores.
# (scikit-learn >= 1.4 also provides root_mean_squared_error for this.)
rmse = np.sqrt(mse)
# R2: how much of the variation in y (scores) is explained by X (features).
r2 = r2_score(y_test, y_pred)

metric_report = (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R2)", r2),
)

print("\n--- Our Model's Performance Metrics ---")
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.2f}")


--- Our Model's Performance Metrics ---
Mean Absolute Error (MAE): 5.20
Mean Squared Error (MSE): 49.01
Root Mean Squared Error (RMSE): 7.00
R-squared (R2): 0.59


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Self-contained end-to-end run: load -> encode -> split -> fit -> evaluate.

# Load the dataset: every column except the last is a feature, the last
# column is the target.
dataset = pd.read_csv('./data/KIT_Student_Performance.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

print(X.shape, y.shape)


# One-hot encode the four categorical feature columns into 0/1 indicators.
ct = ColumnTransformer(
	transformers=[('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0,1,2,3])],
	remainder='passthrough'
)
# Fit on the feature matrix only. Passing the whole `dataset` here would let
# remainder='passthrough' copy the target column into X (target leakage) and
# would fit the transformer on five input columns instead of the four that
# later prediction inputs provide.
X = ct.fit_transform(X)

print(X.shape)


# Split into training and test sets (20% held out, reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit the linear regression model on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict on the test split and print [predicted, actual] pairs side by side.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# The bare last expression makes the R2 score the cell's displayed output.
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

(814, 4) (814,)
(814, 28)
[[78.75 81.  ]
 [70.23 68.  ]
 [78.75 66.  ]
 [76.29 84.  ]
 [91.58 92.  ]
 [73.   90.  ]
 [93.5  93.  ]
 [76.29 90.  ]
 [92.97 78.  ]
 [91.38 93.  ]
 [93.76 99.  ]
 [78.75 74.  ]
 [92.97 88.  ]
 [73.   85.  ]
 [76.29 65.  ]
 [69.14 63.  ]
 [92.97 95.  ]
 [86.5  90.  ]
 [82.2  95.  ]
 [91.58 83.  ]
 [91.06 91.  ]
 [93.76 99.  ]
 [78.21 81.  ]
 [76.06 84.  ]
 [93.76 88.  ]
 [92.31 94.  ]
 [74.34 74.  ]
 [93.5  95.  ]
 [93.5  93.  ]
 [86.5  86.  ]
 [82.27 92.88]
 [82.27 78.48]
 [69.14 69.  ]
 [74.34 89.  ]
 [74.34 61.  ]
 [92.31 91.  ]
 [76.06 79.  ]
 [91.58 97.  ]
 [93.76 97.  ]
 [93.5  93.  ]
 [93.76 98.  ]
 [74.34 74.  ]
 [74.34 71.  ]
 [91.06 89.  ]
 [92.97 99.  ]
 [93.5  93.  ]
 [91.58 91.  ]
 [73.   72.  ]
 [72.1  78.  ]
 [70.23 79.  ]
 [65.32 64.  ]
 [91.38 94.  ]
 [78.21 72.  ]
 [65.32 67.  ]
 [70.23 74.  ]
 [91.38 93.  ]
 [82.27 81.28]
 [86.5  81.  ]
 [70.23 69.  ]
 [91.58 94.  ]
 [65.32 60.  ]
 [78.21 61.  ]
 [91.06 92.  ]
 [89.53 89.  ]
 [69.14 73.  ]

0.5875012740503857

In [6]:



# Describe one new subject as a single-row frame, one key per input column.
new_input_data_2 = pd.DataFrame({
    'Subject': ['Java with Functional Programming'],
    'Semester': ['Semester 2'],
    'Type of Subject': ['Core Subject'],
    'Nature of Subject': ['Theoretical'],
})

print("\nNew raw input data 2:")
print(new_input_data_2.to_markdown(index=False, numalign="left", stralign="left"))

# Encode the row with the already-fitted transformer, then predict its score.
new_input_processed_2 = ct.transform(new_input_data_2)
predicted_score_2 = regressor.predict(new_input_processed_2)

print(f"Predicted Score for new input 2: {predicted_score_2[0]:.2f}")


New raw input data 2:
| Subject                          | Semester   | Type of Subject   | Nature of Subject   |
|:---------------------------------|:-----------|:------------------|:--------------------|
| Java with Functional Programming | Semester 2 | Core Subject      | Theoretical         |
Predicted Score for new input 2: 69.94


In [9]:
# Describe one new subject as a single record; the dict keys become the columns.
new_input_data_2 = pd.DataFrame([
    {
        'Subject': 'Mathematics II: Modern Algebra and Number Theory',
        'Semester': 'Semester 2',
        'Type of Subject': 'Core Subject',
        'Nature of Subject': 'Practical',
    }
])

print("\nNew raw input data 2:")
print(new_input_data_2.to_markdown(index=False, numalign="left", stralign="left"))

# Encode the row with the already-fitted transformer, then predict its score.
new_input_processed_2 = ct.transform(new_input_data_2)
predicted_score_2 = regressor.predict(new_input_processed_2)

print(f"Predicted Score for new input 2: {predicted_score_2[0]:.2f}")


New raw input data 2:
| Subject                                          | Semester   | Type of Subject   | Nature of Subject   |
|:-------------------------------------------------|:-----------|:------------------|:--------------------|
| Mathematics II: Modern Algebra and Number Theory | Semester 2 | Core Subject      | Practical           |
Predicted Score for new input 2: 75.36
