<a href="https://colab.research.google.com/github/Dima-salang/ai-ml-dl/blob/main/LinReg_TestScores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Test Scores using Linear Regression



In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import keras

# Load data
test_scores_csv = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/test_scores.csv').dropna()


# Define feature and label columns
feature_columns = ['Gender', 'EthnicGroup', 'ParentEduc', 'TestPrep', 'ParentMaritalStatus', 'IsFirstChild', 'NrSiblings', 'TransportMeans', 'WklyStudyHours']
label_columns = ['MathScore', 'ReadingScore', 'WritingScore']

# Extract features and labels
test_scores_features = test_scores_csv[feature_columns]
test_scores_labels = test_scores_csv[label_columns]

# Define transformers
categorical_features = ['Gender', 'EthnicGroup', 'ParentEduc', 'TestPrep', 'ParentMaritalStatus', 'IsFirstChild', 'WklyStudyHours', 'TransportMeans']
numeric_features = ['NrSiblings']

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
        ('numeric', StandardScaler(), numeric_features),
    ],
    remainder='passthrough'
)

model = LinearRegression()

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(test_scores_features, test_scores_labels, test_size=0.2)

print(test_scores_features.shape)
print(test_scores_labels.shape)
# Fit the model using the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
score = pipeline.score(X_test, y_test)
prediction = pipeline.predict(X_test)
print("Prediction: ", prediction)
print("True label: ", y_test)

print("R-squared score:", score)
mse = mean_squared_error(y_test, prediction)
print("MSE: ", mse)



(19243, 9)
(19243, 3)
Prediction:  [[65.29234698 74.05578852 73.82898441]
 [70.18735776 79.04771199 80.85422916]
 [65.67108971 62.3084339  59.13791655]
 ...
 [63.25280656 60.34513154 56.19390011]
 [79.50568783 73.66028156 71.28532619]
 [71.24085065 76.43088027 74.80061358]]
True label:         MathScore  ReadingScore  WritingScore
23775         59            62            57
10638         69            77            77
17851         88            84            84
20871         82            96            91
9814          73            82            76
...          ...           ...           ...
4293          71            78            75
2001          81            82            74
16746         83            81            72
18202         71            62            69
42            48            55            52

[3849 rows x 3 columns]
R-squared score: 0.18936199693196243
MSE:  183.2279091286135
