In [1]:
from google.colab import drive

# Attach Google Drive to the runtime's filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd

# Location of the dataset CSV.
file_path = '/content/drive/MyDrive/StudentsPerformance.csv'

# Load the CSV into `df`. A failure is reported with a readable hint and then
# re-raised: without `df` the notebook cannot continue, and swallowing the
# error would only resurface later as an unrelated NameError.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    print("Please double-check the file path and ensure it exists in your Google Drive.")
    raise
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    raise
else:
    # Only reached when the read succeeded, so a problem while displaying the
    # preview is never misreported as a loading error.
    print(f"Dataset loaded successfully from {file_path}")
    display(df.head())

Dataset loaded successfully from /content/drive/MyDrive/StudentsPerformance.csv


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Data preparation

### Subtask:
Split the dataset into features (X) and the target variable (y). Then, split the data into training and testing sets.


In [3]:
from sklearn.model_selection import train_test_split

# The math score is the regression target; every other column is a feature.
y = df['math score']
X = df.drop(columns=['math score'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition as a quick sanity check.
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (800, 7)
Shape of X_test: (200, 7)
Shape of y_train: (800,)
Shape of y_test: (200,)


## Baseline model

### Subtask:
Calculate a simple baseline prediction, such as the mean of the target variable in the training set. Evaluate the baseline using MAE.


In [4]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Constant baseline: predict the training-set mean for every test row.
baseline_prediction = y_train.mean()
baseline_test_predictions = np.full(y_test.shape, baseline_prediction)

# Score the constant prediction against the held-out targets.
baseline_mae = mean_absolute_error(y_test, baseline_test_predictions)
print(f"Baseline MAE: {baseline_mae}")

Baseline MAE: 12.339850000000002


## Model training

### Subtask:
Train a linear regression model using the training data.


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Split the feature columns by dtype. Every non-numeric column is treated as
# categorical, so `category` or string-dtype columns get encoded as well rather
# than only plain `object` columns.
numerical_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(exclude=np.number).columns

# One-hot encode the categorical columns and forward the numerical ones
# unchanged. Both groups are listed explicitly so that no raw non-numeric
# column can slip through to the regressor via an implicit remainder.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at prediction time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features),
    ])

# Chain preprocessing and the regressor so the encoder is fitted on the
# training data only and reused as-is at prediction time.
model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression())])

# Fit the whole pipeline on the training split.
model.fit(X_train, y_train)

print("Linear Regression model trained successfully.")

Linear Regression model trained successfully.


## Model evaluation

### Subtask:
Evaluate the trained model on both the training and testing sets using appropriate metrics for regression, such as Mean Absolute Error (MAE), Mean Squared Error (MSE), or R-squared.


In [6]:
from sklearn.metrics import mean_absolute_error

# Training split: predict, then score with mean absolute error.
y_train_pred = model.predict(X_train)
train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing split: same procedure on the held-out rows.
y_test_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Show both errors together so they can be compared directly.
print(f"Training MAE: {train_mae}")
print(f"Testing MAE: {test_mae}")

Training MAE: 4.266711846071958
Testing MAE: 4.214763142474854


## Comparison

### Subtask:
Compare the performance of the trained model with the baseline model.


In [7]:
print(f"Baseline MAE: {baseline_mae}")
print(f"Linear Regression Test MAE: {test_mae}")

# Choose the verdict first, then print it once. Lower MAE is better; the final
# branch covers every case where neither strict comparison holds.
if test_mae < baseline_mae:
    verdict = "\nThe linear regression model performed better than the simple baseline model."
elif test_mae > baseline_mae:
    verdict = "\nThe linear regression model performed worse than the simple baseline model."
else:
    verdict = "\nThe linear regression model performed similarly to the simple baseline model."

print(verdict)

Baseline MAE: 12.339850000000002
Linear Regression Test MAE: 4.214763142474854

The linear regression model performed better than the simple baseline model.


## Summary:

### Data Analysis Key Findings

* The dataset was split into training (80%) and testing (20%) sets, with 800 samples for training and 200 for testing.
* A simple baseline model, predicting the mean of the training target variable, achieved a Mean Absolute Error (MAE) of approximately 12.34 on the test set.
* A linear regression model, incorporating one-hot encoding for categorical features, was trained on the training data.
* The trained linear regression model achieved a training MAE of approximately 4.27 and a testing MAE of approximately 4.21.
* The linear regression model performed substantially better than the baseline model, with a test MAE of 4.21 compared to the baseline MAE of 12.34 (about 66% lower).

### Insights

* The linear regression model provides a substantial improvement in predicting 'math score' compared to simply using the average score.
* The training MAE (4.27) and testing MAE (4.21) are very close, which suggests that the model is not overfitting to the training data; because this rests on a single 80/20 split, cross-validation would be needed to confirm that it generalizes well to unseen data.

