# Test Scores Toy Data

In [None]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pd.options.display.max_rows = 20  # Shows 20 rows
pd.options.display.max_columns = None  # Shows All columns

## Loading the dataset

In [None]:
# These lines would load the data locally
# data_root = "./"
# filename = "test-scores.csv"
# filepath = os.path.join(data_root, filename)
# df = pd.read_csv(filepath)

# We'll fetch it directly from the web
data_url = "https://aet-cs.github.io/white/ML/data/test-scores.csv"
df = pd.read_csv(data_url)
df

`describe` gives a quick overview of each feature

In [None]:
df.describe()

## Data Exploration

Show all the columns. Target is 'final_exam_score'

In [None]:
df.columns

Store the name of the target column, then get the size of the dataframe. `shape` returns (rows, cols)

In [None]:
target = "final_exam_score"

In [None]:
df.shape

Let's get all the data types

In [None]:
df.dtypes

In [None]:
df.hist(target);

In [None]:
df.hist(figsize=(15,15));

### Individual Plots

In [None]:
df.columns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

def plot_X_vs_Y(col1_name, col2_name, col1_label, col2_label, data=None):
    """Scatter-plot one column against another with a fitted regression line.

    Fits a one-variable linear regression of ``col2_name`` on ``col1_name``,
    prints the fitted equation and R², and shows the scatter plot with the
    regression line and an annotation box.

    Parameters
    ----------
    col1_name : str
        Column used as the predictor (x axis).
    col2_name : str
        Column used as the response (y axis).
    col1_label, col2_label : str
        Human-readable axis labels; also used to build the title.
    data : pandas.DataFrame, optional
        Frame to plot from. When omitted, the CSV at the notebook-level
        ``data_url`` is read, as before.
    """
    # Reuse an already-loaded frame when one is supplied; downloading the CSV
    # again on every call is only the fallback.
    frame = pd.read_csv(data_url) if data is None else data

    # LinearRegression rejects NaN, so fit and plot only the rows where both
    # columns are present (this also keeps the line and the points aligned).
    frame = frame.dropna(subset=[col1_name, col2_name])

    # Extract variables
    X = frame[[col1_name]].values  # needs to be 2D for sklearn
    y = frame[col2_name].values

    # Fit linear regression
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Get statistics
    r2 = r2_score(y, y_pred)
    slope = model.coef_[0]
    intercept = model.intercept_

    print("Regression Statistics:")
    print(f"  Equation: y = {slope:.3f}x + {intercept:.3f}")
    print(f"  R² = {r2:.3f}")

    # Create plot
    plt.figure(figsize=(10, 6))
    plt.scatter(frame[col1_name], frame[col2_name],
                alpha=0.6, s=50, label='Students')
    plt.plot(frame[col1_name], y_pred,
             color='red', linewidth=2, label='Regression Line')

    plt.xlabel(f'{col1_label}', fontsize=12)
    plt.ylabel(f'{col2_label}', fontsize=12)
    plt.title(f'{col1_label} vs {col2_label}', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Add equation to plot (axes-relative coordinates: top-left corner)
    plt.text(0.05, 0.95, f'y = {slope:.3f}x + {intercept:.3f}\nR² = {r2:.3f}',
             transform=plt.gca().transAxes, fontsize=11,
             verticalalignment='top', bbox=dict(boxstyle='round',
             facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    plt.show()

In [None]:
plot_X_vs_Y('minutes_studying','final_exam_score','Minutes Studying', 'Exam Score')

In [None]:
plot_X_vs_Y('num_pets','final_exam_score','Number of Pets', 'Exam Score')

In [None]:
plot_X_vs_Y('screen_time_minutes','final_exam_score','Screen Time', 'Exam Score')

In [None]:
plot_X_vs_Y('minutes_studying','current_grade','minutes_studying','current_grade')

## Correlation matrix heat map

Let's get a quick visual representation of the relationship between features in this dataset. First drop the non-predictive attributes. Use a **new name** so we don't pollute the original dataframe

In [None]:
df_heat = df.drop(["student_id"], axis = 1)

In [None]:
corr_matrix = df_heat.corr()

In [None]:
plt.figure(figsize=(16, 6))
# Mask the upper triangle (diagonal included) so each feature pair is drawn
# only once. Use the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap = sns.heatmap(corr_matrix, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Test Scores Correlation Heatmap', fontdict={'fontsize':14}, pad=16);
plt.show()

Which features seem to be important?

In [None]:
# Keep the rows whose correlation with the target exceeds 0.1 in absolute
# value. The target's own row (correlation with itself) passes this filter too.
row_filter = abs(corr_matrix[target])>0.1
top_features = pd.DataFrame(corr_matrix[target][row_filter])
# Display in ascending order: most negative correlation first
top_features.sort_values(by=target)

## Data Modeling

Here we will run a linear regression. First we need to clean up the data a bit. We will create a data pipeline so we can repeat this process as needed.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

### Linear Regression

Three methods will load the data, preprocess it, and create X and y datasets for training and testing.

In [None]:
def get_data(filename):
    """Read the CSV at *filename* and return it as a DataFrame.

    *filename* is handed straight to ``pd.read_csv``.
    """
    return pd.read_csv(filename)

In [None]:

def pre_process_data(df, one_hot_encode = False):
    """Impute missing values and optionally encode the categorical columns.

    Numeric columns are filled with their median and object-dtype columns
    with their most frequent value. The caller's frame is left untouched;
    a processed copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Every numeric column is imputed, the target included if it
        is numeric.
    one_hot_encode : bool, default False
        When True, the object-dtype columns are replaced by the codes produced
        by ``OrdinalEncoder``. NOTE: despite its name this flag performs
        ordinal encoding, not one-hot encoding (``pd.get_dummies`` would be
        the one-hot alternative); the name is kept so existing calls still work.

    Returns
    -------
    pandas.DataFrame
        The imputed (and optionally encoded) copy of ``df``.
    """
    # Work on a copy so repeated runs of the pipeline always start from the
    # same raw data instead of a frame altered by an earlier call.
    df = df.copy()

    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(include=object).columns

    # The sklearn transformers reject a zero-column input, so each step is
    # skipped when the frame has no columns of that kind.
    if len(num_cols) > 0:
        simple_median = SimpleImputer(strategy='median')
        df[num_cols] = simple_median.fit_transform(df[num_cols])

    if len(cat_cols) > 0:
        simple_most_freq = SimpleImputer(strategy='most_frequent')
        df[cat_cols] = simple_most_freq.fit_transform(df[cat_cols])

        if one_hot_encode:
            O_encoder = OrdinalEncoder()
            df[cat_cols] = O_encoder.fit_transform(df[cat_cols])

    return df

In [None]:
def get_test_train(df, test_size = 0.2, random_state = 42, target = "final_exam_score"):
    """Split a frame into train/test feature matrices and target vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``target`` column; every other column is used as
        a feature.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the share of rows held out.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.
    target : str, default "final_exam_score"
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Reload the raw data and split it into train and test sets.
# NOTE(review): pre_process_data() is not applied here, and X keeps every
# non-target column (student_id among them) — confirm this is intended.
df = get_data(data_url)
X_train, X_test, y_train, y_test = get_test_train(df)

In [None]:
lreg = LinearRegression()
model = lreg.fit(X_train, y_train)

y_pred = lreg.predict(X_test)
# r2_score expects (y_true, y_pred). R-squared is not symmetric in its
# arguments, so the observed scores must be passed first.
print(f"Train R-squared = {r2_score(y_train, lreg.predict(X_train)):5.3}")
print(f"Test R-squared  = {r2_score(y_test, y_pred):5.3}")

In [None]:
# Predicted vs. true exam scores on the held-out test set
plt.scatter(y_test, y_pred);
# y = x reference line: a perfect prediction falls exactly on it.
# The 60-100 span is hard-coded rather than derived from the data.
plt.plot([60,100],[60,100],color='red')
plt.title("Predicted Score vs. Real Score")
plt.xlabel("True Exam Score")
plt.ylabel("Predicted Exam Score")

## Advanced Techniques

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

target = 'final_exam_score'
df = get_data(data_url)
# Every numeric column except the target gets its own scatter plot
features = df.drop(target, axis=1).select_dtypes(include=['number']).columns

# Set up a grid of subplots
num_features = len(features)
num_cols = 5  # Adjust this to your preference
num_rows = (num_features + num_cols - 1) // num_cols  # Ceiling division: rows needed to fit every feature

# Figure height grows with the number of rows (3 units per row)
plt.figure(figsize=(15, num_rows * 3))

# plt.subplot positions are 1-based, hence enumerate(..., 1)
for i, feature in enumerate(features, 1):
    plt.subplot(num_rows, num_cols, i)
    plt.scatter(df[feature], df[target], alpha=0.5)
    plt.title(f'{feature} vs. exam_score')
    plt.xlabel(feature)
    plt.ylabel(target)

plt.tight_layout()
plt.show()


## Importance Analysis

In [None]:
import statsmodels.api as sm

# Fit the OLS model. statsmodels does not add an intercept on its own, so a
# constant column is prepended; without it the fit is forced through the
# origin and the coefficients would not match the LinearRegression model.
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()

# Get a summary of the regression
print(model.summary())

In [None]:
# Extract the summary table as a DataFrame
summary_table = model.summary2().tables[1]  # tables[1] is the coefficients table in summary2()

# Sort by the t-statistic column, ascending (most negative first)
sorted_summary = summary_table.sort_values(by='t')


# Set display options to prevent truncation
pd.options.display.max_rows = None  # Shows all rows
pd.options.display.max_columns = None  # Shows all columns

sorted_summary