In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("StudentsPerformance.csv")

In [3]:
df.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [5]:
df.sample(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
593,female,group E,high school,standard,none,74,76,73
431,female,group C,high school,standard,none,61,72,70
908,female,group C,bachelor's degree,free/reduced,none,67,75,72
87,female,group D,associate's degree,standard,none,71,71,74
25,male,group A,master's degree,free/reduced,none,73,74,72
719,male,group E,associate's degree,free/reduced,completed,91,73,80
166,male,group C,high school,free/reduced,completed,53,51,51
737,female,group B,some college,free/reduced,completed,53,66,73
460,male,group C,bachelor's degree,free/reduced,none,53,58,55
537,female,group D,high school,standard,none,51,66,62


In [6]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [7]:
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [19]:


# OPTION 1: RECOMMENDED - Save the full CSV to a file and load it
# Save all the content you provided (including the header line) as 'StudentsPerformance.csv'
df = pd.read_csv('StudentsPerformance.csv')   # Place file in your working directory

# OPTION 2: If you must use string (ONLY if you paste the FULL 1001 lines)
# from io import StringIO
# csv_data = """gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
# female,group B,bachelor's degree,standard,none,72,72,74
# ... paste ALL 1000 rows here exactly ...
# female,group D,some college,free/reduced,none,77,86,86"""
# df = pd.read_csv(StringIO(csv_data))

# Quick sanity check
print("Dataset shape:", df.shape)  # Should be (1000, 8)
print("Any missing values?\n", df.isnull().sum())

# Target and features
X = df.drop('math score', axis=1)
y = df['math score']

categorical_features = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# Preprocessor - with handle_unknown='ignore' to fix the unknown categories warning
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Model
model = RandomForestRegressor(n_estimators=200, random_state=42)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split - 20% test gives 200 samples, plenty for reliable metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\nTest RMSE: {rmse:.4f}")
print(f"Test R²: {r2:.4f}")

# Feature importances
ohe_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = list(ohe_names) + numerical_features

importances = pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))

Dataset shape: (1000, 8)
Any missing values?
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Training samples: 800, Test samples: 200

Test RMSE: 6.0006
Test R²: 0.8520

Top 10 Most Important Features:
                                     Feature  Importance
12                             reading score    0.558329
13                             writing score    0.239531
0                                gender_male    0.120352
10                            lunch_standard    0.015644
11              test preparation course_none    0.011459
4                     race/ethnicity_group E    0.010635
8   parental level of education_some college    0.008013
2                     race/ethnicity_group C    0.006904
6    parental level of education_high school    0.00635

In [18]:


# Load your dataset
# If you saved it as a file:
df = pd.read_csv('StudentsPerformance.csv')

# If you're working in a notebook and the data is already loaded as df, skip the line above

# Quick check
print("Dataset shape:", df.shape)
print(df.info())

# Features and target
X = df.drop('math score', axis=1)
y = df['math score']

# Categorical and numerical columns
categorical_features = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# Preprocessor: One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)  # reading & writing scores pass through
    ])

# Gradient Boosting Regressor
# You can tune these hyperparameters later
gb_model = GradientBoostingRegressor(
    n_estimators=300,       # number of boosting stages
    learning_rate=0.05,     # smaller learning rate often better
    max_depth=4,            # controls tree complexity
    random_state=42,
    subsample=0.9,          # introduces stochasticity, reduces overfitting
    loss='squared_error'
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', gb_model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n=== Gradient Boosting Regressor Results ===")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²:   {r2:.4f}")

# Feature Importances
# Get feature names after one-hot encoding
ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
feature_names = ohe.get_feature_names_out(categorical_features).tolist() + numerical_features

importances = pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance_df.head(15))

Dataset shape: (1000, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
Training samples: 800, Test samples: 200

=== Gradient Boosting Regressor Results ===
Test RMSE: 5.6371
Test R²:   0.8694

Top 15 Most Important Features:
                                          Feature  Importance
12                         

In [17]:


# Install XGBoost if needed (run once)
# !pip install xgboost

import xgboost as xgb
from xgboost import XGBRegressor

# Load the dataset
df = pd.read_csv('StudentsPerformance.csv')  # Ensure the file is in your working directory

# Quick sanity check
print("Dataset shape:", df.shape)
print("Missing values:\n", df.isnull().sum())

# Features and target
X = df.drop('math score', axis=1)
y = df['math score']

# Categorical and numerical features
categorical_features = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# XGBoost model
xgb_model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror',
    n_jobs=-1
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Train
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n=== XGBoost Regressor Results ===")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²:   {r2:.4f}")

# Feature Importances (FIXED LINE HERE)
ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
feature_names = ohe.get_feature_names_out(categorical_features).tolist() + numerical_features

importances = pipeline.named_steps['model'].feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)  # <-- Fixed: ascending=False

print("\nTop 15 Most Important Features:")
print(feature_importance_df.head(15))

Dataset shape: (1000, 8)
Missing values:
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Training samples: 800, Test samples: 200

=== XGBoost Regressor Results ===
Test RMSE: 5.9107
Test R²:   0.8564

Top 15 Most Important Features:
                                          Feature  Importance
0                                     gender_male    0.270520
1                                   reading score    0.230517
2                                   writing score    0.163896
3                                  lunch_standard    0.066297
4                          race/ethnicity_group E    0.062504
5                    test preparation course_none    0.034158
6                          race/ethnicity_group C    0.025952
7        parental level of education_som

In [16]:


# Load the dataset
df = pd.read_csv('StudentsPerformance.csv')  # Make sure the file is in your working directory

# Quick check
print("Dataset shape:", df.shape)
print("Missing values:\n", df.isnull().sum())

# Features and target
X = df.drop('math score', axis=1)
y = df['math score']

# Categorical and numerical features
categorical_features = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# Preprocessor: One-hot encode categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Random Forest Regressor
rf_model = RandomForestRegressor(
    n_estimators=300,        # Number of trees (more = better, but slower)
    max_depth=None,          # Let trees grow fully
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1                # Use all CPU cores
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\n=== Random Forest Regressor Results ===")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²:   {r2:.4f}")

# Feature Importances
ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
feature_names = ohe.get_feature_names_out(categorical_features).tolist() + numerical_features

importances = pipeline.named_steps['model'].feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

print("\nTop 15 Most Important Features:")
print(feature_importance_df.head(15))

Dataset shape: (1000, 8)
Missing values:
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Training samples: 800, Test samples: 200

=== Random Forest Regressor Results ===
Test RMSE: 6.0266
Test R²:   0.8507

Top 15 Most Important Features:
                                          Feature  Importance
0                                   reading score    0.558510
1                                   writing score    0.238289
2                                     gender_male    0.121160
3                                  lunch_standard    0.015789
4                    test preparation course_none    0.011520
5                          race/ethnicity_group E    0.010542
6        parental level of education_some college    0.007887
7                          race/et

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor

# Load data
df = pd.read_csv('StudentsPerformance.csv')

# Features and target
X = df.drop('math score', axis=1)
y = df['math score']

# Categorical and numerical features
categorical_features = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
numerical_features = ['reading score', 'writing score']

# Preprocessor (same for all models)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Train-test split (same for all models)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store results
results = {}

# 1. Linear Regression (Baseline)
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)
results['Linear Regression'] = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_lr)),
    'R²': r2_score(y_test, y_pred_lr)
}

# 2. Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))
])
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
results['Random Forest'] = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    'R²': r2_score(y_test, y_pred_rf)
}

# 3. Gradient Boosting
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, 
                                        max_depth=4, random_state=42))
])
gb_pipeline.fit(X_train, y_train)
y_pred_gb = gb_pipeline.predict(X_test)
results['Gradient Boosting'] = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_gb)),
    'R²': r2_score(y_test, y_pred_gb)
}

# 4. XGBoost
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=5,
                           subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1))
])
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)
results['XGBoost'] = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_xgb)),
    'R²': r2_score(y_test, y_pred_xgb)
}

# Create comparison DataFrame
comparison_df = pd.DataFrame(results).T
comparison_df = comparison_df.round(4)

# Add difference columns (compared to Linear Regression)
comparison_df['RMSE Diff (vs Linear)'] = (comparison_df['RMSE'] - comparison_df.loc['Linear Regression', 'RMSE']).round(4)
comparison_df['R² Diff (vs Linear)'] = (comparison_df['R²'] - comparison_df.loc['Linear Regression', 'R²']).round(4)

# Reorder columns
comparison_df = comparison_df[['RMSE', 'R²', 'RMSE Diff (vs Linear)', 'R² Diff (vs Linear)']]

print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON (Predicting Math Score)")
print("="*60)
print(comparison_df)
print("="*60)

# Highlight the best model
best_r2 = comparison_df['R²'].idxmax()
best_rmse = comparison_df['RMSE'].idxmin()
print(f"Best R²:  {best_r2} ({comparison_df.loc[best_r2, 'R²']})")
print(f"Best RMSE: {best_rmse} ({comparison_df.loc[best_rmse, 'RMSE']})")


MODEL PERFORMANCE COMPARISON (Predicting Math Score)
                     RMSE      R²  RMSE Diff (vs Linear)  R² Diff (vs Linear)
Linear Regression  5.3940  0.8804                 0.0000               0.0000
Random Forest      6.0266  0.8507                 0.6326              -0.0297
Gradient Boosting  5.6626  0.8682                 0.2686              -0.0122
XGBoost            5.9107  0.8564                 0.5167              -0.0240
Best R²:  Linear Regression (0.8804)
Best RMSE: Linear Regression (5.394)
