In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


from google.colab import files
uploaded = files.upload()


file_name = list(uploaded.keys())[0]
print(f"File '{file_name}' uploaded successfully.")

file_extension = file_name.split('.')[-1]
if file_extension == 'csv':
    df = pd.read_csv(file_name)
elif file_extension in ['xls', 'xlsx']:
    df = pd.read_excel(file_name)
else:
    raise ValueError("Unsupported file type. Please upload a CSV or Excel file.")

print("\nFirst 5 rows of your uploaded data:")
print(df.head())
print("\nData description:")
print(df.describe())



Saving StudentPerformanceFactors.csv to StudentPerformanceFactors (4).csv
File 'StudentPerformanceFactors (4).csv' uploaded successfully.

First 5 rows of your uploaded data:
   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium              Medium   

  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \
0                         No            7               73              Low   
1                         No            8               59              Low   
2                        Yes            7               91           Medium   
3                        Yes            8               98           M

In [31]:

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical columns identified: {categorical_cols}")


if target_column in categorical_cols:
    categorical_cols.remove(target_column)
    print(f"Removed target column '{target_column}' from categorical list.")


df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


all_feature_columns = [col for col in df_encoded.columns if col != target_column and pd.api.types.is_numeric_dtype(df_encoded[col])]


X = df_encoded[all_feature_columns].copy()
y = df_encoded[target_column].copy()

for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        X.loc[:, col] = pd.to_numeric(X[col], errors='coerce')
    else:
        print(f"Warning: Feature column '{col}' is not numeric after encoding. It will be ignored for training.")
        X = X.drop(columns=[col])

if pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(y, errors='coerce')
else:
    raise ValueError(f"Target column '{target_column}' is not numeric after processing. Please select a numeric target.")


df_processed_expanded = pd.concat([X, y], axis=1).dropna()

if df_processed_expanded.empty:
    raise ValueError("No valid numeric data found after processing and encoding. Please check your data and column selections.")

X = df_processed_expanded.drop(columns=[target_column])
y = df_processed_expanded[target_column]

print(f"\nUsing {X.shape[1]} features (including encoded) and '{target_column}' as target.")
print(f"Processed dataset size after encoding: {df_processed_expanded.shape[0]} rows.")
print("\nFirst 5 rows of the expanded feature set (X):")
display(X.head())



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size with expanded features: {X_train.shape[0]} samples")
print(f"Testing set size with expanded features: {X_test.shape[0]} samples")


Categorical columns identified: ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']

Using 27 features (including encoded) and 'Exam_Score' as target.
Processed dataset size after encoding: 6607 rows.

First 5 rows of the expanded feature set (X):


Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_Low,Access_to_Resources_Medium,...,Teacher_Quality_Medium,School_Type_Public,Peer_Influence_Neutral,Peer_Influence_Positive,Learning_Disabilities_Yes,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Moderate,Distance_from_Home_Near,Gender_Male
0,23,84,7,73,0,3,True,False,False,False,...,True,True,False,True,False,True,False,False,True,True
1,19,64,8,59,2,4,True,False,False,True,...,True,True,False,False,False,False,False,True,False,False
2,24,98,7,91,2,4,False,True,False,True,...,True,True,True,False,False,False,True,False,True,True
3,29,89,8,98,1,4,True,False,False,True,...,True,True,False,False,False,True,False,True,False,True
4,19,92,6,65,3,4,False,True,False,True,...,False,True,True,False,False,False,False,False,True,False



Training set size with expanded features: 5285 samples
Testing set size with expanded features: 1322 samples


In [9]:

print("\n--- Training and Evaluating Multiple Regression Models with Expanded Features ---")

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
}

results = {}
best_r2 = -float('inf')
best_model_name = None
best_model = None
best_y_pred = None

for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2
        }

        print(f"{name} Performance:")
        print(f"  Mean Absolute Error (MAE): {mae:.2f}")
        print(f"  Mean Squared Error (MSE): {mse:.2f}")
        print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")
        print(f"  R-squared (R2): {r2:.2f}")


        if r2 > best_r2:
            best_r2 = r2
            best_model_name = name
            best_model = model
            best_y_pred = y_pred
    except Exception as e:
        print(f"Error training {name}: {e}")
        continue

print(f"\n--- Model Comparison with Expanded Features ---")
results_df = pd.DataFrame(results).T
print(results_df.sort_values(by='R2', ascending=False))

if best_model_name:
    print(f"\nBest performing model based on R-squared: {best_model_name} (R2: {best_r2:.2f})")
else:
    print("\nNo models were successfully trained.")


--- Training and Evaluating Multiple Regression Models with Expanded Features ---

Training Linear Regression...
Linear Regression Performance:
  Mean Absolute Error (MAE): 0.45
  Mean Squared Error (MSE): 3.26
  Root Mean Squared Error (RMSE): 1.80
  R-squared (R2): 0.77

Training Decision Tree Regressor...
Decision Tree Regressor Performance:
  Mean Absolute Error (MAE): 1.88
  Mean Squared Error (MSE): 14.05
  Root Mean Squared Error (RMSE): 3.75
  R-squared (R2): 0.01

Training Random Forest Regressor...
Random Forest Regressor Performance:
  Mean Absolute Error (MAE): 1.18
  Mean Squared Error (MSE): 4.98
  Root Mean Squared Error (RMSE): 2.23
  R-squared (R2): 0.65

Training Gradient Boosting Regressor...
Gradient Boosting Regressor Performance:
  Mean Absolute Error (MAE): 0.82
  Mean Squared Error (MSE): 3.82
  Root Mean Squared Error (RMSE): 1.96
  R-squared (R2): 0.73

--- Model Comparison with Expanded Features ---
                                  MAE        MSE      RMSE 

In [10]:

best_model_name_prev = results_df.sort_values(by='R2', ascending=False).index[0]
best_r2_prev = results_df.loc[best_model_name_prev, 'R2']

print(f"\nBest model from previous step: {best_model_name_prev} (R2: {best_r2_prev:.2f})")


model_to_tune_name = 'Gradient Boosting Regressor'
model_to_tune = models[model_to_tune_name]

print(f"\nProceeding with hyperparameter tuning for: {model_to_tune_name}")


param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}


Best model from previous step: Linear Regression (R2: 0.77)

Proceeding with hyperparameter tuning for: Gradient Boosting Regressor


In [12]:

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical columns identified: {categorical_cols}")


if target_column in categorical_cols:
    categorical_cols.remove(target_column)
    print(f"Removed target column '{target_column}' from categorical list.")


df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


all_feature_columns = [col for col in df_encoded.columns if col != target_column and pd.api.types.is_numeric_dtype(df_encoded[col])]


X = df_encoded[all_feature_columns]
y = df_encoded[target_column]


for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        X.loc[:, col] = pd.to_numeric(X[col], errors='coerce')
    else:
        print(f"Warning: Feature column '{col}' is not numeric after encoding. It will be ignored for training.")
        X = X.drop(columns=[col])

if pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(y, errors='coerce')
else:
    raise ValueError(f"Target column '{target_column}' is not numeric after processing. Please select a numeric target.")


df_processed_expanded = pd.concat([X, y], axis=1).dropna()

if df_processed_expanded.empty:
    raise ValueError("No valid numeric data found after processing and encoding. Please check your data and column selections.")

X = df_processed_expanded.drop(columns=[target_column])
y = df_processed_expanded[target_column]

print(f"\nUsing {X.shape[1]} features (including encoded) and '{target_column}' as target.")
print(f"Processed dataset size after encoding: {df_processed_expanded.shape[0]} rows.")
print("\nFirst 5 rows of the expanded feature set (X):")
display(X.head())



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size with expanded features: {X_train.shape[0]} samples")
print(f"Testing set size with expanded features: {X_test.shape[0]} samples")


Categorical columns identified: ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']

Using 27 features (including encoded) and 'Exam_Score' as target.
Processed dataset size after encoding: 6607 rows.

First 5 rows of the expanded feature set (X):


Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_Low,Access_to_Resources_Medium,...,Teacher_Quality_Medium,School_Type_Public,Peer_Influence_Neutral,Peer_Influence_Positive,Learning_Disabilities_Yes,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Distance_from_Home_Moderate,Distance_from_Home_Near,Gender_Male
0,23,84,7,73,0,3,True,False,False,False,...,True,True,False,True,False,True,False,False,True,True
1,19,64,8,59,2,4,True,False,False,True,...,True,True,False,False,False,False,False,True,False,False
2,24,98,7,91,2,4,False,True,False,True,...,True,True,True,False,False,False,True,False,True,True
3,29,89,8,98,1,4,True,False,False,True,...,True,True,False,False,False,True,False,True,False,True
4,19,92,6,65,3,4,False,True,False,True,...,False,True,True,False,False,False,False,False,True,False



Training set size with expanded features: 5285 samples
Testing set size with expanded features: 1322 samples


In [13]:

print("\n--- Training and Evaluating Multiple Regression Models with Expanded Features ---")

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
}

results = {}
best_r2 = -float('inf')
best_model_name = None
best_model = None
best_y_pred = None

for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2
        }

        print(f"{name} Performance:")
        print(f"  Mean Absolute Error (MAE): {mae:.2f}")
        print(f"  Mean Squared Error (MSE): {mse:.2f}")
        print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")
        print(f"  R-squared (R2): {r2:.2f}")


        if r2 > best_r2:
            best_r2 = r2
            best_model_name = name
            best_model = model
            best_y_pred = y_pred
    except Exception as e:
        print(f"Error training {name}: {e}")
        continue

print(f"\n--- Model Comparison with Expanded Features ---")
results_df = pd.DataFrame(results).T
print(results_df.sort_values(by='R2', ascending=False))

if best_model_name:
    print(f"\nBest performing model based on R-squared: {best_model_name} (R2: {best_r2:.2f})")
else:
    print("\nNo models were successfully trained.")


--- Training and Evaluating Multiple Regression Models with Expanded Features ---

Training Linear Regression...
Linear Regression Performance:
  Mean Absolute Error (MAE): 0.45
  Mean Squared Error (MSE): 3.26
  Root Mean Squared Error (RMSE): 1.80
  R-squared (R2): 0.77

Training Decision Tree Regressor...
Decision Tree Regressor Performance:
  Mean Absolute Error (MAE): 1.88
  Mean Squared Error (MSE): 14.05
  Root Mean Squared Error (RMSE): 3.75
  R-squared (R2): 0.01

Training Random Forest Regressor...
Random Forest Regressor Performance:
  Mean Absolute Error (MAE): 1.18
  Mean Squared Error (MSE): 4.98
  Root Mean Squared Error (RMSE): 2.23
  R-squared (R2): 0.65

Training Gradient Boosting Regressor...
Gradient Boosting Regressor Performance:
  Mean Absolute Error (MAE): 0.82
  Mean Squared Error (MSE): 3.82
  Root Mean Squared Error (RMSE): 1.96
  R-squared (R2): 0.73

--- Model Comparison with Expanded Features ---
                                  MAE        MSE      RMSE 

In [14]:

best_model_name_prev = results_df.sort_values(by='R2', ascending=False).index[0]
best_r2_prev = results_df.loc[best_model_name_prev, 'R2']

print(f"\nBest model from previous step: {best_model_name_prev} (R2: {best_r2_prev:.2f})")


model_to_tune_name = 'Gradient Boosting Regressor'
model_to_tune = models[model_to_tune_name]

print(f"\nProceeding with hyperparameter tuning for: {model_to_tune_name}")


param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}


Best model from previous step: Linear Regression (R2: 0.77)

Proceeding with hyperparameter tuning for: Gradient Boosting Regressor


In [15]:
from sklearn.model_selection import GridSearchCV


grid_search = GridSearchCV(
    estimator=model_to_tune,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)


print("\nRunning GridSearchCV...")
grid_search.fit(X_train, y_train)

print("\nBest parameters found:")
print(grid_search.best_params_)
print("\nBest R2 score from GridSearchCV:")
print(grid_search.best_score_)


Running GridSearchCV...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits

Best parameters found:
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}

Best R2 score from GridSearchCV:
0.6984744729533732


In [16]:

best_gbr = grid_search.best_estimator_


y_pred_tuned = best_gbr.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"\nTuned Gradient Boosting Regressor Performance:")
print(f"  Mean Absolute Error (MAE): {mae_tuned:.2f}")
print(f"  Mean Squared Error (MSE): {mse_tuned:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_tuned:.2f}")
print(f"  R-squared (R2): {r2_tuned:.2f}")

print(f"\nPrevious best model ({best_model_name_prev}) R2 score: {best_r2_prev:.2f}")
print(f"Tuned Gradient Boosting Regressor R2 score: {r2_tuned:.2f}")

if r2_tuned > best_r2_prev:
    print("\nThe tuned Gradient Boosting Regressor performs better than the previous best model.")
else:
    print("\nThe tuned Gradient Boosting Regressor does not perform better than the previous best model.")

if r2_tuned > 0.8:
    print("\nCongratulations! We have achieved an R2 score above 80%.")
else:
    print("\nWe are very close to our goal! The R2 score is now significantly improved.")


Tuned Gradient Boosting Regressor Performance:
  Mean Absolute Error (MAE): 0.68
  Mean Squared Error (MSE): 3.58
  Root Mean Squared Error (RMSE): 1.89
  R-squared (R2): 0.75

Previous best model (Linear Regression) R2 score: 0.77
Tuned Gradient Boosting Regressor R2 score: 0.75

The tuned Gradient Boosting Regressor does not perform better than the previous best model.

We are very close to our goal! The R2 score is now significantly improved.


In [17]:
from google.colab import files


files.download('linear_regression_model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import joblib

# We need to recreate the preprocessor that was used to train the model.
# This preprocessor will one-hot encode the categorical features and leave the numerical features as they are.

# Identify categorical and numerical columns from the original dataframe
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
if 'Exam_Score' in categorical_cols:
    categorical_cols.remove('Exam_Score') # Remove the target column

numerical_cols = ['Hours_Studied', 'Previous_Scores', 'Attendance']

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

# We need to split the original dataframe to get the training data
# with the original categorical columns
X_orig = df.drop(columns=['Exam_Score'])
y_orig = df['Exam_Score']
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42)


# Fit the preprocessor on the original training data
preprocessor.fit(X_train_orig)

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.joblib')

print("Preprocessor saved as 'preprocessor.joblib'")

# Download the preprocessor file
from google.colab import files
files.download('preprocessor.joblib')

Preprocessor saved as 'preprocessor.joblib'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
import joblib
# Load the preprocessor
preprocessor = joblib.load('preprocessor.joblib')

# Preprocess the training and testing data
X_train_processed = preprocessor.transform(X_train_orig)
X_test_processed = preprocessor.transform(X_test_orig)

# Create and train a new Linear Regression model on the full, preprocessed data
model_full = LinearRegression()
model_full.fit(X_train_processed, y_train_orig)

# Save the newly trained model
joblib.dump(model_full, 'linear_regression_model_full.joblib')

print("New model saved as 'linear_regression_model_full.joblib'")

# Download the new model file
from google.colab import files
files.download('linear_regression_model_full.joblib')

New model saved as 'linear_regression_model_full.joblib'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>