In [1]:
# importing all the dependencies
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.svm import SVC,SVR
from sklearn.linear_model import LogisticRegression,Lasso,Ridge,LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# from catboost import CatBoostRegressor
# from xgboost import XGBClassifier,XGBRegressor
import warnings


In [2]:
## Load the student performance dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a path relative to the project instead.
STUDENT_DATA_CSV = '/Users/arihantsingla/Documents/machine_learning_project/notebook/stud.csv'
df = pd.read_csv(STUDENT_DATA_CSV)

## Report the dimensions, then preview the first rows as the cell output
print(df.shape)
df.head()


(1000, 8)


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Dataset Information

The dataset contains student performance data with the following characteristics:

- Number of rows: 1000 students
- Number of columns: 8 (7 input features plus the `math_score` target)
- Columns include:
  - gender
  - race/ethnicity
  - parental level of education
  - lunch
  - test preparation course
  - math score (the prediction target)
  - reading score
  - writing score

The first few rows show a sample of student records with their corresponding scores and demographic information. This dataset will be used to analyze student performance and to predict each student's math score from the remaining columns using several regression models.


In [3]:
## Feature frame: every column except the math score
df1 = df.drop(columns=['math_score'])
print(df1.head())

# (message, column) pairs for the categorical columns whose distinct values are listed.
category_reports = [
    ("catergories in the gender column: ", 'gender'),
    ("catergories in the race/ethnicity column: ", 'race_ethnicity'),
    ("catergories in the lunch column: ", 'lunch'),
    ("catergories in the test preparation course column: ", 'test_preparation_course'),
]

# A blank separator is printed before each report, so none trails the last one.
for message, column in category_reports:
    print('\n')
    print(message, df1[column].unique())

   gender race_ethnicity parental_level_of_education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test_preparation_course  reading_score  writing_score  
0                    none             72             74  
1               completed             90             88  
2                    none             95             93  
3                    none             57             44  
4                    none             78             75  


catergories in the gender column:  ['female' 'male']


catergories in the race/ethnicity column:  ['group B' 'group C' 'group A' 'group D' 'group E']


catergories in the lunch column:  ['standard' 'free/reduced']


catergories i

In [4]:
# Target vector: the math score column, taken from the original frame
df2 = df.loc[:, 'math_score']


In [6]:
# Keep the untransformed features under their own name. Previously this cell read the
# column lists from df1 and then overwrote df1 with the transformed array, so running
# the cell a second time failed (the array has no select_dtypes).
feature_frame = df.drop(columns=['math_score'])

# Get numeric and categorical feature names from the untransformed feature frame
num_features = feature_frame.select_dtypes(exclude=['object']).columns
cat_features = feature_frame.select_dtypes(include=['object']).columns

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

## Transform features
# StandardScaler is already imported in the first cell; only these two names are new.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_feature_transformer = StandardScaler()
# OneHotEncoder is left at its defaults (no dense/sparse option is passed here);
# ColumnTransformer decides whether the stacked result is sparse or dense from its
# own sparse_threshold setting.
cat_feature_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_feature_transformer, num_features),
        ('cat', cat_feature_transformer, cat_features)
    ]
)

# NOTE(review): the preprocessor is fitted on every row, before the train/test split
# performed in a later cell, so the scaling statistics also see the held-out rows.
# Store the transformed feature matrix in df1, the name the following cells read.
df1 = preprocessor.fit_transform(feature_frame)

Numeric features: Index(['reading_score', 'writing_score'], dtype='object')
Categorical features: Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [10]:
# Dimensions of the transformed feature matrix returned by preprocessor.fit_transform
df1.shape

(1000, 19)

In [11]:
# train_test_split is already imported in the first cell.
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df1,
    df2,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the sizes of the resulting pieces.
for caption, piece in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
):
    print(caption, piece.shape)

X_train shape: (800, 19)
X_test shape: (200, 19)
y_train shape: (800,)


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(true, predicted):
    """Score a set of predictions against the true target values.

    Args:
        true: Ground-truth target values, passed straight to the sklearn metrics.
        predicted: Model predictions, in the same order as ``true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error (square root of the mean squared error), and R2 score.
    """
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return (
        mean_absolute_error(true, predicted),
        rmse,
        r2_score(true, predicted),
    )



In [13]:
# Candidate regressors, keyed by the display name used in the printed report.
# The tree-based models are seeded (same seed as the train/test split) so that their
# scores are repeatable; left unseeded, their results change on every run.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    # 'XGBRegressor': XGBRegressor(),
    # 'CatBoostRegressor': CatBoostRegressor(verbose=False)
}

model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 score of the model at the same position in model_list

# Fit each model on the training split and report its metrics on both splits.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Only the test-set R2 is kept for the ranking table built in the next cell.
    model_list.append(model_name)
    r2_list.append(model_test_r2)

    print(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    print('='*35)
    print('\n')
    
    

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.3231
- Mean Absolute Error: 4.2667
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3940
- Mean Absolute Error: 4.2148
- R2 Score: 0.8804


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.5197
- Mean Absolute Error: 5.1579
- R2 Score: 0.8253


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3233
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3904
- Mean Absolute Error: 4.2111
- R2 Score: 0.8806


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.2795
- Mean Absolute Error: 0.0187
- R2 Score: 0.9997
-------------------------------

# Model Performance Results

The following models were evaluated on both training and test sets:

- Linear Regression
- Lasso Regression 
- Ridge Regression
- Decision Tree
- Random Forest

Each model was assessed using:
- Root Mean Squared Error (RMSE)
- Mean Absolute Error (MAE) 
- R-squared (R2) Score

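The R2 scores on the test set determine which model performed best at predicting student math scores from the given features.

The detailed metrics printed above cover each model's performance on both the training and test sets. In the recorded run, the Decision Tree's training R2 (0.9997) is far above its test R2 (0.7183), which indicates overfitting, while Ridge and Linear Regression score highest on the test set (about 0.88).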


In [14]:
# Pair each model name with its test-set R2 score and rank the models, best first.
r2_ranking = pd.DataFrame(
    list(zip(model_list, r2_list)),
    columns=['Model Name', 'R2_Score'],
)
r2_ranking.sort_values(by="R2_Score", ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.880433
4,Random Forest,0.847509
1,Lasso,0.82532
3,Decision Tree,0.718314
