In [6]:
import pandas as pd

In [5]:
# Load the dataset to explore its structure and contents
mat_data_path = "student/student-mat.csv"
student_mat_data = pd.read_csv(mat_data_path, delimiter=';')

por_data_path = "student/student-por.csv"
student_por_data = pd.read_csv(por_data_path, delimiter=';')

combined_data = pd.concat([student_mat_data, student_por_data], ignore_index=True)

combined_data.head(), combined_data.shape

NameError: name 'pd' is not defined

## Prediction on G3(Final Year Grade)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Selecting features and target for the model
features = ['sex', 'age', 'famsize', 'Medu', 'Fedu', 'Mjob', 'Fjob']
target = 'G3'

# Preparing the data
X = student_data[features]
y = student_data[target]

# One-hot encoding categorical variables
categorical_features = ['sex', 'famsize', 'Mjob', 'Fjob']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Setting up the column transformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough')

# Creating a pipeline that first transforms data then fits a Random Forest model
rf_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))
])

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting the Random Forest model
rf_model_pipeline.fit(X_train, y_train)

# Predicting test set results with the Random Forest model
y_rf_pred = rf_model_pipeline.predict(X_test)

# Evaluating the Random Forest model
rf_mse = mean_squared_error(y_test, y_rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_rf_pred)

# Output the performance metrics
print("Root Mean Squared Error: ", rf_rmse)
print("R-squared: ", rf_r2)


Root Mean Squared Error:  5.607307011948271
R-squared:  -0.13965947376433285


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge

# Define the models and parameters to be tested
model_params = {
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=0),
        'params': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_features': ['auto', 'sqrt', 'log2']
        }
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor(random_state=0),
        'params': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 7]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'regressor__alpha': [1, 10, 100, 1000]
        }
    }
}

# List to store results
results = []

# Loop through each model/parameters combination and perform Grid Search
for model_name, mp in model_params.items():
    # Setup the pipeline
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', mp['model'])
    ])
    clf = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    results.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
results_df

15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/microsoft_hackthon/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/microsoft_hackthon/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/microsoft_hackthon/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit

Unnamed: 0,model,best_score,best_params
0,RandomForestRegressor,-0.149523,"{'regressor__max_features': 'sqrt', 'regressor..."
1,GradientBoostingRegressor,0.016423,"{'regressor__learning_rate': 0.01, 'regressor_..."
2,Ridge,0.05324,{'regressor__alpha': 100}


## New Dataset

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [8]:
score_data_path = "student_score.csv"
df = pd.read_csv(score_data_path)

# Display the first few rows of the dataset and the column names to understand its structure
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
0,0,female,,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74
1,1,female,group C,some college,standard,,married,sometimes,yes,0.0,,5 - 10,69,90,88
2,2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
3,3,male,group A,associate's degree,free/reduced,none,married,never,no,1.0,,5 - 10,45,56,42
4,4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75


In [9]:
# Adding an average score column
df['AverageScore'] = df[['MathScore', 'ReadingScore', 'WritingScore']].mean(axis=1)

# Preparing the data with OneHotEncoder for categorical variables
categorical_features = ['Gender', 'EthnicGroup', 'ParentEduc', 'LunchType', 'TestPrep', 'ParentMaritalStatus', 'PracticeSport', 'IsFirstChild', 'WklyStudyHours']
numeric_features = ['MathScore', 'ReadingScore', 'WritingScore']  # dropped for a real prediction model, included here for data prep demonstration

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

# Splitting data into training and testing sets
X = df[categorical_features + numeric_features]
y = df['AverageScore']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating a pipeline that includes preprocessing and a regression model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fitting the model
model.fit(X_train, y_train)

# Predicting test set results
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R-squared:", r2)

RMSE: 2.1829618018382412e-14
R-squared: 1.0


In [10]:
# Model configurations for GridSearch, updated to remove 'normalize' and add scaling
models = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'regressor__fit_intercept': [True, False]
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(),
        'params': {
            'regressor__n_estimators': [10, 50, 100],
            'regressor__max_features': ['auto', 'sqrt', 'log2']
        }
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor(),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.01, 0.1],
            'regressor__max_depth': [3, 5]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'regressor__alpha': [1, 10, 100]
        }
    }
}

# Adjusting the pipeline to include StandardScaler for Linear Regression
for name, spec in models.items():
    if name == 'LinearRegression':
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('scaler', StandardScaler()),  # Adding a scaler here
            ('regressor', spec['model'])
        ])
    else:
        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', spec['model'])
        ])

    clf = GridSearchCV(pipe, spec['params'], cv=5, scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)
    results.append({
        'model': name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Convert results to DataFrame for easy viewing
results_df = pd.DataFrame(results)
print(results_df)

NameError: name 'results' is not defined

In [None]:
import joblib

best_model = max(results, key=lambda x: x['best_score'])

# Now let's find the actual model object from the clf that corresponds to the best model
if best_model['model'] == 'LinearRegression':
    best_estimator = clf.best_estimator_  # clf should be your last GridSearchCV object for LinearRegression
else:
    # Loop through clf objects to find the matching model 
    best_params = best_model['best_params']
    final_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()) if best_model['model'] == 'LinearRegression' else None,
        ('regressor', models[best_model['model']]['model'])
    ])
    final_clf = GridSearchCV(final_pipe, {f'regressor__{k}': [v] for k, v in best_params.items()}, cv=5, scoring='neg_mean_squared_error')
    final_clf.fit(X_train, y_train)
    best_estimator = final_clf.best_estimator_

# Save the best model
joblib.dump(best_estimator, 'best_predict_model.pkl')
print("Best model saved as 'best_predict_model.pkl'")

Best model saved as 'best_predict_model.pkl'


## Try NN

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [12]:

# Define the preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features)  # Scaling numeric features
    ])

# Splitting the data into training and testing sets
X = df[categorical_features + numeric_features]
y = df['AverageScore']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the preprocessing pipeline on the training data and transform training and testing data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Neural network architecture
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # Adjusted to match the input feature size
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the model
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test, verbose=0)
print(f'Test MSE: {test_loss}')

# Model summary
model.summary()

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 462us/step - loss: 1199.5342 - val_loss: 0.4382
Epoch 2/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 384us/step - loss: 0.2642 - val_loss: 0.1991
Epoch 3/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392us/step - loss: 0.1164 - val_loss: 0.1166
Epoch 4/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step - loss: 0.0661 - val_loss: 0.0820
Epoch 5/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - loss: 0.0451 - val_loss: 0.0590
Epoch 6/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418us/step - loss: 0.0341 - val_loss: 0.0387
Epoch 7/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406us/step - loss: 0.0247 - val_loss: 0.0286
Epoch 8/50
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - loss: 0.0186 - val_loss: 0.0221
Epoch 9/50
[1m613/613[0m [32m