In [177]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
# NOTE(review): this silences every warning for the rest of the notebook, so
# pandas/scikit-learn deprecation and chained-assignment warnings will not be
# shown — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [178]:
# Location of the raw student CSV. Set the STUDENT_DATA_CSV environment variable
# to run the notebook on another machine; when it is unset the original author's
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'STUDENT_DATA_CSV',
    r'D:\Centennial College Materials\FIFTH SEMESTER\COMP 258 - NEURAL NETWORKS\PROJECT\COMP258-Project\data\Student data.csv',
)

# Raw table exactly as stored on disk; cleaning happens in the following cells.
data = pd.read_csv(DATA_PATH)

In [179]:
# Skip the first 23 rows of the raw file and work on an independent copy, so the
# column renames and dtype conversions in later cells never write into a slice
# of `data` (which would otherwise raise chained-assignment warnings).
# NOTE(review): presumably rows 0-22 are descriptive, non-data rows — confirm against the CSV.
adjusted_data = data.iloc[23:].copy()

In [180]:
# Readable labels for the 15 columns of the student table. They are applied
# positionally, so the order here must match the column order of the data.
column_names = [
    '1st Term GPA',
    '2nd Term GPA',
    'First Language',
    'Funding',
    'School',
    'Fast Track',
    'Coop',
    'Residency',
    'Gender',
    'Prev Education',
    'Age Group',
    'High School Average Mark',
    'Math Score',
    'English Grade',
    'FirstYearPersistence',
]

In [181]:
# Relabel the columns with the readable names; labels are assigned by position.
adjusted_data.columns = column_names

In [182]:
# Preview the first rows to check that the new labels line up with the values.
adjusted_data.head()

Unnamed: 0,1st Term GPA,2nd Term GPA,First Language,Funding,School,Fast Track,Coop,Residency,Gender,Prev Education,Age Group,High School Average Mark,Math Score,English Grade,FirstYearPersistence
23,0.0,0.0,1,2.0,6.0,2.0,1.0,1.0,2.0,1,1,59,16,7,1.0
24,2.5,2.0,3,4.0,6.0,1.0,2.0,2.0,2.0,1,3,?,?,7,1.0
25,4.25,3.923077,1,1.0,6.0,2.0,1.0,1.0,1.0,2,3,92,41,9,1.0
26,3.020833,2.321429,3,4.0,6.0,1.0,2.0,2.0,2.0,2,3,?,?,8,1.0
27,4.275,4.326923,1,2.0,6.0,1.0,1.0,1.0,1.0,2,3,97,?,9,1.0


In [183]:
# Give every column an appropriate dtype.

# Continuous measurements, stored as floats.
numeric_columns = ['1st Term GPA', '2nd Term GPA', 'High School Average Mark', 'Math Score']

# Coded attributes, stored as pandas categoricals.
cat_columns = ['First Language', 'Funding', 'School', 'Fast Track', 'Coop', 'Residency', 'Gender', 'Prev Education', 'Age Group', 'English Grade', 'FirstYearPersistence']

# Anything that cannot be parsed as a number (such as the '?' placeholder seen in
# the preview) is coerced to NaN.
adjusted_data[numeric_columns] = adjusted_data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Map the '?' placeholder to NaN *before* casting: otherwise '?' would become a
# real category level, and cleaning it up afterwards depends on
# DataFrame.replace acting on categorical columns, which pandas has deprecated.
adjusted_data[cat_columns] = adjusted_data[cat_columns].replace('?', np.nan).astype('category')

In [184]:
# Turn any remaining '?' placeholders into NaN. The result is re-bound instead of
# using inplace=True, so the cell stays idempotent and never mutates a frame
# that may be a slice of the raw `data`.
adjusted_data = adjusted_data.replace('?', np.nan)

In [185]:
# Keep only the numeric measurements; the categorical columns are left behind.
new_data = adjusted_data.loc[:, numeric_columns]

In [186]:
# Target: second-term GPA. Features: every other numeric column.
y = new_data['2nd Term GPA']
X = new_data.drop(columns=['2nd Term GPA'])

In [187]:
# Feature groups, keyed by how their missing values are filled.
# Missing entries replaced by the column mean:
mean_cols = [
    'High School Average Mark',
    'Math Score',
]
# Missing entries replaced by the constant 0:
fill_with_0_cols = [
    '1st Term GPA',
]

In [188]:
# Stand-alone estimator objects, kept under their original names because other
# cells refer to them directly (fill_with_0_imputer is reused on the target).
mean_imputer = SimpleImputer(strategy='mean')
fill_with_0_imputer = SimpleImputer(strategy='constant', fill_value=0.0)
scaler = StandardScaler()

# Mean-impute, then standardise.
mean_pipeline = Pipeline([
    ('mean_imputer', mean_imputer),
    ('scaler', scaler)
])

# Zero-impute, then standardise. This pipeline gets its own imputer and scaler
# instances: sharing one estimator object between pipelines only works while
# ColumnTransformer clones its transformers, and the shared imputer would also
# be refit when fill_with_0_imputer is later applied to the target.
fill_with_0_pipeline = Pipeline([
    ('fill_with_0_imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scaler', StandardScaler())
])

# Route each column group to its pipeline. The transformed output lists the
# mean_cols columns first, followed by the fill_with_0_cols columns; columns
# not named here are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('mean_pipeline', mean_pipeline, mean_cols),
    ('fill_with_0_pipeline', fill_with_0_pipeline, fill_with_0_cols)
])

# Single-step wrapper; this is the object fitted and saved later in the notebook.
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [189]:
# First cut: 70 % for training, 30 % held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Second cut: the held-out 30 % is halved into validation and test sets.
# X_test / y_test are deliberately re-bound to the final test half.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    random_state=42,
    test_size=0.5,
)

In [190]:
# Learn the imputation and scaling statistics from the training split only,
# then apply them to that same split.
X_train_prepared = pipeline.fit(X_train).transform(X_train)

In [191]:
# Replace missing training targets with 0 and keep the (n_samples, 1) column
# shape the original imputer call produced. Plain fillna is used so the imputer
# object registered in the feature pipeline is not refit on the target.
# NOTE(review): rows without a 2nd-term GPA are trained as GPA 0, while the
# validation/test targets keep their NaNs — confirm this is intended.
y_train_imputed = y_train.fillna(0.0).to_numpy().reshape(-1, 1)

In [192]:
# The preprocessor emits its columns in transformer order (mean_cols first, then
# fill_with_0_cols), which differs from the column order of X_train. Labelling
# the array with X_train.columns would attach the wrong name to every feature,
# so the labels are built from the same lists the ColumnTransformer was given.
# Only the header changes; the values and their positions stay as before.
prepared_feature_names = list(mean_cols) + list(fill_with_0_cols)
train_data = pd.DataFrame(X_train_prepared, columns=prepared_feature_names)

# Validation and test features are kept untransformed; copies are taken so that
# adding the target column later cannot touch X_val / X_test.
val_data = X_val.copy()
test_data = X_test.copy()

In [193]:
# Attach the target column to each split: the zero-imputed target for training,
# the untouched targets for validation and test.
for frame, target in (
    (train_data, y_train_imputed),
    (val_data, y_val),
    (test_data, y_test),
):
    frame['2nd Term GPA'] = target

In [194]:
# Write the three splits to CSV in the working directory, without the index column.
for csv_name, split_frame in (
    ('train_data_m2.csv', train_data),
    ('val_data_m2_no_transform.csv', val_data),
    ('test_data_m2_no_transform.csv', test_data),
):
    split_frame.to_csv(csv_name, index=False)

In [196]:
# Persist the fitted preprocessing pipeline so the untransformed validation/test
# files can be transformed with the exact training-set statistics later.
# (joblib is imported in the notebook's top import cell.)
joblib.dump(pipeline, 'final_pipeline_2.pkl')

['final_pipeline_2.pkl']