In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [33]:
# Show which scikit-learn release this kernel is running.
import sklearn

sklearn_version = sklearn.__version__
print("🔍 scikit-learn version:", sklearn_version)

🔍 scikit-learn version: 1.7.0


In [34]:
# Start from a clean slate: remove a previously saved model file, if one is present.
model_path = 'model/model.pkl'
stale_model_found = os.path.exists(model_path)
if stale_model_found:
    os.remove(model_path)
    print("🗑️ Old model.pkl deleted.")

🗑️ Old model.pkl deleted.


In [35]:
# Read the dataset from the Excel workbook.
df = pd.read_excel('data/undergraduate_data.xlsx')

# The target column is what the model predicts; it and the 'ID' column are
# excluded from the feature matrix.
target = 'TotalCost'
non_feature_columns = [target, 'ID']
y = df[target]
X = df.drop(columns=non_feature_columns)

In [36]:
# Partition the feature columns: the six names below are treated as categorical,
# and every other column of X (in its original order) is treated as numeric.
categorical = [
    'Gender',
    'Year',
    'Lifestyle',
    'AccommodationType',
    'FinanceSources',
    'EarnIncome',
]
numeric = X.columns[~X.columns.isin(categorical)].tolist()

In [37]:
# Step 4: Preprocessing
# Categorical columns are one-hot encoded (handle_unknown='ignore' lets transform
# accept categories not seen during fit); numeric columns are standardized.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
numeric_step = ('num', StandardScaler(), numeric)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [38]:
# Chain the preprocessing step and a linear regressor into a single estimator,
# so fit/predict apply both in order.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [39]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# Fit preprocessing and regression together on the training portion.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [40]:
# Score the fitted estimator on the held-out split.
# The fitted pipeline is bound to `model`; this notebook never defines a
# `pipeline` variable, so evaluating through that name breaks on a fresh kernel
# (or silently scores a stale object left in kernel memory).
test_predictions = model.predict(X_test)  # predict once, reuse for every metric
r2 = r2_score(y_test, test_predictions)
mse = mean_squared_error(y_test, test_predictions)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units

print(f"✅ Model trained. R²: {r2:.3f}, RMSE: {rmse:.2f}")

✅ Model trained. R²: 0.954, RMSE: 45382.84


In [41]:
# Persist the fitted pipeline to model_path.
# joblib.dump does not create missing parent folders, so make sure the target
# directory exists first (a bare filename has no directory part to create).
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)
# Report the scikit-learn version actually in use rather than a hard-coded one,
# so the message stays truthful after an upgrade.
print(f"💾 Model saved cleanly with scikit-learn {sklearn.__version__}")

💾 Model saved cleanly with scikit-learn 1.7.0


In [42]:
# Predict the target for the held-out test features with the fitted `model` pipeline.
y_pred = model.predict(X_test)

In [43]:
# Detailed evaluation on the held-out split.
# Predictions come from `model`, the pipeline fitted in this notebook; there is
# no `pipeline` variable defined here, so using that name fails on a fresh kernel.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)            # average absolute error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))   # penalizes large errors more
r2 = r2_score(y_test, y_pred)                        # coefficient of determination

print(f"🔍 R² Score (Accuracy): {r2:.4f}")
print(f"📉 Mean Absolute Error (MAE): ₦{mae:,.2f}")
print(f"📊 Root Mean Squared Error (RMSE): ₦{rmse:,.2f}")

🔍 R² Score (Accuracy): 0.9537
📉 Mean Absolute Error (MAE): ₦35,485.12
📊 Root Mean Squared Error (RMSE): ₦45,382.84


In [44]:
# A single hand-written student record, wrapped in a one-row DataFrame so it
# can be passed to the model's predict().
new_student_record = {
    'Age': 21,
    'Gender': 'Male',
    'Year': 'Staylite',
    'Lifestyle': 'Moderate',
    'AccommodationType': 'Off-Campus',
    # Expense fields
    'Rent': 250000,
    'Feeding': 180000,
    'TransportCost': 80000,
    'Textbooks': 35000,
    'Insurance': 18000,
    'Medical': 20000,
    'SubCost': 10000,
    'Social': 40000,
    'OtherExpenses': 25000,
    # Funding fields
    'FinanceSources': 'Parents + Aid',
    'EarnIncome': 'Yes',
    'YearlyIncome': 120000,
    'AidAmount': 100000,
}
new_student = pd.DataFrame([new_student_record])

In [47]:
# new_student holds one row, so predict() yields one value; [0] unwraps it.
predicted_cost = model.predict(new_student)[0]

In [48]:
# Display the forecast with a ₦ prefix, thousands separators and two decimals.
print(f"💰 Forecasted Total Yearly Cost for this student: ₦{predicted_cost:,.2f}")

💰 Forecasted Total Yearly Cost for this student: ₦885,767.60


In [49]:
# User-provided category data
user_input = {
    'Gender': 'Female',
    'Year': 'Fresher',
    'Lifestyle': 'Moderate',
    'AccommodationType': 'Campus Hostel',
    'FinanceSources': 'Parents + Aid',
    'EarnIncome': 'No'
}

In [50]:
# Filter rows that match the categorical fields
# Keep only the rows whose categorical fields all equal the user's answers.
row_mask = pd.Series(True, index=df.index, dtype=bool)
for field in ('Gender', 'Year', 'Lifestyle', 'AccommodationType',
              'FinanceSources', 'EarnIncome'):
    row_mask &= df[field] == user_input[field]
matches = df[row_mask]

# Numeric fields the user did not supply; they are filled in with averages.
numeric_cols = [
    'Age', 'Rent', 'Feeding', 'TransportCost', 'Textbooks', 'Insurance',
    'Medical', 'SubCost', 'Social', 'OtherExpenses', 'YearlyIncome', 'AidAmount'
]

# Average over the matching rows, or over the whole dataset when nothing matches.
reference_rows = df if matches.empty else matches
averages = reference_rows[numeric_cols].mean().to_dict()

# Combine the averaged numeric fields with the user's categorical answers
# into a single-row frame for prediction.
full_input = dict(averages)
full_input.update(user_input)

X_new = pd.DataFrame([full_input])


In [51]:

# X_new holds a single row, so predict() yields one value; [0] unwraps it.
predicted_cost = model.predict(X_new)[0]

# User-facing message: spelling corrected from "Forcasted" to "Forecasted".
print(f" Forecasted Cost is: ₦{predicted_cost:,.2f}")

 Forcasted Cost is: ₦622,106.36
