In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# 1. Load the dataset processed by Member 1
data = pd.read_csv('cleaned_student_data.csv')

# 2. Select the features we want to use for prediction
# We use the numerical columns created during cleaning
features = [
    '2. What is your average monthly income or allowance? ', 
    '4. What is your monthly rent / hostel fee? ',
    '5. How much do you spend on food per month? ',
    '6. What are your monthly transportation expenses? ',
    '7. How much do you spend on mobile recharge and internet per month? ',
    '8. What are your average education-related expenses per month? \n(Books, printing, subscriptions, etc.)',
    '9. How much do you spend monthly on eating out or food delivery? ',
    '10. How much do you spend monthly on shopping and entertainment?\n (Clothes, gadgets, movies, games, etc.) ',
    '13. How much do you save on average per month? '
]

X = data[features]
y = data['target']

# 3. Check for Class Imbalance
print("Class Distribution:")
print(y.value_counts())
print("-" * 30)

# 4. Split data into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train Model 1: Random Forest
# We use class_weight='balanced' to handle the distribution of Yes/No answers
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

# 6. Train Model 2: XGBoost
# Modern XGBoost doesn't need the label encoder setting anymore
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# 7. Evaluate the Models
def evaluate(model, name):
    preds = model.predict(X_test)
    print(f"--- {name} Performance ---")
    print(f"Accuracy: {accuracy_score(y_test, preds):.2%}")
    print(classification_report(y_test, preds))
    print("\n")

evaluate(rf_model, "Random Forest")
evaluate(xgb_model, "XGBoost")

# 8. Feature Importance (Which habit matters most?)
importance = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)
print("Top factors influencing the budget:")
print(importance)

# 9. Save the best model for the UI (Member 4)
# We will save the Random Forest model as it is usually more stable on 171 rows
joblib.dump(rf_model, 'budget_predictor_model.pkl')
print("Model saved as budget_predictor_model.pkl")

Class Distribution:
target
1    113
0     58
Name: count, dtype: int64
------------------------------
--- Random Forest Performance ---
Accuracy: 74.29%
              precision    recall  f1-score   support

           0       0.43      0.38      0.40         8
           1       0.82      0.85      0.84        27

    accuracy                           0.74        35
   macro avg       0.62      0.61      0.62        35
weighted avg       0.73      0.74      0.74        35



--- XGBoost Performance ---
Accuracy: 54.29%
              precision    recall  f1-score   support

           0       0.21      0.38      0.27         8
           1       0.76      0.59      0.67        27

    accuracy                           0.54        35
   macro avg       0.49      0.48      0.47        35
weighted avg       0.64      0.54      0.58        35



Top factors influencing the budget:
13. How much do you save on average per month?                                                              