In [5]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load and preprocess dataset
df = pd.read_csv("investment_dataset.csv")
df['Diversification'] = df['Diversification'].apply(ast.literal_eval)
div_df = pd.json_normalize(df['Diversification'])
df = pd.concat([df.drop('Diversification', axis=1), div_df], axis=1)

# Label encode 'Risk'
le = LabelEncoder()
df['Risk'] = le.fit_transform(df['Risk'])  # Save label mapping
risk_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Features and targets
X = df[['Age', 'Salary_lpa', 'SIP', 'Risk']]
y = df[['Equity', 'Bonds', 'FD', 'Real Estate']]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on test set
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"🔍 Test Accuracy (R2 Score): {r2:.4f}")
print(f"📉 Mean Squared Error: {mse:.4f}")

# Save model and encoder
joblib.dump(model, "portfolio_model.pkl")
joblib.dump(le, "risk_encoder.pkl")

# --- New User Input Prediction Section ---
print("\n✅ New User Input Prediction")

# Simulate external new user data (can be dynamic or from form)
new_user = {
    'Age': 45,
    'Salary_lpa': 4.5,
    'SIP': 800,
    'Risk': 'Medium'  # Text input
}

# Preprocess user input
risk_encoded = risk_mapping[new_user['Risk']]
user_features = np.array([[new_user['Age'], new_user['Salary_lpa'], new_user['SIP'], risk_encoded]])

# Predict using trained model
model = joblib.load("portfolio_model.pkl")
predicted_div = model.predict(user_features)[0]

# Output recommendation
print("📊 Predicted Diversification:")
print(f"   Equity:       {predicted_div[0]:.2f}")
print(f"   Bonds:        {predicted_div[1]:.2f}")
print(f"   FD:           {predicted_div[2]:.2f}")
print(f"   Real Estate:  {predicted_div[3]:.2f}")


🔍 Test Accuracy (R2 Score): -16.5300
📉 Mean Squared Error: 0.0000

✅ New User Input Prediction
📊 Predicted Diversification:
   Equity:       0.40
   Bonds:        0.25
   FD:           0.15
   Real Estate:  0.10


