# Investment Decision Model
This notebook builds a **Random Forest Classifier** to predict investment decisions based on project data and explain results using **SHAP** values.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
import sklearn

# Report the versions of the core libraries used by this notebook so a run can
# be reproduced later. Each label names the library whose version is printed.
print(f"Current scikit-learn version: {sklearn.__version__}")
print(f"Current pandas version: {pd.__version__}")
print(f"Current numpy version: {np.__version__}")

Current scikit-learn version: 1.6.1
Current scikit-learn version: 2.2.2
Current scikit-learn version: 2.0.2


## 2. Load Dataset

In [None]:

# Location of the Excel workbook that holds the project records.
file_path = "/content/Investment_Dataset.xlsx"

# Read the workbook into a DataFrame, then preview its first rows.
df = pd.read_excel(io=file_path)
df.head()


Unnamed: 0,Project,Initial_Cost,Discount_Rate_%,Risk_Rating,Project_Type,Market_Condition,Duration_Years,Uneven_Cash_Flows,NPV,IRR_%,PI,Payback_Yrs,Decision
0,P1,-61776,9.34,High,Retail,Unstable,2,"[41103, 40679]",9842.01,20.97,1.159,1.51,accept
1,P2,-114615,15.84,Medium,Retail,Stable,3,"[59787, 58509, 64341]",21990.22,27.09,1.192,1.94,accept
2,P3,-42911,12.53,Medium,Infra,Stable,5,"[16719, 11953, 14899, 15330, 11763]",7920.53,20.07,1.185,2.96,accept
3,P4,-102395,10.39,Low,Tech,Stable,4,"[36935, 38356, 41275, 41377]",21086.02,19.42,1.206,2.66,accept
4,P5,-91200,12.34,Low,Retail,Stable,3,"[37694, 40643, 41855]",4079.91,14.89,1.045,2.31,accept


## 3. Feature Engineering — Parsing Uneven Cash Flows

In [None]:

def parse_cash_flow_stats(cf_str):
    """Summarise a bracketed, comma-separated cash-flow string.

    Parameters
    ----------
    cf_str : object
        Value such as ``"[41103, 40679]"``. It is converted with ``str()``,
        surrounding whitespace and square brackets are stripped, and every
        comma-separated piece is parsed as a float.

    Returns
    -------
    pd.Series
        Three entries: ``Total_Cash_Inflows`` (sum of the flows),
        ``Avg_Cash_Flow`` (mean) and ``CF_Volatility`` (``np.std`` with its
        default ``ddof=0``, i.e. the population standard deviation).
        All three are NaN when the value cannot be parsed into floats.
    """
    try:
        # Strip outer whitespace first so " [1, 2] " still loses its brackets.
        pieces = str(cf_str).strip().strip('[]').split(',')
        cash_flows = [float(piece.strip()) for piece in pieces]
    except (ValueError, TypeError):
        # Only genuine parsing failures are mapped to "missing". A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and
        # hide unrelated bugs behind NaN values.
        return pd.Series({
            'Total_Cash_Inflows': np.nan,
            'Avg_Cash_Flow': np.nan,
            'CF_Volatility': np.nan
        })

    return pd.Series({
        'Total_Cash_Inflows': sum(cash_flows),
        'Avg_Cash_Flow': np.mean(cash_flows),
        'CF_Volatility': np.std(cash_flows)
    })

# Compute the per-row cash-flow summary columns, attach them to the frame,
# and discard the raw string column they were derived from.
cash_flow_stats = df['Uneven_Cash_Flows'].apply(parse_cash_flow_stats)
df = df.join(cash_flow_stats).drop(columns=['Uneven_Cash_Flows'])
df.head()


Unnamed: 0,Project,Initial_Cost,Discount_Rate_%,Risk_Rating,Project_Type,Market_Condition,Duration_Years,NPV,IRR_%,PI,Payback_Yrs,Decision,Total_Cash_Inflows,Avg_Cash_Flow,CF_Volatility
0,P1,-61776,9.34,High,Retail,Unstable,2,9842.01,20.97,1.159,1.51,accept,81782.0,40891.0,212.0
1,P2,-114615,15.84,Medium,Retail,Stable,3,21990.22,27.09,1.192,1.94,accept,182637.0,60879.0,2502.985417
2,P3,-42911,12.53,Medium,Infra,Stable,5,7920.53,20.07,1.185,2.96,accept,70664.0,14132.8,1953.267048
3,P4,-102395,10.39,Low,Tech,Stable,4,21086.02,19.42,1.206,2.66,accept,157943.0,39485.75,1907.937548
4,P5,-91200,12.34,Low,Retail,Stable,3,4079.91,14.89,1.045,2.31,accept,120192.0,40064.0,1747.362012


## 4. Define Features (X) and Target (y)

In [None]:

# Predictor columns: project attributes plus the derived cash-flow statistics.
feature_columns = [
    'Initial_Cost',
    'Discount_Rate_%',
    'Risk_Rating',
    'Project_Type',
    'Market_Condition',
    'Duration_Years',
    'Total_Cash_Inflows',
    'Avg_Cash_Flow',
    'CF_Volatility',
]
X = df[feature_columns]

# Target column the classifier learns to predict.
y = df['Decision']


## 5. Preprocessing Pipelines

In [None]:

# Column groups: each group is routed to its own transformer below.
numerical_features = [
    'Initial_Cost',
    'Discount_Rate_%',
    'Duration_Years',
    'Total_Cash_Inflows',
    'Avg_Cash_Flow',
    'CF_Volatility',
]
categorical_features = ['Risk_Rating', 'Project_Type', 'Market_Condition']

# Numeric columns are standardised.
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories not seen during fitting
# are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# The transformer names ('num', 'cat') become the prefixes of the output
# feature names, and their order fixes the output column order.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


## 6. Train-Test Split

In [None]:

# Hold out 30% of the rows for evaluation. The split is stratified on the
# target and seeded so that it is repeatable.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## 7. Model Definition and Training

In [None]:

# Random forest settings: 300 trees, depth capped at 10, class weights
# balanced, fixed seed.
classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)

# Full pipeline: preprocessing followed by the classifier, so raw feature
# frames can be passed straight to fit/predict.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

print("Starting model training...")
model.fit(X_train, y_train)
print("Model training complete.")


Starting model training...
Model training complete.


## 8. Evaluation Metrics

In [None]:

# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Headline accuracy framed by a 40-character rule.
banner = "=" * 40
print("\n" + banner)
print(f"FINAL MODEL ACCURACY: {accuracy:.3f}")
print(banner)

# Per-class precision/recall/F1, then the raw confusion counts.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(matrix)



FINAL MODEL ACCURACY: 0.876

Classification Report:
              precision    recall  f1-score   support

      accept       0.86      0.88      0.87       153
      reject       0.89      0.88      0.88       170

    accuracy                           0.88       323
   macro avg       0.88      0.88      0.88       323
weighted avg       0.88      0.88      0.88       323


Confusion Matrix:
[[134  19]
 [ 21 149]]


## 9. Feature Importance (Mean Decrease in Impurity)

In [None]:

# Pull the two fitted pipeline stages out by name.
fitted_preprocessor = model.named_steps['preprocessor']
fitted_classifier = model.named_steps['classifier']

# Names of the transformed columns, aligned with the forest's importances.
feature_names = fitted_preprocessor.get_feature_names_out()
importances = fitted_classifier.feature_importances_

# Rank features by importance and show the ten highest.
feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
ranked = feat_imp.sort_values(by='Importance', ascending=False)
top_10 = ranked.iloc[:10]
print(top_10)

                           Feature  Importance
3          num__Total_Cash_Inflows    0.281252
0                num__Initial_Cost    0.242008
4               num__Avg_Cash_Flow    0.152028
5               num__CF_Volatility    0.123206
1             num__Discount_Rate_%    0.073531
2              num__Duration_Years    0.038176
10    cat__Project_Type_Healthcare    0.013017
8          cat__Risk_Rating_Medium    0.009850
14    cat__Market_Condition_Stable    0.008617
15  cat__Market_Condition_Unstable    0.008189


In [None]:
import joblib

# Name of the file the trained pipeline is written to.
filename = 'investment_decision_model.pkl'

# Persist the whole Pipeline (preprocessing + classifier) in one artifact.
# The file is opened in a `with` block so the handle is flushed and closed
# even if dumping fails; an inline `open(...)` would leave it open.
try:
    with open(filename, 'wb') as model_file:
        joblib.dump(model, model_file)
    print("="*50)
    print(f"✅ Successfully saved the model pipeline to: {filename}")
    print("="*50)
except NameError:
    # Raised when this cell runs before the training cell has defined `model`.
    print("Error: The 'model' variable (the trained Pipeline) is not defined. Please ensure the training code was run first.")
except Exception as e:
    # Best-effort save: report the problem instead of aborting the notebook.
    print(f"An error occurred while saving the model: {e}")

✅ Successfully saved the model pipeline to: investment_decision_model.pkl
