# **Model 1**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# --- 1. Load Data ---
# Read the investment dataset workbook into a DataFrame (first sheet by default).
file_path = "/content/Investment_Dataset.xlsx"
df = pd.read_excel(io=file_path)

In [3]:
# --- 2. Feature Engineering & Data Reshaping (Wide to Long Format) ---

def parse_cash_flow_list(cf_str):
    """Parse a stringified cash-flow list into ``(cash_flow, year)`` tuples.

    Args:
        cf_str: Text such as ``"[100, 200.5, 300]"``. Any value is accepted;
            it is converted with ``str()`` before parsing.

    Returns:
        A list of ``(amount, year)`` tuples, where ``year`` is the 1-based
        position of the amount in the input. An empty list is returned when
        the text cannot be parsed or when any parsed value is not finite.
    """
    try:
        # Trim surrounding whitespace first so the brackets are actually at the edges.
        tokens = str(cf_str).strip().strip('[]').split(',')
        cash_flows = [float(token.strip()) for token in tokens]
    except (ValueError, TypeError):
        # Only parsing failures mean "no data"; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return []

    # A missing cell arrives as NaN, and float("nan") parses successfully.
    # Without this check it would become a bogus (nan, 1) cash flow.
    if not all(np.isfinite(cf) for cf in cash_flows):
        return []

    return [(cf, year) for year, cf in enumerate(cash_flows, start=1)]

# Parse every project's cash-flow string into a list of (cash_flow, year) tuples.
df['Cash_Flows_List'] = df['Uneven_Cash_Flows'].apply(parse_cash_flow_list)

# Explode the DataFrame: one row per annual cash flow.
# Only the project characteristics are kept for the regressor, not the derived financials.
df_regressor = (
    df[['Project', 'Risk_Rating', 'Project_Type', 'Market_Condition', 'Cash_Flows_List']]
    .explode('Cash_Flows_List')
    # A project whose cash flows could not be parsed yields an empty list, which
    # explode() turns into NaN. Drop those rows so they cannot break the tuple
    # split below or leak NaN into the regression target.
    .dropna(subset=['Cash_Flows_List'])
    # explode() repeats the source index; a fresh unique index makes the join below safe.
    .reset_index(drop=True)
)

# Separate the cash flow amount and the year into two new columns.
cash_flow_parts = pd.DataFrame(
    df_regressor['Cash_Flows_List'].tolist(),
    columns=['Cash_Flow_Amount', 'Year'],
    index=df_regressor.index,
)

# Drop the intermediate tuple column and the project identifier, then attach the split columns.
df_regressor = (
    df_regressor
    .drop(columns=['Cash_Flows_List', 'Project'])
    .join(cash_flow_parts)
)

In [4]:
# --- 3. Define Features (X_reg) and Target (y_reg) ---
# Inputs: the three categorical project descriptors plus the cash-flow year.
feature_columns_reg = ['Risk_Rating', 'Project_Type', 'Market_Condition', 'Year']
X_reg = df_regressor[feature_columns_reg]
# Target: the cash-flow amount recorded for that row.
y_reg = df_regressor['Cash_Flow_Amount']


In [5]:
# --- 4. Preprocessing Pipeline Setup (Regressor) ---
# Column groups routed to each transformer.
numerical_features_reg = ['Year']
categorical_features_reg = ['Risk_Rating', 'Project_Type', 'Market_Condition']

# The target (Cash_Flow_Amount) is left unscaled; only the 'Year' input is standardised.
# Columns not listed in either group are dropped.
year_scaler_reg = StandardScaler()
category_encoder_reg = OneHotEncoder(handle_unknown='ignore')
preprocessor_reg = ColumnTransformer(
    transformers=[
        ('num', year_scaler_reg, numerical_features_reg),
        ('cat', category_encoder_reg, categorical_features_reg),
    ],
    remainder='drop',
)

In [6]:
# --- 5. Train-Test Split ---
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
reg_split = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = reg_split


In [7]:
# --- 6. Model Definition and Training (Random Forest Regressor) ---
# 100-tree forest, seeded for reproducibility, using every available core.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Preprocessing and the estimator are chained so both are fitted (and later saved) together.
regressor_model = Pipeline(steps=[
    ('preprocessor', preprocessor_reg),
    ('regressor', forest_regressor),
])

print("Starting Cash Flow Regressor (Model 1) training...")
regressor_model.fit(X_reg_train, y_reg_train)
print("Training complete.")

Starting Cash Flow Regressor (Model 1) training...
Training complete.


In [11]:
# --- 7. Evaluation ---
# Score the held-out rows; RMSE is reported in the same units as the cash flows.
y_reg_pred = regressor_model.predict(X_reg_test)
mse = mean_squared_error(y_reg_test, y_reg_pred)
rmse = np.sqrt(mse)

# The mean of the true test values gives a scale against which to read the RMSE.
mean_test_cash_flow = y_reg_test.mean()

print("\nCash Flow Regressor Evaluation:")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print(f"Mean Cash Flow Amount (for context): ${mean_test_cash_flow:,.2f}")


Cash Flow Regressor Evaluation:
Root Mean Squared Error (RMSE): $13,930.27
Mean Cash Flow Amount (for context): $22,955.84


In [12]:
# --- 8. Save Model 1 ---
# Serialise the whole fitted pipeline (preprocessor + forest) with pickle.
model_filename_reg = 'cash_flow_regressor.pkl'
with open(model_filename_reg, mode='wb') as model_file:
    pickle.dump(regressor_model, model_file)

print(f"\n✅ Cash Flow Regressor (Model 1) saved as: {model_filename_reg}")


✅ Cash Flow Regressor (Model 1) saved as: cash_flow_regressor.pkl


# **Model 2**

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [17]:
# --- 1. Load Data (wide format) ---
# Re-read the workbook so Model 2 starts from the untouched, one-row-per-project table.
file_path = "/content/Investment_Dataset.xlsx"
df = pd.read_excel(io=file_path)

In [18]:
# --- 2. Feature Engineering (recreate stats from cash flow strings) ---
def parse_cash_flow_stats(cf_str):
    """Summarise a stringified cash-flow list as total, mean and volatility.

    Args:
        cf_str: Text such as ``"[100, 200.5, 300]"``. Any value is accepted;
            it is converted with ``str()`` before parsing.

    Returns:
        A ``pd.Series`` with keys ``'Total_Cash_Inflows'`` (sum),
        ``'Avg_Cash_Flow'`` (mean) and ``'CF_Volatility'`` (``np.std``, i.e.
        the population standard deviation). All three are NaN when the text
        cannot be parsed or when any parsed value is not finite.
    """
    missing = pd.Series({
        'Total_Cash_Inflows': np.nan,
        'Avg_Cash_Flow': np.nan,
        'CF_Volatility': np.nan
    })

    try:
        # Trim surrounding whitespace first so the brackets are actually at the edges.
        tokens = str(cf_str).strip().strip('[]').split(',')
        cash_flows = [float(token.strip()) for token in tokens]
    except (ValueError, TypeError):
        # Only parsing failures mean "no data"; anything else (for example
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return missing

    # "nan" / "inf" parse as floats but would yield inf or partially-NaN
    # statistics; report the whole row as missing instead.
    if not np.all(np.isfinite(cash_flows)):
        return missing

    return pd.Series({
        'Total_Cash_Inflows': sum(cash_flows),
        'Avg_Cash_Flow': np.mean(cash_flows),
        'CF_Volatility': np.std(cash_flows)
    })

# Compute the per-project statistics, attach them as new columns, then discard the raw string column.
cash_flow_stats = df['Uneven_Cash_Flows'].apply(parse_cash_flow_stats)
df = df.join(cash_flow_stats).drop(columns='Uneven_Cash_Flows')

In [19]:
# --- 3. Define Features (X_cls) and Target (y_cls) ---
# Inputs: project attributes from the dataset plus the three engineered cash-flow statistics.
feature_columns_cls = [
    'Initial_Cost', 'Discount_Rate_%', 'Risk_Rating', 'Project_Type',
    'Market_Condition', 'Duration_Years',
    'Total_Cash_Inflows', 'Avg_Cash_Flow', 'CF_Volatility',
]
X_cls = df[feature_columns_cls]
# Target: the label stored in the 'Decision' column.
y_cls = df['Decision']

In [20]:
# --- 4. Preprocessing Pipelines Setup (Classifier) ---
# Column groups routed to each transformer.
numerical_features_cls = [
    'Initial_Cost', 'Discount_Rate_%', 'Duration_Years',
    'Total_Cash_Inflows', 'Avg_Cash_Flow', 'CF_Volatility',
]
categorical_features_cls = ['Risk_Rating', 'Project_Type', 'Market_Condition']

# Numeric columns are standardised, categorical columns are one-hot encoded,
# and any column not listed in either group is dropped.
numeric_scaler_cls = StandardScaler()
category_encoder_cls = OneHotEncoder(handle_unknown='ignore')
preprocessor_cls = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler_cls, numerical_features_cls),
        ('cat', category_encoder_cls, categorical_features_cls),
    ],
    remainder='drop',
)

In [21]:
# --- 5. Train-Test Split (just for validation metrics) ---
# 30% hold-out, seeded for reproducibility and stratified on the target labels.
cls_split = train_test_split(
    X_cls, y_cls, test_size=0.3, random_state=42, stratify=y_cls
)
X_cls_train, X_cls_test, y_cls_train, y_cls_test = cls_split

In [22]:
# --- 6. Model Definition and Training (Random Forest Classifier) ---
# 300 depth-limited trees with balanced class weights, seeded, using every available core.
forest_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and the estimator are chained so both are fitted (and later saved) together.
classifier_model = Pipeline(steps=[
    ('preprocessor', preprocessor_cls),
    ('classifier', forest_classifier),
])

print("\nStarting Decision Classifier (Model 2) training...")
classifier_model.fit(X_cls_train, y_cls_train)
print("Training complete.")


Starting Decision Classifier (Model 2) training...
Training complete.


In [23]:
# --- 7. Evaluation ---
# Predict the held-out rows, then report overall accuracy and the per-class breakdown.
y_cls_pred = classifier_model.predict(X_cls_test)
accuracy = accuracy_score(y_cls_test, y_cls_pred)
cls_report = classification_report(y_cls_test, y_cls_pred)

print("\nDecision Classifier Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(cls_report)


Decision Classifier Evaluation:
Accuracy: 0.8762

Classification Report:
              precision    recall  f1-score   support

      accept       0.86      0.88      0.87       153
      reject       0.89      0.88      0.88       170

    accuracy                           0.88       323
   macro avg       0.88      0.88      0.88       323
weighted avg       0.88      0.88      0.88       323



In [24]:
# --- 8. Save Model 2 ---
# Persist the whole fitted pipeline (preprocessor + forest) with joblib.
model_filename_cls = 'decision_classifier.pkl'
joblib.dump(value=classifier_model, filename=model_filename_cls)

print(f"\n✅ Decision Classifier (Model 2) saved as: {model_filename_cls}")


✅ Decision Classifier (Model 2) saved as: decision_classifier.pkl
