In [None]:
# Model 2: Lognormal Monte Carlo Simulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from google.colab import files
import io
import warnings

# Silence all warnings so the notebook output stays readable, and apply
# seaborn's "whitegrid" style to the charts drawn below.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

# Step 1: Upload Excel file
# `uploaded` is keyed by file name; the value for each key is the file
# content that is wrapped in BytesIO when the workbook is read.
print("Please upload your Excel file (e.g., WBS with cost column)...")
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload dialog: stop with a clear message
    # instead of an opaque IndexError on the lookup below.
    raise ValueError(
        "No file was uploaded; re-run this cell and select an Excel file."
    )
# Only the first uploaded file is used; any additional files are ignored.
file_name = list(uploaded.keys())[0]

# Step 2: Load and clean data
# Read the first sheet; header=1 takes the second spreadsheet row as the
# column names.
df = pd.read_excel(io.BytesIO(uploaded[file_name]), sheet_name=0, header=1)

# The required columns are selected by position (3rd, 4th and 6th), so the
# sheet must have at least six columns.
if df.shape[1] < 6:
    raise ValueError(
        f"Expected at least 6 columns in the uploaded sheet, found {df.shape[1]}."
    )

# Rename required columns
df.rename(columns={
    df.columns[2]: 'WBS Code',
    df.columns[3]: 'Task',
    df.columns[5]: 'Cost'
}, inplace=True)

# Convert the cost column once; entries that are not numbers become NaN.
cost_values = pd.to_numeric(df['Cost'], errors='coerce')
is_numeric = cost_values.notna()

# A lognormal distribution is only defined for a strictly positive mean.
# A zero cost would turn the derived parameters into NaN (and with them every
# simulated total), while a negative cost would silently be treated as
# positive, so such rows are excluded from the simulation.
is_usable = is_numeric & np.isfinite(cost_values) & (cost_values > 0)
skipped_rows = int((is_numeric & ~is_usable).sum())
if skipped_rows:
    print(f"Skipping {skipped_rows} row(s) whose cost is zero, negative or infinite.")

df_clean = df[is_usable].copy()
# Store costs as floats so the squared terms computed from them cannot
# overflow an integer dtype.
df_clean['Cost'] = cost_values[is_usable].astype(float)
# Renumber rows 0..n-1 so row labels match row positions after filtering.
df_clean.reset_index(drop=True, inplace=True)

if df_clean.empty:
    raise ValueError(
        "No rows with a positive numeric cost were found in the uploaded sheet."
    )

# Step 3: Derive lognormal parameters
# Each task's cost is taken as the mean of its distribution and a fixed
# fraction of it as the standard deviation. Mu and Sigma are the parameters
# of the underlying normal distribution obtained by moment matching:
#   Mu    = ln(mean^2 / sqrt(sd^2 + mean^2))
#   Sigma = sqrt(ln(1 + sd^2 / mean^2))
STD_DEV_FRACTION = 0.2

task_cost = df_clean['Cost']
df_clean['Mean'] = task_cost
df_clean['StdDev'] = task_cost * STD_DEV_FRACTION

mean_squared = df_clean['Mean']**2
std_squared = df_clean['StdDev']**2

df_clean['Mu'] = np.log(mean_squared / np.sqrt(std_squared + mean_squared))
df_clean['Sigma'] = np.sqrt(np.log(1 + (std_squared / mean_squared)))

# Step 4: Monte Carlo Simulation
# Build a (NUM_SIMULATIONS x number of tasks) matrix in which column k holds
# the simulated costs of the k-th task in task_list.
NUM_SIMULATIONS = 10000
task_list = df_clean['Task'].tolist()
task_sim_matrix = np.zeros((NUM_SIMULATIONS, len(task_list)))

# Columns are addressed by position (enumerate), not by the DataFrame index
# label: iterrows() yields labels, and those need not run 0..n-1 once rows
# have been filtered out, which would leave columns unfilled or raise an
# IndexError.
for col, (mu, sigma) in enumerate(zip(df_clean['Mu'], df_clean['Sigma'])):
    task_sim_matrix[:, col] = np.random.lognormal(
        mean=mu, sigma=sigma, size=NUM_SIMULATIONS
    )

# Total project cost of each simulation run = sum over all tasks.
total_costs = task_sim_matrix.sum(axis=1)

# Step 5: Summary statistics
# Mean, spread and the 10th/50th/90th percentiles of the simulated totals.
p10_cost, median_cost, p90_cost = np.percentile(total_costs, [10, 50, 90])
mean_cost = total_costs.mean()
std_dev = total_costs.std()

print("Lognormal Model Summary:")
print(f"Mean Cost: £{mean_cost:,.2f}")
print(f"P50 (Median): £{median_cost:,.2f}")
print(f"P90: £{p90_cost:,.2f}")
print(f"P10: £{p10_cost:,.2f}")
print(f"Standard Deviation: £{std_dev:,.2f}")

# Step 6: Distribution plot
# Histogram (with KDE overlay) of the simulated totals, with dashed vertical
# markers at the mean, P90 and P10.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(total_costs, bins=60, kde=True, color='salmon', ax=ax)

reference_lines = [
    (mean_cost, 'red', f'Mean: £{mean_cost:,.0f}'),
    (p90_cost, 'orange', f'P90: £{p90_cost:,.0f}'),
    (p10_cost, 'purple', f'P10: £{p10_cost:,.0f}'),
]
for x_position, line_color, legend_text in reference_lines:
    ax.axvline(x_position, color=line_color, linestyle='--', label=legend_text)

ax.set_title('Simulation of Total Project Cost')
ax.set_xlabel('Total Project Cost (£)')
ax.set_ylabel('Frequency')
ax.legend()
fig.tight_layout()
plt.show()

# Step 7: Sensitivity Analysis
# A task's contribution is how much the mean total cost drops when that
# task's simulated costs are removed from every run.
baseline_mean = total_costs.mean()
sensitivity_data = [
    (task_name, baseline_mean - (total_costs - task_sim_matrix[:, col]).mean())
    for col, task_name in enumerate(task_list)
]

sensitivity_df = pd.DataFrame(sensitivity_data, columns=['Task', 'Mean Contribution'])
sensitivity_df = sensitivity_df.sort_values(by='Mean Contribution', ascending=True)

# Horizontal bar chart, one bar per task, ordered by contribution.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=sensitivity_df, x='Mean Contribution', y='Task', palette='Reds_r', ax=ax)
ax.axvline(0, color='black', linewidth=0.8)
ax.set_title("Tornado Chart : Sensitivity of Tasks")
ax.set_xlabel("Mean Contribution (£)")
ax.set_ylabel("Task")
fig.tight_layout()
plt.show()

# Step 8: Correlation Analysis
# Pearson correlation between each task's simulated costs and the simulated
# project totals; only the coefficient is kept, the p-value is discarded.
correlation_data = [
    (task_name, pearsonr(task_sim_matrix[:, col], total_costs)[0])
    for col, task_name in enumerate(task_list)
]

correlation_df = pd.DataFrame(correlation_data, columns=['Task', 'Correlation Coefficient'])
correlation_df = correlation_df.sort_values(by='Correlation Coefficient', ascending=True)

# Horizontal bar chart, one bar per task, ordered by correlation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=correlation_df, x='Correlation Coefficient', y='Task', palette='coolwarm', ax=ax)
ax.axvline(0, color='black', linewidth=0.8)
ax.set_title("Correlation : Task Cost vs Total Cost")
ax.set_xlabel("Pearson Correlation")
ax.set_ylabel("Task")
fig.tight_layout()
plt.show()
