In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi
import os


# Output locations: every figure this notebook saves is written under FIG_DIR.
RESULT_DIR = '../results'
FIG_DIR = f'{RESULT_DIR}/figs'
# exist_ok=True makes re-running this cell safe when the directory already exists.
os.makedirs(FIG_DIR, exist_ok=True)


# Load the combined data set (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("../data/dataCombined.csv")
df.head()

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Constants for each section prefix
# Column-name prefixes: each constant is matched with str.startswith() to select
# one section's columns from df.
DEMOGRAPHICS = "D_"
USAGE = "U_"
ADOPTION = "A_"
FACTORS = "F_"
PROJECT_DELIVERY = "P_"
SUSTAINABILITY = "S_"
BARRIERS = "B_"


# Shared five-colour palette (hex strings) used by the plots further down.
# NOTE(review): "PALATTE" is a misspelling of "PALETTE"; the name is kept because
# other cells reference it.
PLOT_PALATTE = ["#003f5c", "#58508d", "#bc5090", "#ff6361", "#ffa600"]

In [None]:
# Frequency tables: one value-count listing per demographic and usage column.
demographic_cols = list(filter(lambda name: name.startswith(DEMOGRAPHICS), df.columns))
usage_cols = list(filter(lambda name: name.startswith(USAGE), df.columns))

for column_name in [*demographic_cols, *usage_cols]:
    print(f"\n--- Frequency Table: {column_name} ---")
    # sort_index() lists the response codes in ascending order rather than by frequency.
    frequency = df[column_name].value_counts().sort_index()
    print(frequency)

## Data summary

In [None]:
# Central-tendency summary (mean, median, mode) for every adoption item.
likert_cols = [name for name in df.columns if name.startswith(ADOPTION)]
likert_items = df[likert_cols]

means = likert_items.mean()
medians = likert_items.median()
# mode() can return several rows when values tie; only the first row is kept.
modes = likert_items.mode().iloc[0]

summary = pd.DataFrame({'Mean': means, 'Median': medians, 'Mode': modes})

summary.head(10)

In [None]:
import matplotlib.pyplot as plt

# Section display name -> column-name prefix.
categories = {
    "Demographics": DEMOGRAPHICS,
    "Usage": USAGE,
    "Adoption": ADOPTION,
    "Factors": FACTORS,
    "Project Delivery": PROJECT_DELIVERY,
    "Sustainability": SUSTAINABILITY,
    "Barriers": BARRIERS
}

# Number of DataFrame columns whose name starts with each section prefix.
category_counts = {
    name: sum(col.startswith(prefix) for col in df.columns)
    for name, prefix in categories.items()
}

# Columns that match no prefix are reported together as "Other".
other_cols = len(df.columns) - sum(category_counts.values())
if other_cols > 0:
    category_counts["Other"] = other_cols

# Leave out empty sections: a zero-size wedge only adds an overlapping "0.0%" label.
plotted_counts = {name: count for name, count in category_counts.items() if count > 0}

# plt.pie converts its data with np.asarray, which cannot consume dict views,
# so sizes and labels are materialised as lists first.
plt.figure(figsize=(10, 8))
plt.pie(
    list(plotted_counts.values()),
    labels=list(plotted_counts.keys()),
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Pastel1.colors
)

plt.title('Distribution of Columns by Category')
plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular
plt.show()

### Frequency plot function

In [None]:
def plot_categorical_distribution(
    df,
    column,
    x_labels_dict,
    x_label='Category',
    y_label='Frequency',
    title='Distribution',
    filename='output.png',
):
    """Draw, save and show a count plot of one categorical column.

    Parameters
    ----------
    df : DataFrame that contains ``column``.
    column : name of the column whose values are counted.
    x_labels_dict : mapping from each value found in the column to the tick
        label shown for it. Values missing from the mapping are labelled with
        their own string form instead of raising ``KeyError``.
    x_label, y_label, title : axis labels and figure title.
    filename : file name of the saved figure, written under ``FIG_DIR``.

    Returns
    -------
    None. The figure is saved at 300 dpi, shown, then closed.
    """
    # Only the values that actually occur (NaN excluded), in ascending order.
    categories_present = sorted(df[column].dropna().unique())

    # Fall back to the raw value so an unmapped code is still plotted.
    labels = [x_labels_dict.get(val, str(val)) for val in categories_present]
    palette = sns.color_palette("pastel", n_colors=len(labels))

    plt.figure(figsize=(8, 5))
    ax = sns.countplot(x=column, data=df, order=categories_present, palette=palette)

    # Replace the raw codes on the x axis with their human-readable labels.
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, fontsize=10)

    plt.xlabel(x_label, fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.title(title, fontsize=14, weight='bold')

    # Write each bar's count just above the bar.
    for p in ax.patches:
        height = p.get_height()
        if pd.isna(height):
            # A bar without a height cannot be converted with int(); skip it.
            continue
        ax.text(p.get_x() + p.get_width() / 2., height + 0.5,
                int(height), ha="center", fontsize=9)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    sns.despine()
    plt.tight_layout()

    plt.savefig(f"{FIG_DIR}/{filename}", dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

# Frequency plots for Demographic fields

In [None]:

# Code -> display-label mappings for the demographic columns. Each dictionary is
# followed by the (currently disabled) plot_categorical_distribution call that uses it.
# NOTE(review): several range labels leave a gap at the boundary value
# (e.g. "<20" then "21-30", "<5 years" then "6-10 years") — confirm against the questionnaire.

# Labels for D_Gender.
gender_labels_dict = {
    1: "Male",
    2: "Female"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_Gender",
#     x_labels_dict=gender_labels_dict,
#     x_label="Gender",
#     y_label="Number of Respondents",
#     title="Gender Distribution",
#     filename="gender_distribution.png"
# )
# Labels for D_Age.
age_labels_dict = {
    1: "<20",
    2: "21-30",
    3: "31-40",
    4: "41-50",
    5: "Over 50"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_Age",
#     x_labels_dict=age_labels_dict,
#     x_label="Age Group",
#     y_label="Number of Respondents",
#     title="Age Distribution",
#     filename="age_distribution.png"
# )
# Labels for D_Education.
education_labels_dict = {
    1: "OND",
    2: "B.Sc./HND",
    3: "PGD",
    4: "MSc.",
    5: "Ph.D."
}
# plot_categorical_distribution(
#     df=df,
#     column="D_Education",
#     x_labels_dict=education_labels_dict,
#     x_label="Educational Qualification",
#     y_label="Number of Respondents",
#     title="Educational Qualification Distribution",
#     filename="education_distribution.png"
# )
# Labels for D_Position.
position_labels_dict = {
    1: "Junior Staff",
    2: "Senior Staff"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_Position",
#     x_labels_dict=position_labels_dict,
#     x_label="Staff Position",
#     y_label="Number of Respondents",
#     title="Position Distribution",
#     filename="position_distribution.png"
# )
# Labels for D_Experience.
experience_labels_dict = {
    1: "<5 years",
    2: "6-10 years",
    3: "11-15 years",
    4: "16-20 years",
    5: "Over 20 years"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_Experience",
#     x_labels_dict=experience_labels_dict,
#     x_label="Years of Experience",
#     y_label="Number of Respondents",
#     title="Experience Distribution",
#     filename="experience_distribution.png"
# )
# Labels for D_CompanyAge.
company_age_labels_dict = {
    1: "1-5 years",
    2: "6-10 years",
    3: "11-15 years",
    4: "16-20 years",
    5: "Over 20 years"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_CompanyAge",
#     x_labels_dict=company_age_labels_dict,
#     x_label="Company Age",
#     y_label="Number of Respondents",
#     title="Company Age Distribution",
#     filename="company_age_distribution.png"
# )
# Labels for D_NumEmployees.
employee_count_labels_dict = {
    1: "1-5",
    2: "6-10",
    3: "11-15",
    4: "16-20",
    5: "Above 20"
}
# plot_categorical_distribution(
#     df=df,
#     column="D_NumEmployees",
#     x_labels_dict=employee_count_labels_dict,
#     x_label="Number of Employees",
#     y_label="Number of Respondents",
#     title="Employee Count Distribution",
#     filename="employee_count_distribution.png"
# )



# Response-distribution table for the usage items: one row per item, one column
# per response level (1-5), each cell formatted as "count (percent%)".
usage_cols = [col for col in df.columns if col.startswith(USAGE)]

likert_levels = range(1, 6)

usage_cells = {}
for col in usage_cols:
    # Reindex to the full 1-5 scale so a level nobody selected is reported as
    # "0 (0%)" instead of leaving a NaN cell in the table.
    counts = df[col].value_counts().reindex(likert_levels, fill_value=0)

    # Percentages are relative to all non-missing answers in the column.
    answered = df[col].count()
    if answered:
        percentages = (counts / answered * 100).round(0).astype(int)
    else:
        # Nothing answered: keep the zero counts rather than dividing by zero.
        percentages = counts

    usage_cells[col] = counts.astype(str) + ' (' + percentages.astype(str) + '%)'

# Rows = usage items, columns = response levels.
usage_distribution = pd.DataFrame(usage_cells, index=likert_levels).T

# Rename the level columns for clarity.
usage_distribution.columns = [
    'Very Low (1)',
    'Low (2)',
    'Moderate (3)',
    'High (4)',
    'Very High (5)'
]

# Number of non-missing answers per item.
usage_distribution['Total'] = df[usage_cols].count()

print(usage_distribution)

# for col in usage_cols:
#     counts = df[col].value_counts().reindex(range(1, 6), fill_value=0)
#     usage_distribution[col] = counts

# plt.figure(figsize=(12, 6))
# for col in usage_distribution.columns:
#     plt.fill_between(usage_distribution.index, usage_distribution[col], alpha=0.15)
#     plt.plot(usage_distribution.index, usage_distribution[col], label=col)

# plt.title("Digital Technology Usage Distribution", fontsize=14, weight='bold')
# plt.xlabel("Response Level (5 = Very High ... 1 = Very Low)", fontsize=12)
# plt.ylabel("Number of Respondents", fontsize=12)
# plt.xticks([1, 2, 3, 4, 5])
# plt.grid(True, linestyle='--', alpha=0.6)
# plt.legend(loc='upper right', fontsize=9)
# sns.despine()
# plt.tight_layout()

# plt.savefig(f"{FIG_DIR}/usage_area_chart.png", dpi=300, bbox_inches='tight')
# plt.show()
# plt.close()


# Data Reliability and Validity
### Cronbach’s Alpha for Reliability
To assess the internal consistency of each section.

In [None]:
#data reliability and validity
import pingouin as pg
# Column lists for the four scale sections, selected by column-name prefix.
adoption_cols, project_delivery_cols, sustainability_cols, barriers_cols = (
    [name for name in df.columns if name.startswith(prefix)]
    for prefix in (ADOPTION, PROJECT_DELIVERY, SUSTAINABILITY, BARRIERS)
)

# Cronbach's alpha and its confidence interval, computed once per section.
(a_alpha, a_ci), (p_alpha, p_ci), (s_alpha, s_ci), (b_alpha, b_ci) = (
    pg.cronbach_alpha(data=df[section_cols])
    for section_cols in (adoption_cols, project_delivery_cols, sustainability_cols, barriers_cols)
)

# Report every section in the same format.
for section_name, alpha, ci in (
    ("adoption", a_alpha, a_ci),
    ("project delivery", p_alpha, p_ci),
    ("sustainability", s_alpha, s_ci),
    ("barriers", b_alpha, b_ci),
):
    print(f"Cronbach's alpha for {section_name} = {alpha:.3f} (95% CI: {ci[0]:.3f} - {ci[1]:.3f})")


# Exploratory Factor Analysis (EFA)
Exploratory Factor Analysis (EFA) is a key step in identifying underlying latent variables that explain the observed correlations between measured variables. We will use this to interpret the loadings.

In [None]:
from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler


### Factor Analysis Function

In [None]:
def run_factor_analysis(df, cols, n_factors=5, rotation='varimax'):
    """Fit a factor analysis on the given columns and show a scree plot.

    Parameters
    ----------
    df : DataFrame holding the item columns.
    cols : list of column names to analyse.
    n_factors : number of factors to extract.
    rotation : rotation passed to ``FactorAnalyzer``.

    Returns
    -------
    (fa, loadings_df) : the fitted ``FactorAnalyzer`` and a DataFrame of
    loadings indexed by ``cols`` (one column per factor).
    """
    # Standardise the items before fitting.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[cols])

    fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation)
    fa.fit(scaled_data)

    # get_eigenvalues() returns a pair (original eigenvalues, common-factor
    # eigenvalues). The scree plot needs one value per factor, so only the
    # original eigenvalues are plotted.
    eigenvalues, _ = fa.get_eigenvalues()
    loadings = fa.loadings_

    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
    plt.title(f"Scree Plot for {', '.join(cols)}")
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid(True)
    plt.show()

    loadings_df = pd.DataFrame(loadings, index=cols)
    print(f'Factor Loadings for {", ".join(cols)}:\n', loadings_df)

    return fa, loadings_df

### Factor Analysis

In [None]:
# Only the barriers section is analysed here; the runs for the other sections are disabled.
# fa_adoption, loadings_adoption = run_factor_analysis(df, adoption_cols)
# fa_project_delivery, loadings_project_delivery = run_factor_analysis(df, project_delivery_cols)
# fa_sustainability, loadings_sustainability = run_factor_analysis(df, sustainability_cols)
fa_barriers, loadings_barriers = run_factor_analysis(df, barriers_cols)
# Prints the fitted analyzer object followed by the loadings table
# (run_factor_analysis has already printed the loadings once).
print(fa_barriers, loadings_barriers)

## Principal Component Analysis (PCA)
PCA reduces the dimensionality of the data while retaining as much variance as possible

In [None]:
from sklearn.decomposition import PCA

### PCA function

In [None]:
def run_pca(df, cols, n_components=5):
    """Run PCA on standardised columns and plot the explained-variance ratios.

    Parameters
    ----------
    df : DataFrame holding the item columns.
    cols : list of column names to project.
    n_components : number of components requested. An integer request is
        capped at min(n_rows, n_columns), so sections with fewer than five
        items no longer raise.

    Returns
    -------
    (pca, explained_variance, pca_result) : the fitted ``PCA`` object, its
    explained-variance ratios, and the projected data.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[cols])

    # PCA cannot extract more components than min(n_samples, n_features).
    # Non-integer settings are passed through untouched.
    if isinstance(n_components, int):
        n_components = min(n_components, *scaled_data.shape)

    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(scaled_data)
    print(pca_result)

    explained_variance = pca.explained_variance_ratio_
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    plt.title(f"Explained Variance by Principal Components for {', '.join(cols)}")
    plt.xlabel('Principal Components')
    plt.ylabel('Explained Variance Ratio')
    plt.grid(True)
    plt.show()

    return pca, explained_variance, pca_result


### PCA runs

In [None]:
# One PCA per section; each call returns (fitted PCA, explained-variance ratios, projected data).
# The fitted objects and projections are reused by the biplot cell below.
pca_adoption, explained_variance_adoption, pca_result_adoption = run_pca(df, adoption_cols)
pca_project_delivery, explained_variance_project_delivery, pca_result_project_delivery = run_pca(df, project_delivery_cols)
pca_sustainability, explained_variance_sustainability, pca_result_sustainability = run_pca(df, sustainability_cols)
pca_barriers, explained_variance_barriers, pca_result_barriers = run_pca(df, barriers_cols)

### Factor Loadings Visualization

In [None]:
def plot_factor_loadings(loadings_df, title='Factor Loadings'):
    """Show an annotated heatmap of a factor-loading table.

    Parameters
    ----------
    loadings_df : DataFrame of loadings (rows = items, columns = factors).
    title : figure title.
    """
    # Half an inch of height per item, capped at 12 inches.
    item_count = len(loadings_df)
    fig_height = min(0.5 * item_count, 12)

    plt.figure(figsize=(10, fig_height))
    sns.heatmap(
        loadings_df,
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        center=0,  # centre the diverging colormap on zero
    )
    plt.title(title, fontsize=14, weight='bold')
    plt.xlabel("Factors")
    plt.ylabel("Items")
    plt.tight_layout()
    plt.show()

# Only the barriers loadings are available (the other factor analyses are disabled above).
# plot_factor_loadings(loadings_adoption, title="Adoption - Factor Loadings")
# plot_factor_loadings(loadings_project_delivery, title="Project delivery - Factor Loadings")
# plot_factor_loadings(loadings_sustainability, title="Sustainability - Factor Loadings")
plot_factor_loadings(loadings_barriers, title="Barriers - Factor Loadings")

### PCA Biplot for 2D Projection

In [None]:
def pca_biplot(pca, components, features, labels=None, title='PCA Biplot'):
    """Plot the first two principal components with one loading arrow per variable.

    Parameters
    ----------
    pca : fitted PCA object; the first two rows of ``pca.components_`` are used.
    components : 2-D array of projected data; columns 0 and 1 are plotted.
    features : names of the original variables. Used as arrow labels when
        ``labels`` is not given.
    labels : optional arrow labels. May be a list, array or pandas Index.
    title : figure title.

    Arrows with no available name are labelled ``Var<i>``.
    """
    # Pick arrow names: explicit labels first, then feature names. Testing
    # `is not None` and len() avoids the ambiguous truth value of arrays and
    # pandas Indexes that a bare `if labels` would hit.
    arrow_names = []
    for candidate in (labels, features):
        if candidate is not None and len(candidate) > 0:
            arrow_names = list(candidate)
            break

    plt.figure(figsize=(10, 7))

    plt.scatter(components[:, 0], components[:, 1], alpha=0.5, label='Respondents')

    # One (PC1, PC2) loading pair per original variable.
    feature_vectors = pca.components_[:2].T
    for i, v in enumerate(feature_vectors):
        # Loadings are stretched by a fixed factor of 3 (presumably so the arrows
        # stand out against the scatter); text sits slightly beyond the tip.
        plt.arrow(0, 0, v[0]*3, v[1]*3,
                  color='r', alpha=0.5, head_width=0.05)
        name = arrow_names[i] if i < len(arrow_names) else f"Var{i+1}"
        plt.text(v[0]*3.2, v[1]*3.2, name,
                 color='black', fontsize=9)

    plt.axhline(0, linestyle='--', color='gray', linewidth=0.5)
    plt.axvline(0, linestyle='--', color='gray', linewidth=0.5)
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.title(title, fontsize=14, weight='bold')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()
    
# One biplot per section: (fitted PCA, projected data, column names, title suffix).
biplot_sections = [
    (pca_adoption, pca_result_adoption, adoption_cols, "Adoption"),
    (pca_project_delivery, pca_result_project_delivery, project_delivery_cols, "Project delivery"),
    (pca_sustainability, pca_result_sustainability, sustainability_cols, "Sustainability"),
    (pca_barriers, pca_result_barriers, barriers_cols, "Barriers"),
]

for fitted_pca, projected, section_cols, section_name in biplot_sections:
    pca_biplot(
        pca=fitted_pca,
        components=projected,
        features=section_cols,
        labels=section_cols,
        title=f"PCA Biplot - {section_name}"
    )


# 6 Inferential Statistics

### 6.1 Correlation Analysis — Adoption, Sustainability, Delivery

In [None]:
# Composite score per respondent: the row-wise mean of each section's items.
composite_sources = {
    "Adoption_Avg": adoption_cols,
    "Sustainability_Avg": sustainability_cols,
    "Delivery_Avg": project_delivery_cols,
}
for score_name, item_cols in composite_sources.items():
    df[score_name] = df[item_cols].mean(axis=1)


# Pairwise correlations between the three composite scores.
corr_matrix = df[list(composite_sources)].corr()
print(corr_matrix)


# Fix the colour scale to the full correlation range [-1, 1].
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix: Adoption, Sustainability, Delivery")
plt.show()

### 6.2 Group Comparisons - T-Tests or ANOVA
Test whether adoption levels differ significantly between:

- Staff positions (`D_Position`)
- Years of experience (`D_Experience`)

In [None]:
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Adoption scores of the two position groups (1 = junior, 2 = senior, per
# position_labels_dict). Missing scores are dropped because ttest_ind propagates
# NaN by default and would return nan for both statistic and p-value.
junior = df.loc[df["D_Position"] == 1, "Adoption_Avg"].dropna()
senior = df.loc[df["D_Position"] == 2, "Adoption_Avg"].dropna()

# Welch's t-test: equal_var=False does not assume equal group variances.
t_stat, p_val = ttest_ind(junior, senior, equal_var=False)
print(f"T-test (Position): t={t_stat:.3f}, p={p_val:.3f}")


# One-way ANOVA of adoption across experience bands; C(...) treats the codes as categories.
anova = ols('Adoption_Avg ~ C(D_Experience)', data=df).fit()
anova_table = sm.stats.anova_lm(anova, typ=2)
print(anova_table)

### 6.3 Multiple Regression — Predicting Delivery from Adoption
Use adoption score to predict project delivery performance.

In [None]:
# Independent variable: the adoption composite plus an intercept column.
X = df[["Adoption_Avg"]]
X = sm.add_constant(X)

# Dependent variable
y = df["Delivery_Avg"]

# Fit the simple regression. sm.OLS defaults to missing='none', which does not
# handle NaN; missing='drop' excludes incomplete rows instead.
model = sm.OLS(y, X, missing='drop').fit()
print(model.summary())


# Same model with demographic controls.
# NOTE(review): D_Position and D_Experience enter as raw numeric codes, so
# D_Experience is modelled as a linear effect of the band number — confirm intended.
X = df[["Adoption_Avg", "D_Position", "D_Experience"]]
X = sm.add_constant(X)
model_with_controls = sm.OLS(y, X, missing='drop').fit()
print(model_with_controls.summary())

# 7 Machine Learning Models

In [None]:
import sys
sys.path.append(os.path.abspath(os.path.join("..")))

from models.classification import classify_adoption_delivery
from models.clustering import cluster_respodents
from models.decision_tree import decision_tree_adoption
from models.evaluate import evaluate_model

def run_model_analysis(df):
    """Run the four ML steps built on the project's ``models`` helpers.

    Steps, in order: classification, clustering, decision tree, and evaluation
    of the classifier's predictions.

    Returns
    -------
    (df, results) : the DataFrame returned by the clustering helper, and a dict
    with the keys 'classification', 'clustering', 'decision_tree' and 'evaluation'.
    """
    # --- 1. Classification: predict high vs. low delivery ---
    print("Running classification (Random Forest)...")
    classification_report_data, clf = classify_adoption_delivery(df)

    # --- 2. Clustering: the helper returns a DataFrame carrying a "Cluster" column ---
    print("Running clustering (KMeans)...")
    df, kmeans = cluster_respodents(df)
    cluster_counts = df["Cluster"].value_counts().to_dict()

    # --- 3. Decision tree ---
    print("Training decision tree model...")
    tree_model, feature_names = decision_tree_adoption(df)

    # --- 4. Evaluation metrics for the classifier ---
    print("Calculating evaluation metrics...")
    # NOTE(review): predictions are made on every row of df, not on a held-out split,
    # and this feature list must match what classify_adoption_delivery trained on —
    # neither can be confirmed from this notebook.
    eval_features = df[["Adoption_Avg", "D_Position", "D_Experience"]]
    delivery = df["Delivery_Avg"]
    y_true = (delivery > delivery.median()).astype(int)
    y_pred = clf.predict(eval_features)
    eval_scores = evaluate_model(y_true, y_pred)

    results = {
        'classification': {
            "model": clf,
            "report": classification_report_data,
        },
        'clustering': {
            "model": kmeans,
            "cluster_distribution": cluster_counts,
        },
        'decision_tree': {
            "model": tree_model,
            "features": list(feature_names),
        },
        'evaluation': eval_scores,
    }

    print("Phase 7 analysis complete.\n")
    return df, results


# Runs the whole workflow and prints the returned (DataFrame, results) tuple.
print(run_model_analysis(df))

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, confusion_matrix, silhouette_score)
import numpy as np

# --- 7.1 Classification: Predict high vs. low delivery performance ---
def classify_performance(df):
    """Train a random forest that predicts high vs. low delivery performance.

    Adds a ``Delivery_High`` column to ``df`` (1 when ``Delivery_Avg`` is above
    its median, else 0), fits the model on an 80/20 split, prints hold-out
    scores and saves a feature-importance chart under ``FIG_DIR``.

    Returns
    -------
    (model, accuracy) : the fitted classifier and its hold-out accuracy.
    """
    features = ['Adoption_Avg', 'D_Position', 'D_Experience', 'D_Education']

    # Binary target, written back onto the caller's DataFrame.
    delivery = df['Delivery_Avg']
    df['Delivery_High'] = (delivery > delivery.median()).astype(int)

    # Hold out 20% of the rows; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df['Delivery_High'], test_size=0.2, random_state=42
    )

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Score the hold-out rows.
    y_pred = model.predict(X_test)
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print("\nClassification Results:")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

    # Horizontal bar chart of feature importances, smallest at the bottom.
    ranked_importances = pd.Series(model.feature_importances_, index=features).sort_values()
    plt.figure(figsize=(10, 6))
    ranked_importances.plot.barh(color=PLOT_PALATTE[0])
    plt.title("Feature Importance for Delivery Prediction", fontsize=14)
    plt.xlabel("Importance Score")
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/feature_importance_delivery.png", dpi=300)
    plt.show()

    return model, accuracy

# --- 7.2 Clustering: Segment firms/respondents ---
def cluster_respondents(df):
    """Cluster respondents on their adoption, delivery and sustainability scores.

    Tries k = 2..5 with KMeans, keeps the k with the highest silhouette score,
    writes the resulting labels to ``df['Cluster']``, saves the silhouette and
    scatter plots under ``FIG_DIR`` and prints the per-cluster feature means.

    Returns
    -------
    (df, kmeans) : the same DataFrame with the added ``Cluster`` column, and
    the fitted KMeans model for the chosen k.
    """
    # Prepare clustering features
    cluster_features = ['Adoption_Avg', 'Delivery_Avg', 'Sustainability_Avg']
    X = df[cluster_features]

    # Silhouette score for each candidate number of clusters.
    silhouette_scores = []
    cluster_range = range(2, 6)

    for n in cluster_range:
        kmeans = KMeans(n_clusters=n, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X)
        silhouette_scores.append(silhouette_score(X, labels))

    # Plot silhouette scores
    plt.figure(figsize=(8, 5))
    plt.plot(cluster_range, silhouette_scores, 'bo-')
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Score")
    plt.title("Optimal Cluster Selection", fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/silhouette_scores.png", dpi=300)
    plt.show()

    # Refit with the best-scoring k and store the labels on the DataFrame.
    optimal_clusters = cluster_range[np.argmax(silhouette_scores)]
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(X)

    # Visualize clusters. The palette is trimmed to exactly one colour per
    # cluster: a list palette whose length differs from the number of hue
    # levels triggers a warning (or an error on older seaborn releases).
    plt.figure(figsize=(10, 7))
    sns.scatterplot(
        x='Adoption_Avg',
        y='Delivery_Avg',
        hue='Cluster',
        data=df,
        palette=PLOT_PALATTE[:optimal_clusters],
        s=100,
        alpha=0.8
    )
    plt.title(f"Respondent Clusters (k={optimal_clusters})", fontsize=14)
    plt.xlabel("Adoption Level")
    plt.ylabel("Delivery Performance")
    plt.grid(True, alpha=0.2)
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/respondent_clusters.png", dpi=300)
    plt.show()

    # Cluster profiles: mean of each clustering feature per cluster.
    cluster_profiles = df.groupby('Cluster')[cluster_features].mean()
    print("\nCluster Profiles:")
    print(cluster_profiles)

    return df, kmeans

# --- 7.3 Decision Tree: Identify adoption success rules ---
def adoption_success_rules(df):
    """Fit a shallow decision tree that separates high from low adopters.

    Adds an ``Adoption_High`` column to ``df`` (1 when ``Adoption_Avg`` is above
    its median, else 0). Predictors are the barrier items plus three demographic
    columns. The tree plot is saved under ``FIG_DIR`` and the rules are printed
    as text.

    Relies on the module-level ``barriers_cols`` list defined in an earlier cell.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    # Binary target, written back onto the caller's DataFrame.
    adoption = df['Adoption_Avg']
    df['Adoption_High'] = (adoption > adoption.median()).astype(int)

    # Predictors: every barrier item followed by the demographic controls.
    features = list(barriers_cols) + ['D_Position', 'D_Experience', 'D_CompanyAge']
    X, y = df[features], df['Adoption_High']

    # A depth of 3 and at least 10 samples per leaf keep the rule set readable.
    tree = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=42)
    tree.fit(X, y)

    plt.figure(figsize=(20, 10))
    plot_tree(
        tree,
        feature_names=features,
        class_names=['Low Adoption', 'High Adoption'],
        filled=True,
        proportion=True,
        rounded=True,
        fontsize=10,
    )
    plt.title("Decision Tree for Adoption Success", fontsize=14)
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/adoption_decision_tree.png", dpi=300)
    plt.show()

    # Plain-text version of the same rules.
    print("\nDecision Tree Rules:")
    print(export_text(tree, feature_names=features))

    return tree

# --- 7.4 Evaluate Models ---
def evaluate_classification(model, X_test, y_test):
    """Score a fitted classifier on test data and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test, y_test : features and true labels to evaluate on.

    Returns
    -------
    dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    The confusion-matrix heatmap is saved under ``FIG_DIR``.
    """
    predictions = model.predict(X_test)

    # Confusion matrix as an annotated heatmap (integer counts, no colour bar).
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(f"{FIG_DIR}/confusion_matrix.png", dpi=300)
    plt.show()

    # Compute the four scores in the order they are reported.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {key: scorer(y_test, predictions) for key, scorer in scorers}

    print("\nModel Evaluation:")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1 Score: {metrics['f1']:.3f}")

    return metrics

# --- Execute all ML workflows ---
def run_ml_pipeline(df):
    """Run classification, clustering, rule extraction and evaluation in order.

    Returns
    -------
    (df, results) : the DataFrame returned by ``cluster_respondents`` and a dict
    with the keys 'classification_model', 'clustering_model', 'decision_tree'
    and 'metrics'.
    """
    print("===== MACHINE LEARNING ANALYSIS =====")

    # 7.1 Classification (the accuracy it returns is not used here)
    print("\nRunning performance classification...")
    clf_model, _ = classify_performance(df)

    # 7.2 Clustering
    print("\nClustering respondents...")
    df, cluster_model = cluster_respondents(df)

    # 7.3 Decision tree
    print("\nExtracting adoption rules...")
    tree_model = adoption_success_rules(df)

    # 7.4 Evaluation of the classification model
    print("\nEvaluating classification model...")
    feature_names = ['Adoption_Avg', 'D_Position', 'D_Experience', 'D_Education']
    delivery = df['Delivery_Avg']
    is_high_delivery = (delivery > delivery.median()).astype(int)
    # Mirrors the split arguments used in classify_performance (same features,
    # test_size and random_state) so the evaluation runs on a hold-out portion.
    split = train_test_split(
        df[feature_names], is_high_delivery, test_size=0.2, random_state=42
    )
    X_test, y_test = split[1], split[3]

    metrics = evaluate_classification(clf_model, X_test, y_test)

    print("\nMachine Learning Pipeline Complete!")
    results = {
        'classification_model': clf_model,
        'clustering_model': cluster_model,
        'decision_tree': tree_model,
        'metrics': metrics,
    }
    return df, results

# Execute the full pipeline. df is rebound to the DataFrame the pipeline returns,
# which carries the columns added along the way (Delivery_High, Cluster, Adoption_High).
df, ml_results = run_ml_pipeline(df)

# Prints the dict of fitted models and evaluation metrics.
print(ml_results)