# 1. Imports & Setup

In [8]:
import os, ast, re, json, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Every CSV and PNG written by the cells below is saved inside this folder
# (relative to the notebook's working directory); it is created on first run.
OUTDIR = Path("analysis_outputs")
OUTDIR.mkdir(exist_ok=True)
print("Output dir:", OUTDIR.resolve())

Output dir: C:\Users\aayus\Downloads\DES646 Course Project\analysis_outputs


# 2. Load & Basic Clean

In [9]:
dataset_path = "materials_final_with_price.csv"

print("Loading:", dataset_path)
# Everything is read as text first; numeric columns are parsed explicitly later on.
df = pd.read_csv(dataset_path, dtype=str)
print("Shape:", df.shape)

# Accept a couple of alternative spellings for the name column, then insist it exists.
if "Material Name" not in df.columns:
    alias = next((cand for cand in ("material", "Material") if cand in df.columns), None)
    if alias is not None:
        df = df.rename(columns={alias: "Material Name"})
if "Material Name" not in df.columns:
    raise ValueError("No 'Material Name' column detected.")

def parse_categories(x):
    if isinstance(x, list): return x
    s = "" if pd.isna(x) else str(x)
    if s.strip().startswith("[") and s.strip().endswith("]"):
        try:
            lst = ast.literal_eval(s)
            return [str(t).strip() for t in lst]
        except Exception:
            pass
    return [p.strip() for p in s.split(";") if p.strip()]

# Parse the raw "Categories" text into one list of labels per row.
# When the column is absent every row gets its OWN empty list: `[[]] * len(df)`
# would repeat a single shared list object, so mutating one row's list would
# silently change all of them.
if "Categories" in df.columns:
    df["__categories_list"] = df["Categories"].apply(parse_categories)
else:
    df["__categories_list"] = [[] for _ in range(len(df))]
print("Example categories:", df["__categories_list"].head().tolist())


Loading: materials_final_with_price.csv
Shape: (2456, 34)
Example categories: [['Ceramic', 'Glass'], ['Ceramic', 'Oxide'], ['Metal', 'Nonferrous Metal', 'Pure Element'], ['Ceramic', 'Oxide', 'Aluminum Oxide'], ['Ceramic', 'Oxide']]


# 3. Coerce mixed strings to numeric

In [10]:
# Columns that hold free text (or the parsed category lists) and must not be
# pushed through the numeric coercion below.
text_like = {
    "Material Name",
    "Categories",
    "Material Notes",
    "Thermal Properties",
    "Optical Properties",
    "__categories_list",
}
candidate_num_cols = [name for name in df.columns if name not in text_like]

# First number-like token in a string: optional sign, digits with an optional
# decimal point, optional exponent.
num_extract = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")

def coerce_numeric(val):
    """Return the first number found in ``val`` as a float, or NaN.

    ``None`` and float NaN map to NaN. Any other value is converted with
    ``str()`` and searched with ``num_extract``; only the first match is used,
    so surrounding text such as units is ignored and a string like ``"1,200"``
    yields ``1.0``. NaN is returned when nothing matches.
    """
    is_missing = val is None or (isinstance(val, float) and math.isnan(val))
    if is_missing:
        return np.nan

    match = num_extract.search(str(val))
    if match is None:
        return np.nan

    try:
        number = float(match.group(0))
    except Exception:
        number = np.nan
    return number

# Replace each candidate column's text values with the number parsed from them
# (NaN where no number is found).
for column in candidate_num_cols:
    df[column] = df[column].apply(coerce_numeric)

print("Coerced numerics on candidate columns:", len(candidate_num_cols))

# Snapshot of the table right after coercion.
stage1_path = OUTDIR/"_stage1_coerced.csv"
df.to_csv(stage1_path, index=False)

Coerced numerics on candidate columns: 30


# 4. Category mapping

In [11]:
from collections import defaultdict

# Numeric feature columns: everything numeric except the identifying/text columns.
non_feature_cols = {"Material Name","Categories","Material Notes","__categories_list"}
num_cols = [
    name for name in df.columns
    if name not in non_feature_cols and pd.api.types.is_numeric_dtype(df[name])
]

# cat_means[col][category] -> mean of the observed (non-missing) values of `col`
# over every row that lists `category`. A row contributes to each of its categories.
cat_means = {col: defaultdict(list) for col in num_cols}
for col in num_cols:
    bucket = cat_means[col]
    for cats, value in zip(df["__categories_list"], df[col]):
        if pd.isna(value) or not isinstance(cats, list):
            continue
        for cat in cats:
            bucket[cat].append(value)
    # Collapse each collected list of values into its mean.
    for cat in list(bucket):
        vals = bucket[cat]
        bucket[cat] = float(np.nanmean(vals)) if len(vals) else np.nan

# Column-wide mean, used as the fallback when no category mean is available.
global_means = {col: float(np.nanmean(df[col])) for col in num_cols}

# 5. Category-weighted Imputation

In [12]:
def impute_row(row):
    """Fill the missing numeric values of one row from its categories' means.

    For every column in ``num_cols`` an existing value is kept as is. A missing
    value is replaced by the weighted average of the ``cat_means`` entries of the
    row's first three categories (weights 0.6, 0.3, 0.1, renormalised over the
    categories that actually have a mean for that column). If none of them has a
    mean, the column's entry in ``global_means`` is used.

    Returns a ``pd.Series`` indexed by ``num_cols``.
    """
    raw_cats = row["__categories_list"]
    cats = raw_cats if isinstance(raw_cats, list) else []
    weights = [0.6, 0.3, 0.1]

    filled = {}
    for col in num_cols:
        current = row[col]
        if pd.isna(current):
            weighted_total, weight_total = 0.0, 0.0
            # zip stops after the three weights, so only the first three categories count.
            for weight, cat in zip(weights, cats):
                mean = cat_means[col].get(cat, np.nan)
                if mean is not None and not np.isnan(mean):
                    weighted_total += weight*mean
                    weight_total += weight
            if weight_total > 0:
                filled[col] = weighted_total/weight_total
            else:
                filled[col] = global_means[col]
        else:
            filled[col] = current
    return pd.Series(filled)

# Row-wise imputation; the result has one column per entry of num_cols.
imputed_vals = df.apply(impute_row, axis=1)

# Copy the filled values back over the original numeric columns.
for name in num_cols:
    df[name] = imputed_vals[name]

# Snapshot of the table right after imputation.
stage2_path = OUTDIR/"_stage2_imputed.csv"
df.to_csv(stage2_path, index=False)
print("Imputation complete.")

Imputation complete.


# 6. Scaling and PCA

In [21]:
# Median-impute whatever is still missing after the category-weighted imputation,
# then scale with RobustScaler.
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(df[num_cols])

scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_imp)

# These columns are kept out of the PCA input (they remain in X_scaled).
exclude_for_dr = {'Cost_USD_per_kg', 'CO2_kg_per_kg', 'Cost_per_CO2', 'Eco_Index'}
dr_cols = [c for c in num_cols if c not in exclude_for_dr]

X_dr = pd.DataFrame(X_scaled, columns=num_cols)[dr_cols].values

pca = PCA(n_components=2, random_state=42)
pca_xy = pca.fit_transform(X_dr)

# Store the projection on df. The plots below and the final table read
# df["PC1"] / df["PC2"]; without this assignment a fresh-kernel run fails
# with a KeyError on the scatter call.
df["PC1"] = pca_xy[:, 0]
df["PC2"] = pca_xy[:, 1]

plt.figure()
plt.scatter(df["PC1"], df["PC2"], s=8)
plt.xlabel("PC1"); plt.ylabel("PC2"); plt.title("PCA (2D)")
plt.tight_layout(); plt.savefig(OUTDIR/"pca_2d.png", dpi=200); plt.close()

# 7. KMeans (auto-k by silhouette)

In [14]:
# Try k = 2..8 and keep the k with the highest silhouette score on X_scaled.
best_k, best_s = None, -1
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X_scaled)
    try:
        score = silhouette_score(X_scaled, labels)
    except ValueError:
        # silhouette_score rejects labelings with an invalid number of distinct
        # labels (e.g. everything collapsed into one cluster); treat that k as
        # unusable. Any other error is a real problem and should surface.
        score = -1
    if score > best_s:
        best_s, best_k = score, k

if best_k is None:
    # Without this guard KMeans(n_clusters=None) fails with a much less helpful error.
    raise RuntimeError("No k in 2..8 produced a valid silhouette score; cannot choose a cluster count.")

km = KMeans(n_clusters=best_k, random_state=42, n_init="auto")
df["Cluster"] = km.fit_predict(X_scaled)

# Plot straight from pca_xy (the array produced by the PCA cell) so this cell
# does not depend on df["PC1"] / df["PC2"] having been created elsewhere.
plt.figure()
plt.scatter(pca_xy[:, 0], pca_xy[:, 1], s=8, c=df["Cluster"])
plt.xlabel("PC1"); plt.ylabel("PC2"); plt.title(f"PCA by KMeans (k={best_k})")
plt.tight_layout(); plt.savefig(OUTDIR/"pca_kmeans.png", dpi=200); plt.close()
# NOTE(review): a silhouette very close to 1.0 can mean a handful of extreme
# outliers form their own clusters; check the cluster sizes before interpreting.
print("Best k:", best_k, "silhouette:", best_s)

Best k: 3 silhouette: 0.9933995421162397


# 8. Correlation Matrix

In [15]:
# Pairwise correlation between all numeric features, computed on the scaled matrix.
corr = pd.DataFrame(X_scaled, columns=num_cols).corr()

fig, ax = plt.subplots(figsize=(8,6))
heat = ax.imshow(corr.values, aspect="auto")
fig.colorbar(heat, ax=ax)

# One tick per feature, labelled with the column name on both axes.
tick_positions = list(range(len(num_cols)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
ax.set_title("Correlation (scaled)")

fig.tight_layout()
fig.savefig(OUTDIR/"correlation_matrix.png", dpi=200)
plt.close(fig)

# 9. Ashby Plot

In [16]:
def ashby(xcol, ycol, xlog=True, ylog=True, fname="ashby.png", title=None):
    """Scatter ``df[ycol]`` against ``df[xcol]`` and save the figure to ``OUTDIR/fname``.

    Parameters
    ----------
    xcol, ycol : str
        Column names in ``df``; values are converted with ``pd.to_numeric``
        (unparseable entries become NaN and are dropped).
    xlog, ylog : bool
        Use a logarithmic scale on the respective axis. Non-positive values
        cannot be placed on a log axis, so they are excluded from the plot.
    fname : str
        File name of the saved PNG inside ``OUTDIR``.
    title : str or None
        Optional figure title.

    Returns ``None``. If a column is absent, or no plottable point remains, a
    message is printed and nothing is saved.
    """
    # Name only the columns that are really absent, so the message is not misleading.
    missing = [c for c in (xcol, ycol) if c not in df.columns]
    if missing:
        print("Missing:", *missing); return
    x = pd.to_numeric(df[xcol], errors="coerce")
    y = pd.to_numeric(df[ycol], errors="coerce")
    m = x.notna() & y.notna()
    # Filter non-positive values here so the "no valid points" check below is accurate.
    if xlog: m = m & (x > 0)
    if ylog: m = m & (y > 0)
    if not m.any():
        print("No valid points for", xcol, ycol); return
    plt.figure()
    plt.scatter(x[m], y[m], s=8)
    if xlog: plt.xscale("log")
    if ylog: plt.yscale("log")
    plt.xlabel(xcol); plt.ylabel(ycol)
    if title: plt.title(title)
    plt.tight_layout(); plt.savefig(OUTDIR/fname, dpi=200); plt.close()

# Two charts against density, both axes logarithmic: (y column, file name, title).
for prop, out_name, heading in (
    ("UTS", "ashby_uts_density.png", "Ashby: UTS vs Density"),
    ("Elastic Modulus", "ashby_E_density.png", "Ashby: Elastic Modulus vs Density"),
):
    ashby("Density", prop, True, True, out_name, heading)

Missing: Density UTS
Missing: Density Elastic Modulus


# 10. TOPSIS Ranking

In [17]:
from sklearn.impute import SimpleImputer
import numpy as np

# Criteria the ranking would like to use; only those present in df take part.
topsis_candidates = [
    "UTS", "Elastic Modulus", "Strength_to_Weight", "Specific_Stiffness",
    "Thermal Conductivity", "Density", "Cost_USD_per_kg", "CO2_kg_per_kg"
]
features = [name for name in topsis_candidates if name in df.columns]

print("TOPSIS features (present):", features)
if not features:
    print("No valid TOPSIS features found; skipped.")
else:
    # Decision matrix: rows = materials, columns = criteria; gaps filled with the median.
    X = df[features].apply(pd.to_numeric, errors="coerce").values
    imp = SimpleImputer(strategy="median")
    X = imp.fit_transform(X)

    # Vector normalisation: divide each criterion by its Euclidean norm
    # (a zero norm is replaced by 1 to avoid dividing by zero).
    col_norm = np.linalg.norm(X, axis=0)
    col_norm[col_norm == 0] = 1.0
    R = X / col_norm

    # Direction per criterion: -1 = lower is better, +1 = higher is better.
    lower_is_better = {"Density", "Cost_USD_per_kg", "CO2_kg_per_kg"}
    dirs = np.array([-1 if f in lower_is_better else 1 for f in features], dtype=float)

    # Raw importance per criterion; anything not listed falls back to 0.6.
    raw_weight = {
        "UTS": 1.0,
        "Elastic Modulus": 1.0,
        "Strength_to_Weight": 1.0,
        "Specific_Stiffness": 1.0,
        "Thermal Conductivity": 0.9,
        "Cost_USD_per_kg": 0.8,
        "CO2_kg_per_kg": 0.8,
    }
    weights = [raw_weight.get(f, 0.6) for f in features]
    w = np.array(weights, dtype=float)
    w = w / w.sum()

    V = R * w

    # Ideal / anti-ideal points: best value per criterion given its direction, and the opposite.
    col_max, col_min = V.max(axis=0), V.min(axis=0)
    ideal_best  = np.where(dirs ==  1, col_max, col_min)
    ideal_worst = np.where(dirs == -1, col_max, col_min)

    # Closeness coefficient in [0, 1]; the small epsilon guards against 0/0.
    d_pos = np.linalg.norm(V - ideal_best,  axis=1)
    d_neg = np.linalg.norm(V - ideal_worst, axis=1)
    score = d_neg / (d_pos + d_neg + 1e-12)

    df_rank = pd.DataFrame({"Material Name": df.get("Material Name", pd.Series(range(len(score))))})
    for i, f in enumerate(features):
        df_rank[f] = X[:, i]
    df_rank["TOPSIS_score"] = score
    df_rank = df_rank.sort_values("TOPSIS_score", ascending=False)

    out_path = OUTDIR / "materials_ranking_with_cost_env.csv"
    df_rank.to_csv(out_path, index=False)
    print("Saved:", out_path)

    fig, ax = plt.subplots()
    ax.hist(score, bins=40)
    ax.set_xlabel("TOPSIS score")
    ax.set_ylabel("Count")
    ax.set_title("TOPSIS score distribution (with Cost & CO2)")
    fig.tight_layout()
    fig.savefig(OUTDIR/"topsis_distribution_cost_env.png", dpi=200)
    plt.close(fig)

TOPSIS features (present): ['Thermal Conductivity', 'Density', 'CO2_kg_per_kg']
Saved: analysis_outputs\materials_ranking_with_cost_env.csv


# 11. Category-level summaries

In [18]:
def category_summary(df_in, cols):
    """Average the given columns per category label.

    Each row of ``df_in`` contributes once to every label in its
    ``__categories_list`` entry; rows whose entry is not a list contribute
    nothing. A column missing from a row counts as NaN.

    Returns a DataFrame with a ``Category`` column followed by the mean of each
    column in ``cols``; an empty frame with those columns if no row has a label.
    """
    records = []
    for _, record in df_in.iterrows():
        labels = record["__categories_list"]
        if not isinstance(labels, list):
            continue
        values = {c: record.get(c, np.nan) for c in cols}
        records.extend({"Category": label, **values} for label in labels)

    if len(records) == 0:
        return pd.DataFrame(columns=["Category"]+cols)

    return pd.DataFrame(records).groupby("Category")[cols].mean().reset_index()

# Summarise only the headline properties that actually exist in df.
wanted_summary_cols = ["UTS","Elastic Modulus","Density","Thermal Conductivity","Cost_USD_per_kg","Eco_Index"]
summary_cols = [c for c in wanted_summary_cols if c in df.columns]

cat_mean = category_summary(df, summary_cols)
category_means_path = OUTDIR/"category_means.csv"
cat_mean.to_csv(category_means_path, index=False)
print("Saved:", category_means_path)

# Full per-material table (original columns plus everything added above).
final_table_path = OUTDIR/"_final_analysis_table.csv"
df.to_csv(final_table_path, index=False)
print("Saved:", final_table_path)

Saved: analysis_outputs\category_means.csv
Saved: analysis_outputs\_final_analysis_table.csv


# 12. Final Table

In [None]:
# Write the complete analysis table, then preview a few headline columns
# (only those that exist in df).
final_table_path = OUTDIR/'_final_analysis_table.csv'
df.to_csv(final_table_path, index=False)
print('Saved:', final_table_path)

optional_preview_cols = ['UTS','Elastic Modulus','Density','Cost_USD_per_kg','Eco_Index','PC1','PC2','Cluster']
preview_cols = ['Material Name'] + [c for c in optional_preview_cols if c in df.columns]
print(df[preview_cols].head(10))

Saved: analysis_outputs\_final_analysis_table.csv
                                       Material Name    Density           PC1  \
0               Schott Glass 8347 Borosilicate Glass   2.230000  4.227790e-06   
1  Morgan Advanced Ceramics Superwool® Plus Tank ...  52.564922  3.164500e-07   
2                                            Hafnium  13.310000 -3.489278e-06   
3  Chosun Refractories HSC-18RA Dense and High-St...  61.029983 -4.120852e-07   
4                   MarkeTech CWO (Cadmium tungstate   7.900000  3.164752e-07   
5                                    AISI 1137 Steel  25.912931 -1.453258e+03   
6                                   AISI 4320H Steel  25.912931  1.546742e+03   
7         Advanced Ceramics ACL 1091 (C-799) Alumina  61.029983 -4.120833e-07   
8                     Mateck Gallium Arsenide (GaAs)   5.316000  1.163711e-07   
9  Aremco Ceramabind™ 642-A High Temperature Inor...  44.694031  1.504707e-06   

        PC2  Cluster  
0 -1.718092        0  
1  0.167245 