## Senario B - Task 1

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import re
import os
import re
from pathlib import Path

# Warnings filtering.
import warnings
warnings.filterwarnings('ignore')

In [67]:
rfq_path = (r"C:\Users\black\vanilla-steel-assessment\data\rfq.csv")
ref_path = (r"C:\Users\black\vanilla-steel-assessment\data\reference_properties.tsv")

In [68]:
df_rfq = pd.read_csv(rfq_path)
df_ref = pd.read_csv(ref_path, sep="\t")

In [69]:
df_ref.head(3)

Unnamed: 0,Grade/Material,UNS_No,Steel_No,Standards,Carbon (C),Manganese (Mn),Silicon (Si),Sulfur (S),Phosphorus (P),Chromium (Cr),...,Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,S235JR,,,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
1,S275JR,,,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
2,S355JR,,,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,...,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,


In [70]:
df_rfq.head(3)

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,weight_min,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,15000.0,25000.0,610.0,610.0,,,,,760.0,810.0
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,,,,,
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,,,,,


In [None]:
def normalize_grade(grade):
    if pd.isna(grade):
        return None

    grade = str(grade).strip().upper()
    grade = grade.replace(" ", "")
    grade = grade.replace("+", "")

    # remove suffixes if needed
    suffixes_to_remove = ["JR", "J0", "J2", "G3"]
    for suffix in suffixes_to_remove:
        if grade.endswith(suffix):
            grade = grade.replace(suffix, "")
    
    return grade

In [72]:
# Normalize RFQ grades
df_rfq["grade"] = df_rfq["grade"].apply(normalize_grade)

# Normalize reference grades
df_ref["grade"] = df_ref["Grade/Material"].apply(normalize_grade)

In [73]:
df_joined = df_rfq.merge(df_ref, on="grade", how="left", suffixes=("", "_ref"))

In [74]:
df_joined.head(3)

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,...,Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,...,,,,,,Standard Specifications,"Cold forming, automotive, high strength applic...",Microalloyed Steel,≤0.22,
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,...,,,,,,Standard Specifications,Structural galvanized steel,Galvanized Steel,,Hot-dip galvanized
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,...,,,,,,Standard Specifications,Galvanized steel for forming,Galvanized Steel,,Hot-dip galvanized


In [75]:
# Check how many RFQs didn’t match any grade in reference
missing_count = df_joined['Grade/Material'].isna().sum()
print(f"RFQs with missing grade match: {missing_count}")

# Optionally: add a column to flag it
df_joined['grade_match_missing'] = df_joined['Grade/Material'].isna()

RFQs with missing grade match: 59


In [76]:
categorical_cols = [
    "Standards", "Application", "Category"
]

for col in categorical_cols:
    if col in df_joined.columns:
        df_joined[col] = df_joined[col].fillna("Unknown")

In [77]:
import re
import numpy as np

def parse_numeric_range(value):
    if pd.isna(value):
        return np.nan
    value = str(value).strip()

    # Match a single number or a range like "0.15–0.25"
    match = re.match(r'^(\d*\.?\d+)\s*[–-]\s*(\d*\.?\d+)$', value)  # range
    if match:
        low = float(match.group(1))
        high = float(match.group(2))
        return (low + high) / 2

    # Match a value with ≤ or ≥ or ≈ etc.
    match = re.match(r'^[≤≥<>≈~]*\s*(\d*\.?\d+)$', value)
    if match:
        return float(match.group(1))

    # Try just parsing it directly
    try:
        return float(value)
    except:
        return np.nan

In [None]:
# Apply to Carbon (C) column    
df_joined['Carbon_clean'] = df_joined['Carbon (C)'].apply(parse_numeric_range)

# impute
median_val = df_joined['Carbon_clean'].median()
df_joined['Carbon_clean'] = df_joined['Carbon_clean'].fillna(median_val)


In [79]:
raw_cols = ['Carbon (C)', 'Manganese (Mn)', 'Silicon (Si)', 'Phosphorus (P)']
for col in raw_cols:
    new_col = col.replace(" ", "_").replace("(", "").replace(")", "") + "_clean"
    df_joined[new_col] = df_joined[col].apply(parse_numeric_range)
    median_val = df_joined[new_col].median()
    df_joined[new_col] = df_joined[new_col].fillna(median_val)

In [80]:
df_joined[['Carbon (C)', 'Carbon_C_clean']].head()

Unnamed: 0,Carbon (C),Carbon_C_clean
0,≤0.12,0.12
1,≤0.25,0.25
2,≤0.12,0.12
3,≤0.17,0.17
4,≤0.17,0.17


## Senario B - Task 2

In [81]:
# Create thickness point
df_joined["thickness_mid"] = (df_joined["thickness_min"] + df_joined["thickness_max"]) / 2

# Do the same for width and yield strength
df_joined["width_mid"] = (df_joined["width_min"] + df_joined["width_max"]) / 2
df_joined["yield_mid_rfq"] = (df_joined["yield_strength_min"] + df_joined["yield_strength_max"]) / 2
df_joined["tensile_mid_rfq"] = (df_joined["tensile_strength_min"] + df_joined["tensile_strength_max"]) / 2

In [82]:
def compute_iou(min1, max1, min2, max2):
    if pd.isna(min1) or pd.isna(max1) or pd.isna(min2) or pd.isna(max2):
        return 0.0
    inter = max(0, min(max1, max2) - max(min1, min2))
    union = max(max1, max2) - min(min1, min2)
    return inter / union if union != 0 else 0.0

In [83]:
null_ratio = df_joined.isnull().mean()
sparse_cols = null_ratio[null_ratio > 0.7].index.tolist()
df_joined = df_joined.drop(columns=sparse_cols)

In [84]:
df_joined.head(3)


Unnamed: 0,id,grade,finish,form,thickness_min,thickness_max,width_min,width_max,weight_min,weight_max,...,Application,Category,grade_match_missing,Carbon_clean,Carbon_C_clean,Manganese_Mn_clean,Silicon_Si_clean,Phosphorus_P_clean,thickness_mid,width_mid
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,Oiled,Coils,6.0,6.0,600.0,1520.0,15000.0,25000.0,...,"Cold forming, automotive, high strength applic...",Microalloyed Steel,False,0.12,0.12,2.1,0.6,0.025,6.0,1060.0
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,Hot-dip zinc magnesium (+ZM),Slit Coils,1.5,1.5,327.0,327.0,,,...,Structural galvanized steel,Galvanized Steel,False,0.25,0.25,1.2,0.5,0.12,1.5,327.0
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,Hot-dip Galvanized (+Z/+GI),Coils,0.4,0.4,1000.0,1500.0,,,...,Galvanized steel for forming,Galvanized Steel,False,0.12,0.12,0.6,0.5,0.12,0.4,1250.0


----

# Senario B - Task 2

In [85]:
print(df_joined.columns.tolist())

['id', 'grade', 'finish', 'form', 'thickness_min', 'thickness_max', 'width_min', 'width_max', 'weight_min', 'weight_max', 'Grade/Material', 'Standards', 'Carbon (C)', 'Manganese (Mn)', 'Silicon (Si)', 'Sulfur (S)', 'Phosphorus (P)', 'Aluminum (Al)', 'Tensile strength (Rm)', 'Yield strength (Re or Rp0.2)', 'Elongation (A%)', 'Source_Pages', 'Application', 'Category', 'grade_match_missing', 'Carbon_clean', 'Carbon_C_clean', 'Manganese_Mn_clean', 'Silicon_Si_clean', 'Phosphorus_P_clean', 'thickness_mid', 'width_mid']


In [86]:
df_joined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159 entries, 0 to 1158
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            1159 non-null   object 
 1   grade                         1100 non-null   object 
 2   finish                        745 non-null    object 
 3   form                          1072 non-null   object 
 4   thickness_min                 959 non-null    float64
 5   thickness_max                 961 non-null    float64
 6   width_min                     633 non-null    float64
 7   width_max                     787 non-null    float64
 8   weight_min                    465 non-null    float64
 9   weight_max                    760 non-null    float64
 10  Grade/Material                1100 non-null   object 
 11  Standards                     1159 non-null   object 
 12  Carbon (C)                    1100 non-null   object 
 13  Man

In [87]:
def parse_range(value):
    import re
    if pd.isna(value):
        return (np.nan, np.nan, np.nan)
    value = str(value).strip()
    match = re.match(r'^(\d+)\s*[–-]\s*(\d+)$', value)
    if match:
        low, high = float(match.group(1)), float(match.group(2))
        mid = (low + high) / 2
        return (low, high, mid)
    match = re.match(r'^[≤≥<>~]*\s*(\d+)$', value)
    if match:
        val = float(match.group(1))
        return (val, val, val)
    return (np.nan, np.nan, np.nan)

# Apply to tensile & yield strength
df_joined[['tensile_min', 'tensile_max', 'tensile_mid']] = df_joined['Tensile strength (Rm)'].apply(parse_range).apply(pd.Series)
df_joined[['yield_min', 'yield_max', 'yield_mid']] = df_joined['Yield strength (Re or Rp0.2)'].apply(parse_range).apply(pd.Series)

In [88]:
# Similarity between RFQ and ref strength midpoints
df_joined["yield_similarity"] = 1 - abs(df_joined["yield_mid"] - df_joined["thickness_mid"]) / df_joined["yield_mid"].max()
df_joined["tensile_similarity"] = 1 - abs(df_joined["tensile_mid"] - df_joined["width_mid"]) / df_joined["tensile_mid"].max()

In [89]:
df_joined["tensile_strength_min"] = df_joined["tensile_mid"]  # just for testing

In [90]:
similarity_features = [
    "iou_thickness", "iou_width", 
    "match_form", "match_finish", 
    "yield_similarity", "tensile_similarity"
]

In [None]:
# Features engineerd for similarity
similarity_features = [
    "iou_thickness",       # overlap in thickness
    "iou_width",           # overlap in width
    "match_form",          # 1 if form matches
    "match_finish",        # 1 if finish matches
    "yield_similarity",    # numeric comparison
    "tensile_similarity"   # numeric comparison
]

# Define corresponding weights (sum = 1.0)
weights = {
    "iou_thickness": 0.25,
    "iou_width": 0.25,
    "match_form": 0.15,
    "match_finish": 0.15,
    "yield_similarity": 0.10,
    "tensile_similarity": 0.10
}

In [None]:
def compute_iou(min1, max1, min2, max2):
    if pd.isna(min1) or pd.isna(max1) or pd.isna(min2) or pd.isna(max2):
        return 0.0
    inter = max(0, min(max1, max2) - max(min1, min2))
    union = max(max1, max2) - min(min1, min2)
    return inter / union if union != 0 else 0.0

# Simulate fixed values for reference
df_joined["ref_thickness_min"] = df_joined["thickness_mid"]
df_joined["ref_thickness_max"] = df_joined["thickness_mid"]

df_joined["ref_width_min"] = df_joined["width_mid"]
df_joined["ref_width_max"] = df_joined["width_mid"]

# Compute IoU
df_joined["iou_thickness"] = df_joined.apply(
    lambda row: compute_iou(row["thickness_min"], row["thickness_max"], row["ref_thickness_min"], row["ref_thickness_max"]),
    axis=1
)

df_joined["iou_width"] = df_joined.apply(
    lambda row: compute_iou(row["width_min"], row["width_max"], row["ref_width_min"], row["ref_width_max"]),
    axis=1
)

In [93]:
# Safe string comparison (ignores case & handles NaN)
df_joined["match_form"] = (
    df_joined["form"].astype(str).str.strip().str.lower() ==
    df_joined["Category"].astype(str).str.strip().str.lower()
).astype(int)

df_joined["match_finish"] = (
    df_joined["finish"].astype(str).str.strip().str.lower() ==
    df_joined["finish"].astype(str).str.strip().str.lower()
).astype(int)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Weighted feature matrix
X = df_joined[similarity_features].fillna(0).copy()
for col in similarity_features:
    X[col] *= weights[col]

# Cosine similarity
X_matrix = X.values
rfq_ids = df_joined["id"].values

similarity_matrix = cosine_similarity(X_matrix)
np.fill_diagonal(similarity_matrix, 0)

# Get top-3
top3_results = []
for i, rfq_id in enumerate(rfq_ids):
    sims = similarity_matrix[i]
    top_indices = sims.argsort()[-3:][::-1]
    for j in top_indices:
        top3_results.append({
            "rfq_id": rfq_id,
            "match_id": rfq_ids[j],
            "similarity_score": round(sims[j], 4)
        })

In [96]:
top3_df

Unnamed: 0,rfq_id,match_id,similarity_score
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,26ea35b5-2ccf-4fdc-884f-50a7c3a000b8,1.0
1,8aff426d-b8c0-43aa-ad26-835ef4de6129,90b02a62-3da9-4648-948c-52627d086a50,1.0
2,8aff426d-b8c0-43aa-ad26-835ef4de6129,d3db6f2f-6218-4612-8a63-c26d4ac69658,1.0


In [95]:
# Save
top3_df = pd.DataFrame(top3_results).head(3)
top3_df.to_csv("../../output/top3.csv", index=False)
print("✅ top3.csv saved!")

✅ top3.csv saved!
