In [18]:
import pandas as pd
import numpy as np
import os

In [19]:
# Input: raw per-image metadata CSV. The path is relative, so it resolves against the
# notebook's current working directory.
raw_data_path = "../data/raw/ten_fabrics_metadata.csv"  # CSV generated in an earlier step
# Output: destination of the cleaned metadata CSV written by the save cell below.
cleaned_data_path = "../data/processed/cleaned_fabrics_data.csv"

In [20]:
# Read the raw metadata CSV into `df` and report its dimensions.
print("📥 Loading dataset from:", raw_data_path)

try:
    df = pd.read_csv(raw_data_path)
except FileNotFoundError:
    # Give a readable hint, then re-raise so the notebook stops at this cell.
    print("❌ File not found! Please check the file path.")
    raise
else:
    n_rows, n_cols = df.shape
    print(f"✅ Loaded {n_rows} rows and {n_cols} columns\n")

📥 Loading dataset from: ../data/raw/ten_fabrics_metadata.csv
✅ Loaded 2833 rows and 4 columns



In [21]:
# Show the first rows so the raw column contents can be inspected before cleaning.
print("🔍 Sample data:")
display(df.head())

🔍 Sample data:


Unnamed: 0,fabric_id,image_name,defect_label,file_path
0,1,001-001.png,256\n256\n16\n16\n0\n0 0 0 0 0 0 0 0 0 0 0 0 0...,data/raw/TFD Textile Dataset/001/001-001.png
1,1,001-002.png,256\n256\n16\n16\n0\n0 0 0 0 0 0 0 0 0 0 0 0 0...,data/raw/TFD Textile Dataset/001/001-002.png
2,1,001-003.png,256\n256\n16\n16\n1\n0 0 0 0 0 0 0 0 0 0 0 0 0...,data/raw/TFD Textile Dataset/001/001-003.png
3,1,001-004.png,256\n256\n16\n16\n0\n0 0 0 0 0 0 0 0 0 0 0 0 0...,data/raw/TFD Textile Dataset/001/001-004.png
4,1,001-005.png,256\n256\n16\n16\n0\n0 0 0 0 0 0 0 0 0 0 0 0 0...,data/raw/TFD Textile Dataset/001/001-005.png


In [22]:
# Flatten each label onto one line: cast to str, turn embedded newlines into
# spaces, and trim surrounding whitespace.
if 'defect_label' not in df.columns:
    print("⚠️ 'defect_label' column not found! Skipping cleanup for that column.")
else:
    df['defect_label'] = (
        df['defect_label']
        .astype(str)
        .str.replace('\n', ' ')
        .str.strip()
    )

In [23]:
def extract_defect_code(label, field_index=4):
    """Parse the integer defect code out of a whitespace-separated label string.

    Args:
        label: Raw label value. It is coerced with ``str()`` and split on any
            whitespace (spaces or newlines).
        field_index: Zero-based position of the token to parse. Defaults to 4
            because, in this dataset, the 5th token seems to represent the
            defect code.

    Returns:
        The token at ``field_index`` converted to ``int``, or ``np.nan`` when
        the label has too few tokens or the token is not a valid integer.
    """
    parts = str(label).split()
    try:
        return int(parts[field_index])
    except (IndexError, ValueError):
        # Only genuine parse failures map to "no code". A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

# Derive `defect_code` from the cleaned labels; if the label column is absent,
# fill the new column with NaN.
df['defect_code'] = (
    df['defect_label'].apply(extract_defect_code)
    if 'defect_label' in df.columns
    else np.nan
)

In [24]:
# Drop rows whose defect code could not be parsed, then store the codes as ints.
# Both steps are done in a single chain: assigning a column on the frame returned
# by `dropna` can trigger pandas' SettingWithCopyWarning when rows were removed,
# whereas `astype` with a column->dtype mapping returns a fresh frame.
df = (
    df.dropna(subset=['defect_code'])
      .astype({'defect_code': int})
)


In [25]:
# Remove rows that are exact duplicates across every column (pandas keeps the
# first occurrence by default).
df = df.drop_duplicates()


In [26]:
def check_image_exists(path, base_dir=".."):
    """Return True if ``path`` exists on disk, resolved against ``base_dir``.

    Args:
        path: Image path taken from the metadata. A relative path is joined
            onto ``base_dir``; an absolute path is used unchanged, because
            ``os.path.join`` discards earlier components when a later one is
            absolute.
        base_dir: Directory that relative paths are resolved against. Defaults
            to ``".."``.

    Returns:
        ``True`` when the resolved path exists, ``False`` otherwise. Values that
        are not path-like (e.g. NaN or None from a missing cell) and blank
        strings also give ``False``.
    """
    # Missing cells arrive as float NaN / None; os.path.join would raise TypeError.
    if not isinstance(path, (str, os.PathLike)):
        return False
    # A blank path would resolve to base_dir itself and wrongly count as present.
    if isinstance(path, str) and not path.strip():
        return False
    return os.path.exists(os.path.join(base_dir, path))

# Flag every row by whether its image file is present, then keep only those rows.
if 'file_path' not in df.columns:
    print("⚠️ 'file_path' column not found! Skipping image existence check.")
else:
    df['image_exists'] = df['file_path'].apply(check_image_exists)
    df = df[df['image_exists'].eq(True)]

In [27]:
# Write the cleaned frame to disk, creating the destination folder first if needed.
output_dir = os.path.dirname(cleaned_data_path)
if output_dir:
    # os.makedirs("") raises FileNotFoundError, so skip it when the path has no
    # directory component (the file then goes into the current directory).
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(cleaned_data_path, index=False)

print("\n💾 Cleaned dataset saved successfully!")
print(f"📁 File: {cleaned_data_path}")
print(f"🧾 Total Clean Records: {len(df)}")


💾 Cleaned dataset saved successfully!
📁 File: ../data/processed/cleaned_fabrics_data.csv
🧾 Total Clean Records: 2833


In [28]:
# Report how many rows fall under each defect code, then list the final columns.
if 'defect_code' in df.columns:
    print("\nSummary of Defect Labels:", df['defect_code'].value_counts(), sep="\n")

print("\nColumns in final dataset:", df.columns.tolist(), sep="\n")


Summary of Defect Labels:
defect_code
0      2118
2       113
1       110
16       96
32       40
       ... 
112       1
127       1
72        1
83        1
149       1
Name: count, Length: 84, dtype: int64

Columns in final dataset:
['fabric_id', 'image_name', 'defect_label', 'file_path', 'defect_code', 'image_exists']
