In [10]:
!pip install mlxtend




In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.preprocessing import TransactionEncoder
# Display settings: never truncate columns of wide frames, cap row output at 100.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(_option_name, _option_value)


In [14]:
# Load one CSV per component category; the order of the file names matches the
# order of the variables on the left-hand side.
cpu, gpu, ram, mobo, psu, storage = map(
    pd.read_csv,
    ["cpu.csv", "gpu.csv", "ram.csv", "motherboard.csv", "psu.csv", "storage.csv"],
)


cell 1

In [16]:
# Tag every row with the category of the table it came from, so the category is
# still known once the tables are combined.
for frame, label in (
    (cpu, "CPU"),
    (gpu, "GPU"),
    (ram, "RAM"),
    (mobo, "Motherboard"),
    (psu, "PSU"),
    (storage, "Storage"),
):
    frame["component_type"] = label


cell 2

In [None]:
# List the raw column headers of each table to see how their naming differs.
for heading, frame in (
    ("CPU columns:", cpu),
    ("GPU columns:", gpu),
    ("RAM columns:", ram),
    ("Motherboard columns:", mobo),
    ("PSU columns:", psu),
    ("Storage columns:", storage),
):
    print(heading, frame.columns.tolist())


**Column Standardization**

The original dataset consists of multiple CSV files representing different PC components. Each file uses slightly different column naming conventions. To ensure consistency and enable unified processing, column names are standardized across all datasets. This step improves readability, avoids ambiguity, and allows common attributes (e.g., brand, socket, wattage) to be analyzed uniformly.

In [19]:
def standardize_columns(df):
    """Return ``df`` with its known column headers renamed to snake_case.

    Only headers listed in the mapping below are renamed; every other column
    keeps its original header. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        A component table with the raw CSV headers.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the standardized headers.
    """
    canonical_names = dict([
        # identity / pricing
        ("Name", "name"),
        ("Price", "price"),
        ("Producer", "brand"),
        # platform / compatibility
        ("Socket", "socket"),
        ("Chipset", "chipset"),
        ("Ram Type", "ram_type"),
        ("Memory Type", "memory_type"),
        ("Form Factor", "form_factor"),
        # capacity and clocks
        ("Size", "capacity"),
        ("Clock", "clock"),
        ("Boost Clock", "boost_clock"),
        ("Base Clock", "base_clock"),
        # power
        ("Watt", "wattage"),
        ("Efficiency Rating", "efficiency"),
        # processor / graphics specifics
        ("Vram", "vram"),
        ("TDP", "tdp"),
        ("Cores", "cores"),
        ("Threads", "threads"),
    ])
    renamed = df.rename(columns=canonical_names)
    return renamed


**Applying Standardized Naming**

The column standardization function is applied to all component datasets. This ensures that similar attributes across different hardware types use the same naming scheme, which is necessary before merging the datasets into a single unified table.

In [20]:
# Rename the headers of all six component tables in one pass; the right-hand
# tuple is built from the current frames before any name is rebound.
cpu, gpu, ram, mobo, psu, storage = map(
    standardize_columns,
    (cpu, gpu, ram, mobo, psu, storage),
)


**Dataset Unification**

All standardized component datasets are merged into a single dataframe. Each row represents one hardware item, while a new component_type attribute identifies the category of the component. Missing values are expected at this stage because different components naturally have different specifications.

In [21]:
# Stack the component tables on top of each other. Columns that exist in only
# some tables are kept (filled with NaN elsewhere) in first-seen order, and the
# row index is renumbered from 0.
component_frames = [cpu, gpu, ram, mobo, psu, storage]
hardware_df = pd.concat(component_frames, ignore_index=True, sort=False)

hardware_df.head()


Unnamed: 0,name,price,brand,MPN,EAN,UPC,base_clock,Turbo Clock,Unlocked Multiplier,cores,threads,tdp,socket,Integrated GPU,Product Page,component_type,Length,Slots,8-Pin Connectors,6-Pin Connectors,HDMI,DisplayPort,DVI,VGA,boost_clock,vram,Memory Clock,ram_type,capacity,clock,Timings,Sticks,chipset,Unlocked,form_factor,memory_type,Memory Capacity,RAM Slots,SATA,Display Port,WiFi,Integrated Graphics,wattage,efficiency,Protocol,NAND,Controller
0,AMD Ryzen 5 5600X,$158.86 USD,AMD,100-100000065BOX,730143312042,,3.7 GHz,4.6 GHz,True,6.0,12.0,65 W,AM4,False,https://www.amazon.ca/dp/B08166SLDF?tag=pckomb...,CPU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AMD Athlon 3000G,$53.22 USD,AMD,YD3000C6FHBOX,730143311731,,3.5 GHz,,True,2.0,4.0,35 W,AM4,Radeon RX Vega 3,https://www.amazon.ca/dp/B0815JGFQ8?tag=pckomb...,CPU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AMD Ryzen 3 3300X,$150.09 USD,AMD,100-100000159BOX,730143312172,,3.8 GHz,4.3 GHz,True,4.0,8.0,65 W,AM4,False,https://www.amazon.ca/dp/B0876YS2T4?tag=pckomb...,CPU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,AMD Ryzen 5 5500,$87.04 USD,AMD,100-100000457BOX,730143314121,,3.6 GHz,4.2 GHz,True,6.0,12.0,65 W,AM4,False,https://www.amazon.ca/dp/B09VCJ171S?tag=pckomb...,CPU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AMD Ryzen 5 5600,$133.46 USD,AMD,100-100000927BOX,730143314190,,3.5 GHz,4.4 GHz,True,6.0,12.0,65 W,AM4,False,https://www.amazon.ca/dp/B09VCHR1VH?tag=pckomb...,CPU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


**Initial Dataset Assessment (Before Preprocessing)**

This step provides an overview of the unified dataset, including:

*   Total number of records
*   Total number of attributes
*   Proportion of missing values

This analysis highlights the sparsity inherent in multi-component hardware data and motivates the preprocessing steps that follow.

In [22]:
n_records, n_attributes = hardware_df.shape
print("Total records:", n_records)
print("Total attributes:", n_attributes)

# Fraction of missing cells per column; show the ten sparsest columns.
missing_share = hardware_df.isna().mean()
missing_share.sort_values(ascending=False).head(10)


Total records: 6357
Total attributes: 47


Unnamed: 0,0
Controller,0.94919
Turbo Clock,0.948561
Unlocked Multiplier,0.948403
threads,0.948403
cores,0.948403
Integrated GPU,0.948403
base_clock,0.948403
NAND,0.945415
Protocol,0.928268
efficiency,0.860941


**Feature Selection for Association Rule Mining**

Only attributes relevant to compatibility and association analysis are selected. Non-informative fields such as product codes, URLs, and identifiers are excluded. This reduces dimensionality, improves algorithm efficiency, and focuses rule generation on meaningful hardware relationships.

In [23]:
# Attributes kept for association rule mining (product codes, URLs and other
# identifiers are left out).
arm_features = [
    "component_type", "brand",
    "socket", "chipset",
    "ram_type", "memory_type",
    "form_factor", "capacity",
    "wattage", "vram",
    "efficiency",
]

# Independent copy so later edits do not touch hardware_df.
arm_df = hardware_df.loc[:, arm_features].copy()
arm_df.head()


Unnamed: 0,component_type,brand,socket,chipset,ram_type,memory_type,form_factor,capacity,wattage,vram,efficiency
0,CPU,AMD,AM4,,,,,,,,
1,CPU,AMD,AM4,,,,,,,,
2,CPU,AMD,AM4,,,,,,,,
3,CPU,AMD,AM4,,,,,,,,
4,CPU,AMD,AM4,,,,,,,,


**Handling Missing Values**

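The transaction encoding used below turns every attribute-value pair of a row into an item, so missing values are replaced with an explicit placeholder ("Unknown") instead of dropping rows. This prevents the loss of transactions while preserving the semantic meaning that a specification is not applicable or unavailable for a given component. Because most attributes are missing for most component types, placeholder items such as `chipset=Unknown` will be very frequent and should be interpreted with care (or filtered out) when rules are generated.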

In [24]:
# Replace every missing cell with the literal placeholder string so that each
# row later yields one item per selected attribute.
arm_df = arm_df.fillna(value="Unknown")


**Preprocessing Impact Evaluation**

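This step compares missing value statistics before and after preprocessing. The "before" figure is computed over all 47 attributes of the unified dataset, whereas the "after" figure covers only the 11 attributes selected for association rule mining, so the drop reflects both feature selection and placeholder imputation. Reporting both values provides transparency in data transformation, which is essential for reproducible data mining experiments.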

In [25]:
# Overall share of missing cells, in percent (mean of the per-column fractions).
# NOTE(review): the "before" value spans every column of hardware_df while the
# "after" value spans only the columns of arm_df, so the two figures are not
# computed over the same attributes.
pct_missing_before = hardware_df.isna().mean().mean() * 100
pct_missing_after = arm_df.isna().mean().mean() * 100

print("Before preprocessing:", "Missing values (%):", pct_missing_before, sep="\n")
print("\nAfter preprocessing:", "Missing values (%):", pct_missing_after, sep="\n")


Before preprocessing:
Missing values (%):
69.7241104629174

After preprocessing:
Missing values (%):
0.0


**Transaction Construction**

Each hardware item is converted into a transaction by representing attribute-value pairs as individual items. This transformation allows the originally non-transactional dataset to be used with association rule mining algorithms such as Apriori.


In [26]:
def _row_to_items(row):
    """Turn one row into a list of "column=value" strings, in column order."""
    return [f"{column}={value}" for column, value in row.items()]


# One transaction (list of items) per hardware row.
# NOTE(review): placeholder items such as "chipset=Unknown" are emitted too and
# may dominate frequent itemsets - consider filtering them before mining.
transactions = arm_df.apply(_row_to_items, axis=1).tolist()

transactions[:2]


[['component_type=CPU',
  'brand=AMD',
  'socket=AM4',
  'chipset=Unknown',
  'ram_type=Unknown',
  'memory_type=Unknown',
  'form_factor=Unknown',
  'capacity=Unknown',
  'wattage=Unknown',
  'vram=Unknown',
  'efficiency=Unknown'],
 ['component_type=CPU',
  'brand=AMD',
  'socket=AM4',
  'chipset=Unknown',
  'ram_type=Unknown',
  'memory_type=Unknown',
  'form_factor=Unknown',
  'capacity=Unknown',
  'wattage=Unknown',
  'vram=Unknown',
  'efficiency=Unknown']]