#Objective
This notebook focuses on cleaning and uniting cpu, and gpu names into shared groups, example: i3, i4, amd ... ect

In [3]:
import pandas as pd

df = pd.read_csv("full_merged_dataset.csv")

# Define file paths
FILE_PATH = 'full_merged_dataset_copy.csv'
OUTPUT_PATH = 'cleaned_dataset_leena.csv'

#1- Load Data

we first load our data:

In [4]:
df = pd.read_csv("full_merged_dataset.csv")

print(df.head())

        PRICE LAPTOP_CONDITION    LAPTOP_BRAND    LAPTOP_MODEL  \
0  75000000.0          BON TAT  NeedToBeFilled         IDEAPAD   
1  33500000.0    JAMAIS UTILIS  NeedToBeFilled            AERO   
2  17000000.0   NeedToBeFilled  NeedToBeFilled         STEALTH   
3  12000000.0   NeedToBeFilled  NeedToBeFilled             ROG   
4  11000000.0          BON TAT  NeedToBeFilled  NeedToBeFilled   

             DEDICATED_GPU     GPU_GENERAL  GPU_INTEGRATED  \
0           NeedToBeFilled  NeedToBeFilled  NeedToBeFilled   
1  NVIDIA GEFORCE RTX 3060  NeedToBeFilled  NeedToBeFilled   
2  NVIDIA GEFORCE GTX 1060  NeedToBeFilled  NeedToBeFilled   
3  NVIDIA GEFORCE RTX 1650  NeedToBeFilled  NeedToBeFilled   
4        AMD RADEON RX 580  NeedToBeFilled  NeedToBeFilled   

                             CPU RAM_SIZE        RAM_TYPE        SSD_SIZE  \
0             INTEL CORE I5 750S      4GB  NeedToBeFilled           128GB   
1  11TH GEN INTEL CORE I7 11800H     16GB  NeedToBeFilled             1TB   

#2- Clean CPU names

We will first start with the CPU, for which we will group them to the following: intel core, i3, i4, ..., i9 , amd, and ryzen:

In [7]:
df["CPU"] = df["CPU"].fillna("none")
df["CPU"].unique()


df["CPU"] = (
    df["CPU"]
      .str.lower()
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

for i in range(1, 10):  # i1 through i9
    df["CPU"] = df["CPU"].str.replace(
        rf".*i{i}.*",  # notice the `f` after `r` ‚Üí raw f-string
        f"i{i}",
        case=False,
        regex=True
    )

for i in range(1, 3):  # i1 through i9
    df["CPU"] = df["CPU"].str.replace(
        rf".*m{i}.*",  # notice the `f` after `r` ‚Üí raw f-string
        f"apple",
        case=False,
        regex=True
    )


df["CPU"] = df["CPU"].str.replace(
    r".*amd.*",
    "AMD",
    case=False,
    regex=True
)

df["CPU"] = df["CPU"].str.replace(
    r".*ryzen.*",
    "RYZEN",
    case=False,
    regex=True
)

df["CPU"] = df["CPU"].str.replace(
    r".*core.*",
    "intel core",
    case=False,
    regex=True
)

counts = df["CPU"].value_counts()

df["CPU"] = df["CPU"].where(
    df["CPU"].map(counts) > 100,
    "other"
)

df = df[df["CPU"].str.lower() != "none"]

df["CPU"] = (
    df["CPU"]
      .str.lower()
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

print(df["CPU"].unique())

df["CPU"].value_counts()

['i5' 'i7' 'amd' ... 'intel 15 11300' 'cpu 2.20 ghz 2.20 ghz' 'ddr2']
['i5' 'i7' 'amd' 'i3' 'i9' 'apple' 'intel celeron' 'other' 'intel core'
 'apple m4' 'apple m3' 'intel celeron n4020' 'ryzen']


Unnamed: 0_level_0,count
CPU,Unnamed: 1_level_1
i5,19545
i7,12267
amd,4877
other,2897
i3,2885
apple,1876
intel core,1744
i9,1321
ryzen,527
apple m3,325


#3- Cleaning the GPU_INTEGRATED Column

First we are renaming the values to not have only english characters:

In [13]:
df["GPU_GENERAL"] = df["GPU_GENERAL"].fillna("Unknown")
df["GPU_GENERAL"].head()

translation_map = {
    "Integr√©e": "Integrated",
    "D√©di√©e": "Dedicated",
    "Integr√©e + D√©di√©e": "Integrated + Dedicated",
    'Int√©gr√©e' : "Integrated"
}

df["GPU_GENERAL"] = df["GPU_GENERAL"].replace(translation_map)

df["GPU_GENERAL"].unique()

array(['NeedToBeFilled', 'Integrated + Dedicated', 'Integrated',
       'Dedicated'], dtype=object)

#4- Cleaning the DEDICATED_GPU Column

Here we do the same thing we did for the CPUs, we group them to 4 major bins: Nvidia, AMD, apple, other, none

In [28]:
df["DEDICATED_GPU"].unique()

df["DEDICATED_GPU"] = (
    df["DEDICATED_GPU"]
      .str.lower()
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

df["DEDICATED_GPU"] = df["DEDICATED_GPU"].str.replace( #getting renamed to apple for some reason
    r".*NeedToBeFilled.*",
    "None",
    case=False,
    regex=True
)

df["DEDICATED_GPU"] = df["DEDICATED_GPU"].str.replace(
    r".*nvidia.*",
    "Nvidia",
    case=False,
    regex=True
)

df["DEDICATED_GPU"] = df["DEDICATED_GPU"].str.replace(
    r".*amd.*",
    "AMD",
    case=False,
    regex=True
)

df["DEDICATED_GPU"] = df["DEDICATED_GPU"].str.replace(
    r".*apple.*",
    "Apple",
    case=False,
    regex=True
)



counts = df["DEDICATED_GPU"].value_counts()

df["DEDICATED_GPU"] = df["DEDICATED_GPU"].where(
    df["DEDICATED_GPU"].map(counts) > 50,
    "other"
)

df["DEDICATED_GPU"].unique()

df["DEDICATED_GPU"].value_counts()



Unnamed: 0_level_0,count
DEDICATED_GPU,Unnamed: 1_level_1
Apple,38509
Nvidia,8233
other,1124
AMD,810


#5- Cleaning GPU_INTEGRATED Column
Same as the previous, however this one we only focus on intel, amd and apple since they are the dominant force here:

In [20]:
df["GPU_INTEGRATED"].unique()

df["GPU_INTEGRATED"] = (
    df["GPU_INTEGRATED"]
      .str.lower()
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
)

df["GPU_INTEGRATED"] = df["GPU_INTEGRATED"].str.replace(
    r".*intel.*",
    "Intel",
    case=False,
    regex=True
)

df["GPU_INTEGRATED"] = df["GPU_INTEGRATED"].str.replace(
    r".*amd.*",
    "AMD",
    case=False,
    regex=True
)

df["GPU_INTEGRATED"] = df["GPU_INTEGRATED"].str.replace(
    r".*apple.*",
    "Apple",
    case=False,
    regex=True
)

df["GPU_INTEGRATED"] = df["GPU_INTEGRATED"].str.replace(
    r".*NeedToBeFilled.*",
    "None",
    case=False,
    regex=True
)


counts = df["GPU_INTEGRATED"].value_counts()

df["GPU_INTEGRATED"] = df["GPU_INTEGRATED"].where(
    df["GPU_INTEGRATED"].map(counts) > 200,
    "other"
)

df["GPU_INTEGRATED"].unique()

df["GPU_INTEGRATED"].value_counts()



Unnamed: 0_level_0,count
GPU_INTEGRATED,Unnamed: 1_level_1
none,37621
other,5074
Intel,4607
AMD,1145
Apple,229


#6- Resolving Contradictions:
This section is to find/resolve Contradicions between the CPU/GPU_INTEGRATED/DEDICATED_GPU and GPU_INTEGRATED:

In [24]:
contradictions = df[
    ((df["GPU_INTEGRATED"] != "None") & (df["GPU_GENERAL"] == "Dedicated")) |
    ((df["GPU_INTEGRATED"] == "None") & (df["GPU_GENERAL"] == "Integrated")) |
    ((df["DEDICATED_GPU"] == "None") & df["GPU_GENERAL"].str.contains("Dedicated")) |
    ((df["DEDICATED_GPU"] != "None") & (df["GPU_GENERAL"] == "Integrated"))
]

print(contradictions)

# not working as intended

          PRICE     LAPTOP_CONDITION    LAPTOP_BRAND    LAPTOP_MODEL  \
16408  115000.0            Etat neuf            DELL  NeedToBeFilled   
16410      -1.0            Etat neuf              Hp  NeedToBeFilled   
16411  117000.0             Bon √©tat  NeedToBeFilled  NeedToBeFilled   
16412   59000.0           Etat moyen            DELL  NeedToBeFilled   
16414   70000.0            Etat neuf           Intel  NeedToBeFilled   
...         ...                  ...             ...             ...   
37217       1.0             Bon √©tat  NeedToBeFilled  NeedToBeFilled   
37219      -1.0             Bon √©tat     Razer Blade  NeedToBeFilled   
37220  171500.0  Neuf jamais utilis√©              HP  NeedToBeFilled   
37221   30000.0             Bon √©tat         Applesüçé  NeedToBeFilled   
37223  249000.0            Etat neuf           APPLE  NeedToBeFilled   

      DEDICATED_GPU GPU_GENERAL GPU_INTEGRATED    CPU RAM_SIZE  \
16408         Apple  Integrated          Intel     i5    16 G

#7- Export Output:
Exporting output:

In [29]:
df.sample(20)

Unnamed: 0,PRICE,LAPTOP_CONDITION,LAPTOP_BRAND,LAPTOP_MODEL,DEDICATED_GPU,GPU_GENERAL,GPU_INTEGRATED,CPU,RAM_SIZE,RAM_TYPE,SSD_SIZE,HDD_SIZE,STORAGE_SIZE,STORAGE_TYPE,SCREEN_SIZE,SCREEN_FREQUENCY,SCREEN_RESOLUTION,CITY,POST_YEAR,POST_MONTH
5313,128000.0,JAMAIS UTILIS,NeedToBeFilled,GALAXY,Apple,NeedToBeFilled,none,i5,8GB,DDR4X,256GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,13.3,NeedToBeFilled,1920x1080,ALGER CENTRE,2024,9
32880,135000.0,Neuf jamais utilis√©,Hp,NeedToBeFilled,Apple,Integrated,AMD,amd,16 GO,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,1000GO,SSD,"17.3""",NeedToBeFilled,NeedToBeFilled,Boumerdes,-1,-1
38963,129000.0,Never Used (New),LG,NeedToBeFilled,Apple,NeedToBeFilled,none,i7,32GB,NeedToBeFilled,1000GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,17.0,NeedToBeFilled,QHD,NeedToBeFilled,2025,7
14256,41000.0,BON TAT,NeedToBeFilled,SPIN,Apple,NeedToBeFilled,none,i3,8GB,NeedToBeFilled,128GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,14.0,NeedToBeFilled,NeedToBeFilled,OULED FAYET,2025,6
20622,52000.0,Etat neuf,HP PROBOOK,NeedToBeFilled,Apple,Integrated,Intel,i5,8 GO,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,128GO,SSD,"12""",NeedToBeFilled,NeedToBeFilled,Alger centre,-1,-1
8961,85000.0,NeedToBeFilled,NeedToBeFilled,YOGA,Apple,NeedToBeFilled,none,i5,16GB,NeedToBeFilled,512GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,14.0,NeedToBeFilled,NeedToBeFilled,ANNABA,2025,6
37807,460000.0,Never Used (New),APPLE,MACBOOK PRO,Apple,NeedToBeFilled,none,other,32GB,DDR5X,1000GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,14.0,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,2024,10
37786,490000.0,Never Used (New),APPLE,MACBOOK PRO,Apple,NeedToBeFilled,none,other,24GB,DDR5,512GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,14.0,NeedToBeFilled,3K,NeedToBeFilled,2025,7
20471,279000.0,Etat neuf,HP ZBOOK POWER G10,NeedToBeFilled,Nvidia,Integrated + Dedicated,other,i7,32 GO,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,1024GO,SSD,"16""",NeedToBeFilled,NeedToBeFilled,Bab ezzouar,-1,-1
39547,134000.0,Never Used (New),DELL,LATITUDE,Apple,NeedToBeFilled,none,i7,32GB,DDR5,512GB,NeedToBeFilled,NeedToBeFilled,NeedToBeFilled,14.0,NeedToBeFilled,FHD,NeedToBeFilled,2025,7
