In [539]:
import zipfile
import pandas as pd  # oppure: import csv

zip_path     = 'dataset_laptop.zip'
csv_inside   = 'laptop.csv'   # percorso interno al .zip

with zipfile.ZipFile(zip_path, 'r') as z:
    with z.open(csv_inside) as f:
        # Se vuoi usare pandas:
        df = pd.read_csv(f)
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Name,Price,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,Display_type,Display,GPU,GPU_Brand,SSD,HDD,Adapter,Battery_Life
0,0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA) Laptop (11.6 Inch | MediaTek Octa Core | 4 GB | Google Chrome |)::585119::computer::laptops,22990,MediaTek Octa-core,MediaTek,Not Expandable,4 GB,DDR4 RAM,2.0 Ghz Processor,LED,11.6,Integrated Graphics,MediaTek,64 GB SSD Storage,No HDD,45,Upto 12 Hrs Battery Life
1,1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15.6 Inch | AMD Hexa Core Ryzen 5 | 8 GB | Windows 11 | 512 GB SSD)::594497::computer::laptops,36289,AMD Hexa-Core Ryzen 5,AMD,12 GB Expandable,8 GB,DDR4 RAM,4.0 Ghz Processor,LCD,15.6,Radeon,AMD,512 GB SSD Storage,No HDD,65,Upto 11 Hrs Battery Life
2,3,Dell,Dell G15-5520 (D560822WIN9B) Laptop (15.6 Inch | Core i5 12th Gen | 16 GB | Windows 11 | 512 GB SSD)::595299::computer::laptops,78500,Intel Core i5 (12th Gen),Intel,32 GB Expandable,16 GB,DDR5 RAM,3.3 Ghz Processor,LCD,15.6,"GeForce RTX 3050 GPU, 4 GB",NVIDIA,512 GB SSD Storage,No HDD,56,Upto 10 Hrs Battery Life
3,4,HP,HP 15s-fy5007TU (91R03PA) Laptop (15.6 Inch | Core i5 12th Gen | 8 GB | Windows 11 | 512 GB SSD)::616095::computer::laptops,55490,Intel Core i5 (12th Gen),Intel,8 GB Expandable,8 GB,DDR4 RAM,4.2 Ghz Processor,LCD,15.6,Iris Xe,Intel,512 GB SSD Storage,No HDD,no,Upto 7.30 Hrs Battery Life
4,6,Infinix,Infinix Inbook Y2 Plus XL29 Laptop (15.6 Inch | Core i3 11th Gen | 8 GB | Windows 11 | 512 GB SSD)::615700::computer::laptops,21990,Intel Core i3 (11th Gen),Intel,Not Expandable,8 GB LP,LPDDR4X RAM,1.7 Ghz Processor,LCD,15.6,UHD,Intel,512 GB SSD Storage,No HDD,45,Upto 8 Hrs Battery Life


individuazione NaN

In [540]:
print(df.isna().sum())

Unnamed: 0           0
Brand                0
Name                 0
Price                0
Processor_Name       0
Processor_Brand      0
RAM_Expandable       0
RAM                  0
RAM_TYPE             0
Ghz                  0
Display_type         0
Display              0
GPU                  8
GPU_Brand            4
SSD                  0
HDD                  0
Adapter              0
Battery_Life       418
dtype: int64


rimozione NaN per la colonna GPU

In [541]:
df['GPU'].value_counts()

GPU
UHD                                             651
Iris Xe                                         573
Radeon                                          313
GeForce RTX 3050 GPU, 4 GB                      203
Integrated                                      156
GeForce GTX 1650 GPU, 4 GB                      144
UHD 620                                         120
GeForce RTX 4050 GPU, 6 GB                       93
GeForce RTX 3060 GPU, 6 GB                       90
HD 620                                           85
GeForce RTX 2050 GPU, 4 GB                       82
GeForce RTX 4060 GPU, 8 GB                       71
Arc                                              54
HD 520                                           54
Radeon Vega 8                                    54
GeForce RTX 3050 Ti GPU, 4 GB                    50
Geforce GTX 1650 GPU, 4 GB                       37
UHD 600                                          36
GeForce RTX 3050 GPU, 6 GB                       36
GeForce 

tentativo di standardizzazione dei nomi GPU dato che molte erano le stesse GPU ma scritte in modo diverso

In [542]:
import re
import pandas as pd

def standardizza_gpu(raw: str) -> str:
    s = str(raw).lower().strip()
    # 1. rimuovo parole chiave ridondanti
    s = re.sub(r'\bgpu\b', '', s)
    s = re.sub(r'\bgraphics?\b', '', s)
    s = re.sub(r'\bgb\b', '', s)
    # 2. uniformo varianti max-q / max q
    s = re.sub(r'max[\s-]?q', 'max-q', s)
    # 3. pulisco punteggiatura
    s = re.sub(r'[,/()]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # 4. estraggo brand+modello
    #   a) NVIDIA GeForce (RTX/GTX/MX)
    m = re.search(r'(geforce|gtx|rtx|mx)\s*([0-9]{3,4})', s)
    if m:
        return f"GeForce {m.group(2)}"
    #   b) AMD Radeon RX/Vega
    m = re.search(r'(radeon)\s*(rx\s*[0-9]{3,4}|vega\s*\d+)', s)
    if m:
        modello = m.group(2).replace(' ', '').upper()
        return f"Radeon {modello}"
    #   c) Intel Iris / UHD / HD
    m = re.search(r'(iris xe|max-q|iris plus|uhd|hd)\s*(\d{3})?', s)
    if m:
        name = m.group(1).title()
        num  = m.group(2) or ''
        return f"{name} {num}".strip()
    #   d) Apple M1/M2
    if 'm1' in s or 'm2' in s:
        return s.upper().replace(' ', '')
    #   e) Adreno, Arc, Quadro, T-series
    m = re.search(r'(adreno|arc|quadro|t\d{3,4})', s)
    if m:
        return m.group(1).title()
    # 5. integrated generic
    if 'integrated' in s:
        return 'Integrated'
    # 6. fallback: ripropongo il raw originale (o 'Other')
    return raw

# Applico la funzione al DataFrame
df['gpu_standard'] = df['GPU'].apply(standardizza_gpu)

# Controllo le nuove categorie
print(df['gpu_standard'].value_counts())

gpu_standard
Uhd                            674
Iris Xe                        589
Radeon                         313
GeForce 3050                   302
GeForce 1650                   226
Integrated                     160
Uhd 620                        122
GeForce 4050                    94
GeForce 3060                    93
Hd 620                          86
GeForce 2050                    83
GeForce 4060                    74
GeForce 1050                    67
Arc                             59
Hd 520                          57
Radeon VEGA8                    55
GeForce 3070                    42
Hd                              38
Uhd 600                         37
GeForce 4070                    35
GeForce 150                     35
GeForce 250                     26
GeForce 940                     26
Radeon R5                       21
GeForce 130                     21
GeForce 450                     21
Radeon R2                       20
GeForce 2060                    20
Radeon 

In [543]:
df[df['gpu_standard'].isna()].head(8)

Unnamed: 0.1,Unnamed: 0,Brand,Name,Price,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,Display_type,Display,GPU,GPU_Brand,SSD,HDD,Adapter,Battery_Life,gpu_standard
1008,1073,Samsung,Samsung Galaxy Book 4 NP750XGK-LG3IN Laptop (15.6 Inch | Core 7 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::618124::computer::laptops,79999,Intel Core 7 (Series 1),Intel,Not Expandable,16 GB LP,LPDDR4X RAM,1.8 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,45,45W Adapter,
1052,1118,HP,HP 15-fd1096TU (A03CJPA) Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::617370::computer::laptops,64990,Intel Core 5 (Series 1),Intel,Not Expandable,8 GB,DDR4 RAM,4.8 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,65,65W Adapter,
1117,1185,Samsung,Samsung Galaxy Book 4 NP750XGK-KS2IN Laptop (15.6 Inch | Core 5 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::616489::computer::laptops,72990,Intel Core 5 (Series 1),Intel,Not Expandable,16 GB LP,LPDDR4X RAM,1.4 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,45,45W Adapter,
2147,2303,Samsung,Samsung Galaxy Book 4 NP750XGK-KG1IN Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::616544::computer::laptops,70990,Intel Core 5 (Series 1),Intel,Not Expandable,8 GB LP,LPDDR4X RAM,1.4 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,45,45W Adapter,
2153,2309,Samsung,Samsung Galaxy Book 4 NP750XGK-KG2IN Laptop (15.6 Inch | Core 5 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::616532::computer::laptops,75990,Intel Core 5 (Series 1),Intel,Not Expandable,16 GB LP,LPDDR4X RAM,1.4 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,45,45W Adapter,
2621,2944,HP,HP Envy x360 14-fc0106TU (A00PQPA) Laptop (14 Inch | Core Ultra 7 | 16 GB | Windows 11 | 512 GB SSD)::616523::computer::laptops,105990,Intel Core Ultra 7,Intel,Not Expandable,16 GB LP,LPDDR5 RAM,4.8 Ghz Processor,LED,14.0,,Intel,512 GB SSD Storage,No HDD,65,65W Adapter,
2625,2948,Samsung,Samsung Galaxy Book 4 NP750XGK-KS1IN Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::616490::computer::laptops,68990,Intel Core 5 (Series 1),Intel,Not Expandable,8 GB LP,LPDDR4X RAM,1.4 Ghz Processor,LCD,15.6,,Intel,512 GB SSD Storage,No HDD,45,45W Adapter,
2655,2979,HP,HP Envy x360 14-fc0100TU (9Z835PA) Laptop (14 Inch | Core Ultra 7 | 32 GB | Windows 11 | 1 TB SSD)::616524::computer::laptops,129990,Intel Core Ultra 7,Intel,Not Expandable,32 GB LP,LPDDR5 RAM,4.8 Ghz Processor,LED,14.0,,Intel,1024 GB SSD Storage,No HDD,65,65W Adapter,


In [544]:
serie_idx = pd.Series(df[df['gpu_standard'].isna()].Name, name='Name of Laptop without GPU')
print(serie_idx)

1008    Samsung Galaxy Book 4 NP750XGK-LG3IN Laptop (15.6 Inch | Core 7 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::618124::computer::laptops
1052                 HP 15-fd1096TU (A03CJPA) Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::617370::computer::laptops
1117    Samsung Galaxy Book 4 NP750XGK-KS2IN Laptop (15.6 Inch | Core 5 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::616489::computer::laptops
2147     Samsung Galaxy Book 4 NP750XGK-KG1IN Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::616544::computer::laptops
2153    Samsung Galaxy Book 4 NP750XGK-KG2IN Laptop (15.6 Inch | Core 5 Series 1 | 16 GB | Windows 11 | 512 GB SSD)::616532::computer::laptops
2621           HP Envy x360 14-fc0106TU (A00PQPA) Laptop (14 Inch | Core Ultra 7 | 16 GB | Windows 11 | 512 GB SSD)::616523::computer::laptops
2625     Samsung Galaxy Book 4 NP750XGK-KS1IN Laptop (15.6 Inch | Core 5 Series 1 | 8 GB | Windows 11 | 512 GB SSD)::616490::computer::laptops

In [545]:
indici_nan_gpu = df[df['gpu_standard'].isna()].index.tolist()
print(indici_nan_gpu)

[1008, 1052, 1117, 2147, 2153, 2621, 2625, 2655]


inserimento manuale dopo ricerca su internet

In [546]:
df.at[1008,  'gpu_standard'] = 'Iris Xe'
df.at[1052,  'gpu_standard'] = 'Integrated'
df.at[1117,  'gpu_standard'] = 'Iris Xe'
df.at[2147, 'gpu_standard'] = 'Iris Xe'
df.at[2153,  'gpu_standard'] = 'Iris Xe'
df.at[2621, 'gpu_standard'] = 'Integrated'
df.at[2625,  'gpu_standard'] = 'Iris Xe'
df.at[2655, 'gpu_standard'] = 'Integrated'

rimozione NaN per la colonna GPU_Brand

In [547]:
df[df['GPU_Brand'].isna()].head()

Unnamed: 0.1,Unnamed: 0,Brand,Name,Price,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,Display_type,Display,GPU,GPU_Brand,SSD,HDD,Adapter,Battery_Life,gpu_standard
3275,3671,Lenovo,Lenovo essential G50-45 (80E300RGIN) Laptop (15.6 Inch | AMD Quad Core A8 | 8 GB | DOS | 1 TB HDD)::578763::computer::laptops,32500,2.0 Ghz Processor,2.0,16 GB Expandable,8 GB,DDR3 RAM,0,LED,15.6,"R5 M230 GPU, 2 GB",,NO SSD,1024 GB HDD Storage,65,Upto 4 Hrs Battery Life,R5M2302
3491,3901,Lenovo,Lenovo essential G50-45 (80E3014FIN) Laptop (15.6 Inch | AMD Quad Core A8 | 4 GB | Windows 8.1 | 500 GB HDD)::578768::computer::laptops,25890,AMD Quad-Core A8 APU,AMD,16 GB Expandable,4 GB,DDR3 RAM,2.0 Ghz Processor,LED,15.6,R5,,NO SSD,500 GB HDD Storage,65,Upto 4 Hrs Battery Life,R5
3599,4016,Apple,Apple MacBook Pro MR932HN/A Ultrabook (15.4 Inch | Core i7 8th Gen | 16 GB | macOS High Sierra | 256 GB SSD)::576740::computer::laptops,187990,Intel Core i7 (8th Gen),Intel,32 GB Expandable,16 GB,DDR4 RAM,2.2 Ghz Processor,LED,15.4,"Pro 555X GPU, 4 GB",,256 GB SSD Storage,No HDD,no,Upto 10 Hrs Battery Life,"Pro 555X GPU, 4 GB"
3699,4121,Apple,Apple MacBook Pro MR962HN/A Ultrabook (15.4 Inch | Core i7 8th Gen | 16 GB | macOS High Sierra | 256 GB SSD)::576737::computer::laptops,199900,Intel Core i7 (8th Gen),Intel,32 GB Expandable,16 GB,DDR4 RAM,2.2 Ghz Processor,LED,15.4,"Pro 555X GPU, 4 GB",,256 GB SSD Storage,No HDD,no,Upto 10 Hrs Battery Life,"Pro 555X GPU, 4 GB"


In [548]:
categorie = df['GPU_Brand'].unique()
print("Categorie trovate:", categorie)

Categorie trovate: ['MediaTek' 'AMD' 'NVIDIA' 'Intel' 'Apple' 'Qualcomm' 'NIVIDIA' 'Nvidia'
 'ARM' nan 'Microsoft' 'ATI']


In [549]:
missing_idx = df[df['GPU_Brand'].isna()].index.tolist()
print(missing_idx)


[3275, 3491, 3599, 3699]


In [550]:
df.at[3275,  'GPU_Brand'] = 'AMD'
df.at[3491,  'GPU_Brand'] = 'AMD'
df.at[3599,  'GPU_Brand'] = 'Apple'
df.at[3699, 'GPU_Brand'] = 'Apple'



cambio valuta da rupie indiane a euro

In [551]:
tasso_inr_eur = 0.0102

df['price_eur'] = (df['Price'] * tasso_inr_eur).round(2)

standardizzazione dei Processor_Name

In [552]:
df['Processor_Name'].value_counts()

Processor_Name
Intel Core i5 (11th Gen)              316
Intel Core i5 (12th Gen)              294
AMD Hexa-Core Ryzen 5                 245
AMD Octa-Core Ryzen 7                 239
Intel Core i3 (11th Gen)              189
Intel Core i5 (10th Gen)              177
Intel Core i5 (8th Gen)               174
Intel Core i7 (12th Gen)              171
Intel Core i3 (10th Gen)              153
Intel Core i7 (13th Gen)              136
Intel Core i5 (13th Gen)              134
AMD Quad-Core Ryzen 5                 108
Intel Celeron Dual-Core               104
Intel Core i7 (11th Gen)               92
Intel Core i3 (12th Gen)               90
Intel Core i3 (7th Gen)                72
Intel Core i7 (8th Gen)                68
AMD Dual-Core Ryzen 3                  67
Intel Core i3 (6th Gen)                62
Intel Core i5 (7th Gen)                60
Intel Core i7 (10th Gen)               58
Intel Core Ultra 7                     52
AMD Quad-Core Ryzen 3                  46
AMD Octa-Core Ryzen

In [553]:
df['Processor_Name'] = (
    df['Processor_Name']
      .str.replace(r'\bProcessor\b', '', regex=True)  # toglie solo la parola intera
      .str.replace(r'\s+', ' ',       regex=True)     # unisce eventuali doppi spazi
      .str.strip()                                   # rimuove spazi iniziali/finali
)

estrazione dei soli numeri per le categorie numeriche, escludendo eventuali caratteri aggiuntivi

In [554]:
# helper per pullulare numeri
import re
def estrai_numero(x):
    m = re.search(r'(\d+(\.\d+)?)', str(x))
    return float(m.group(1)) if m else None

# Price
df['price'] = df['price_eur'].apply(estrai_numero)

#Ghz
df['ghz'] = df['Ghz'].apply(estrai_numero)

# RAM (GB)
df['ram_gb']   = df['RAM'].apply(estrai_numero)

# SSD (GB)
df['ssd_gb']   = df['SSD'].apply(
    lambda x: 0 if str(x).strip().lower() == 'no ssd'
              else estrai_numero(x)
)
# HDD (se presente)
df['hdd_gb']   = df['HDD'].apply(estrai_numero).fillna(0)

# Display size (inch)
df['display_inch'] = df['Display'].apply(estrai_numero)

# Adapter wattaggio
df['adapter_w'] = df['Adapter'].apply(
    lambda x: 0 if str(x).strip().lower() == 'no'
              else estrai_numero(x)
)

# Battery life (ore)
def estrai_ore(x):
    # es. “Upto 7.30 Hrs” → 7.5
    nums = re.findall(r'(\d+(?:\.\d+)?)', str(x))
    if not nums: return None
    h = float(nums[0])
    # se c’è .30, consideralo come decimale
    return h
df['battery_hrs'] = df['Battery_Life'].apply(estrai_ore)

df['ram_expandable_gb'] = df['RAM_Expandable'].apply(
    lambda x: 0 if 'Not' in str(x) else estrai_numero(x)
)

df.head()


Unnamed: 0.1,Unnamed: 0,Brand,Name,Price,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,...,price_eur,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb
0,0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA) Laptop (11.6 Inch | MediaTek Octa Core | 4 GB | Google Chrome |)::585119::computer::laptops,22990,MediaTek Octa-core,MediaTek,Not Expandable,4 GB,DDR4 RAM,2.0 Ghz Processor,...,234.5,234.5,2.0,4.0,64.0,0.0,11.6,45.0,12.0,0.0
1,1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15.6 Inch | AMD Hexa Core Ryzen 5 | 8 GB | Windows 11 | 512 GB SSD)::594497::computer::laptops,36289,AMD Hexa-Core Ryzen 5,AMD,12 GB Expandable,8 GB,DDR4 RAM,4.0 Ghz Processor,...,370.15,370.15,4.0,8.0,512.0,0.0,15.6,65.0,11.0,12.0
2,3,Dell,Dell G15-5520 (D560822WIN9B) Laptop (15.6 Inch | Core i5 12th Gen | 16 GB | Windows 11 | 512 GB SSD)::595299::computer::laptops,78500,Intel Core i5 (12th Gen),Intel,32 GB Expandable,16 GB,DDR5 RAM,3.3 Ghz Processor,...,800.7,800.7,3.3,16.0,512.0,0.0,15.6,56.0,10.0,32.0
3,4,HP,HP 15s-fy5007TU (91R03PA) Laptop (15.6 Inch | Core i5 12th Gen | 8 GB | Windows 11 | 512 GB SSD)::616095::computer::laptops,55490,Intel Core i5 (12th Gen),Intel,8 GB Expandable,8 GB,DDR4 RAM,4.2 Ghz Processor,...,566.0,566.0,4.2,8.0,512.0,0.0,15.6,0.0,7.3,8.0
4,6,Infinix,Infinix Inbook Y2 Plus XL29 Laptop (15.6 Inch | Core i3 11th Gen | 8 GB | Windows 11 | 512 GB SSD)::615700::computer::laptops,21990,Intel Core i3 (11th Gen),Intel,Not Expandable,8 GB LP,LPDDR4X RAM,1.7 Ghz Processor,...,224.3,224.3,1.7,8.0,512.0,0.0,15.6,45.0,8.0,0.0


standardizzazione nel caso ci fossero stesse categorie ma con maiuscole o minuscole diverse

In [555]:
df['processor_brand'] = df['Processor_Brand'].str.lower()
df['gpu_brand']       = df['GPU_Brand'].str.lower()
df['ram_type']        = df['RAM_TYPE'].str.upper()

df.head()


Unnamed: 0.1,Unnamed: 0,Brand,Name,Price,Processor_Name,Processor_Brand,RAM_Expandable,RAM,RAM_TYPE,Ghz,...,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb,processor_brand,gpu_brand,ram_type
0,0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA) Laptop (11.6 Inch | MediaTek Octa Core | 4 GB | Google Chrome |)::585119::computer::laptops,22990,MediaTek Octa-core,MediaTek,Not Expandable,4 GB,DDR4 RAM,2.0 Ghz Processor,...,4.0,64.0,0.0,11.6,45.0,12.0,0.0,mediatek,mediatek,DDR4 RAM
1,1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15.6 Inch | AMD Hexa Core Ryzen 5 | 8 GB | Windows 11 | 512 GB SSD)::594497::computer::laptops,36289,AMD Hexa-Core Ryzen 5,AMD,12 GB Expandable,8 GB,DDR4 RAM,4.0 Ghz Processor,...,8.0,512.0,0.0,15.6,65.0,11.0,12.0,amd,amd,DDR4 RAM
2,3,Dell,Dell G15-5520 (D560822WIN9B) Laptop (15.6 Inch | Core i5 12th Gen | 16 GB | Windows 11 | 512 GB SSD)::595299::computer::laptops,78500,Intel Core i5 (12th Gen),Intel,32 GB Expandable,16 GB,DDR5 RAM,3.3 Ghz Processor,...,16.0,512.0,0.0,15.6,56.0,10.0,32.0,intel,nvidia,DDR5 RAM
3,4,HP,HP 15s-fy5007TU (91R03PA) Laptop (15.6 Inch | Core i5 12th Gen | 8 GB | Windows 11 | 512 GB SSD)::616095::computer::laptops,55490,Intel Core i5 (12th Gen),Intel,8 GB Expandable,8 GB,DDR4 RAM,4.2 Ghz Processor,...,8.0,512.0,0.0,15.6,0.0,7.3,8.0,intel,intel,DDR4 RAM
4,6,Infinix,Infinix Inbook Y2 Plus XL29 Laptop (15.6 Inch | Core i3 11th Gen | 8 GB | Windows 11 | 512 GB SSD)::615700::computer::laptops,21990,Intel Core i3 (11th Gen),Intel,Not Expandable,8 GB LP,LPDDR4X RAM,1.7 Ghz Processor,...,8.0,512.0,0.0,15.6,45.0,8.0,0.0,intel,intel,LPDDR4X RAM


rimozione colonne inutili

In [556]:
df_clean = df.drop(['Unnamed: 0','Price','price_eur','Ghz','RAM','SSD','HDD','GPU','Display','Adapter','Battery_Life','RAM_Expandable','Processor_Brand','GPU_Brand','RAM_TYPE'], axis=1)
df_clean.rename(columns={'gpu_standard': 'GPU'},
          inplace=True)
df_clean.head()

Unnamed: 0,Brand,Name,Processor_Name,Display_type,GPU,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb,processor_brand,gpu_brand,ram_type
0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA) Laptop (11.6 Inch | MediaTek Octa Core | 4 GB | Google Chrome |)::585119::computer::laptops,MediaTek Octa-core,LED,Integrated,234.5,2.0,4.0,64.0,0.0,11.6,45.0,12.0,0.0,mediatek,mediatek,DDR4 RAM
1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15.6 Inch | AMD Hexa Core Ryzen 5 | 8 GB | Windows 11 | 512 GB SSD)::594497::computer::laptops,AMD Hexa-Core Ryzen 5,LCD,Radeon,370.15,4.0,8.0,512.0,0.0,15.6,65.0,11.0,12.0,amd,amd,DDR4 RAM
2,Dell,Dell G15-5520 (D560822WIN9B) Laptop (15.6 Inch | Core i5 12th Gen | 16 GB | Windows 11 | 512 GB SSD)::595299::computer::laptops,Intel Core i5 (12th Gen),LCD,GeForce 3050,800.7,3.3,16.0,512.0,0.0,15.6,56.0,10.0,32.0,intel,nvidia,DDR5 RAM
3,HP,HP 15s-fy5007TU (91R03PA) Laptop (15.6 Inch | Core i5 12th Gen | 8 GB | Windows 11 | 512 GB SSD)::616095::computer::laptops,Intel Core i5 (12th Gen),LCD,Iris Xe,566.0,4.2,8.0,512.0,0.0,15.6,0.0,7.3,8.0,intel,intel,DDR4 RAM
4,Infinix,Infinix Inbook Y2 Plus XL29 Laptop (15.6 Inch | Core i3 11th Gen | 8 GB | Windows 11 | 512 GB SSD)::615700::computer::laptops,Intel Core i3 (11th Gen),LCD,Uhd,224.3,1.7,8.0,512.0,0.0,15.6,45.0,8.0,0.0,intel,intel,LPDDR4X RAM


In [557]:
print(df_clean.isna().sum())



Brand                  0
Name                   0
Processor_Name         0
Display_type           0
GPU                    0
price                  0
ghz                    0
ram_gb                 0
ssd_gb                 0
hdd_gb                 0
display_inch           1
adapter_w              0
battery_hrs          418
ram_expandable_gb      0
processor_brand        0
gpu_brand              0
ram_type               0
dtype: int64


rimozione NaN per la colonna display_inch

In [558]:
df_clean[df_clean['display_inch'].isna()].head()

Unnamed: 0,Brand,Name,Processor_Name,Display_type,GPU,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb,processor_brand,gpu_brand,ram_type
774,HP,HP ENVY 15 ep1087TX (54B88PA) Laptop (| Core i9 11th Gen | 32 GB | Windows 11 | 1 TB SSD)::594930::computer::laptops,Intel Core i9 (11th Gen),LED,GeForce 3060,1931.88,4.8,32.0,1024.0,0.0,,83.0,7.15,0.0,intel,nvidia,DDR4 RAM


In [559]:
print(df_clean[df_clean['display_inch'].isna()].Name.tolist())

['HP ENVY 15 ep1087TX (54B88PA) Laptop (| Core i9 11th Gen | 32 GB | Windows 11 | 1 TB SSD)::594930::computer::laptops']


In [560]:
indici_nan_display = df_clean[df_clean['Name'] == 'HP ENVY 15 ep1087TX (54B88PA) Laptop (| Core i9 11th Gen | 32 GB | Windows 11 | 1 TB SSD)::594930::computer::laptops'].index.tolist()
print(indici_nan_display)


[774]


trovato manualmente in internet ed inserito

In [561]:
df_clean.at[774,  'display_inch'] = 15.6

In [562]:
print(df_clean.isna().sum())

Brand                  0
Name                   0
Processor_Name         0
Display_type           0
GPU                    0
price                  0
ghz                    0
ram_gb                 0
ssd_gb                 0
hdd_gb                 0
display_inch           0
adapter_w              0
battery_hrs          418
ram_expandable_gb      0
processor_brand        0
gpu_brand              0
ram_type               0
dtype: int64


i NaN in battery_hrs verrano lasciati per consentire al chatbot di rispondere che non si hanno informazioni

In [563]:
df_clean.head()

Unnamed: 0,Brand,Name,Processor_Name,Display_type,GPU,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb,processor_brand,gpu_brand,ram_type
0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA) Laptop (11.6 Inch | MediaTek Octa Core | 4 GB | Google Chrome |)::585119::computer::laptops,MediaTek Octa-core,LED,Integrated,234.5,2.0,4.0,64.0,0.0,11.6,45.0,12.0,0.0,mediatek,mediatek,DDR4 RAM
1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN) Laptop (15.6 Inch | AMD Hexa Core Ryzen 5 | 8 GB | Windows 11 | 512 GB SSD)::594497::computer::laptops,AMD Hexa-Core Ryzen 5,LCD,Radeon,370.15,4.0,8.0,512.0,0.0,15.6,65.0,11.0,12.0,amd,amd,DDR4 RAM
2,Dell,Dell G15-5520 (D560822WIN9B) Laptop (15.6 Inch | Core i5 12th Gen | 16 GB | Windows 11 | 512 GB SSD)::595299::computer::laptops,Intel Core i5 (12th Gen),LCD,GeForce 3050,800.7,3.3,16.0,512.0,0.0,15.6,56.0,10.0,32.0,intel,nvidia,DDR5 RAM
3,HP,HP 15s-fy5007TU (91R03PA) Laptop (15.6 Inch | Core i5 12th Gen | 8 GB | Windows 11 | 512 GB SSD)::616095::computer::laptops,Intel Core i5 (12th Gen),LCD,Iris Xe,566.0,4.2,8.0,512.0,0.0,15.6,0.0,7.3,8.0,intel,intel,DDR4 RAM
4,Infinix,Infinix Inbook Y2 Plus XL29 Laptop (15.6 Inch | Core i3 11th Gen | 8 GB | Windows 11 | 512 GB SSD)::615700::computer::laptops,Intel Core i3 (11th Gen),LCD,Uhd,224.3,1.7,8.0,512.0,0.0,15.6,45.0,8.0,0.0,intel,intel,LPDDR4X RAM
