In [None]:
import zipfile
import pandas as pd  # oppure: import csv

zip_path     = 'dataset_laptop.zip'
csv_inside   = 'laptop.csv'   # percorso interno al .zip

with zipfile.ZipFile(zip_path, 'r') as z:
    with z.open(csv_inside) as f:
        # Se vuoi usare pandas:
        df = pd.read_csv(f)
df.head()

In [None]:
df.columns

individuazione NaN

In [None]:
print(df.isna().sum())

sostituzione dei NaN (dati mancanti) in battery_hrs in -1 che poi verranno gestiti dal chatbot

In [None]:
df['Battery_Life'] = df['Battery_Life'].fillna(-1)

rimozione NaN per la colonna GPU

In [None]:
df['GPU'].value_counts()

tentativo di standardizzazione dei nomi GPU dato che molte erano le stesse GPU ma scritte in modo diverso

In [None]:
import re
import pandas as pd

def standardizza_gpu(raw: str) -> str:
    s = str(raw).lower().strip()
    # 1. rimuovo parole chiave ridondanti
    s = re.sub(r'\bgpu\b', '', s)
    s = re.sub(r'\bgraphics?\b', '', s)
    s = re.sub(r'\bgb\b', '', s)
    # 2. uniformo varianti max-q / max q
    s = re.sub(r'max[\s-]?q', 'max-q', s)
    # 3. pulisco punteggiatura
    s = re.sub(r'[,/()]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    # 4. estraggo brand+modello
    #   a) NVIDIA GeForce (RTX/GTX/MX)
    m = re.search(r'(geforce|gtx|rtx|mx)\s*([0-9]{3,4})', s)
    if m:
        return f"GeForce {m.group(2)}"
    #   b) AMD Radeon RX/Vega
    m = re.search(r'(radeon)\s*(rx\s*[0-9]{3,4}|vega\s*\d+)', s)
    if m:
        modello = m.group(2).replace(' ', '').upper()
        return f"Radeon {modello}"
    #   c) Intel Iris / UHD / HD
    m = re.search(r'(iris xe|max-q|iris plus|uhd|hd)\s*(\d{3})?', s)
    if m:
        name = m.group(1).title()
        num  = m.group(2) or ''
        return f"{name} {num}".strip()
    #   d) Apple M1/M2
    if 'm1' in s or 'm2' in s:
        return s.upper().replace(' ', '')
    #   e) Adreno, Arc, Quadro, T-series
    m = re.search(r'(adreno|arc|quadro|t\d{3,4})', s)
    if m:
        return m.group(1).title()
    # 5. integrated generic
    if 'integrated' in s:
        return 'Integrated'
    # 6. fallback: ripropongo il raw originale (o 'Other')
    return raw

# Applico la funzione al DataFrame
df['gpu_standard'] = df['GPU'].apply(standardizza_gpu)

# Controllo le nuove categorie
print(df['gpu_standard'].value_counts())

In [None]:
df[df['gpu_standard'].isna()].head(8)

In [None]:
serie_idx = pd.Series(df[df['gpu_standard'].isna()].Name, name='Name of Laptop without GPU')
print(serie_idx)

In [None]:
indici_nan_gpu = df[df['gpu_standard'].isna()].index.tolist()
print(indici_nan_gpu)

inserimento manuale dopo ricerca su internet

In [None]:
df.at[1008,  'gpu_standard'] = 'Iris Xe'
df.at[1052,  'gpu_standard'] = 'Integrated'
df.at[1117,  'gpu_standard'] = 'Iris Xe'
df.at[2147, 'gpu_standard'] = 'Iris Xe'
df.at[2153,  'gpu_standard'] = 'Iris Xe'
df.at[2621, 'gpu_standard'] = 'Integrated'
df.at[2625,  'gpu_standard'] = 'Iris Xe'
df.at[2655, 'gpu_standard'] = 'Integrated'

rimozione NaN per la colonna GPU_Brand

In [None]:
df[df['GPU_Brand'].isna()].head()

In [None]:
categorie = df['GPU_Brand'].unique()
print("Categorie trovate:", categorie)

In [None]:
missing_idx = df[df['GPU_Brand'].isna()].index.tolist()
print(missing_idx)


In [None]:
df.at[3275,  'GPU_Brand'] = 'AMD'
df.at[3491,  'GPU_Brand'] = 'AMD'
df.at[3599,  'GPU_Brand'] = 'Apple'
df.at[3699, 'GPU_Brand'] = 'Apple'



cambio valuta da rupie indiane a euro

In [None]:
tasso_inr_eur = 0.0102

df['price_eur'] = (df['Price'] * tasso_inr_eur).round(2)

standardizzazione dei Processor_Name

In [None]:
df['Processor_Name'].value_counts()

In [None]:
df['Processor_Name'] = (
    df['Processor_Name']
      .str.replace(r'\bProcessor\b', '', regex=True)  # toglie solo la parola intera
      .str.replace(r'\s+', ' ',       regex=True)     # unisce eventuali doppi spazi
      .str.strip()                                   # rimuove spazi iniziali/finali
)

estrazione dei soli numeri per le categorie numeriche, escludendo eventuali caratteri aggiuntivi

In [None]:
# helper per pullulare numeri
import re
def estrai_numero(x):
    m = re.search(r'(\d+(\.\d+)?)', str(x))
    return float(m.group(1)) if m else None

# Price
df['price'] = df['price_eur'].apply(estrai_numero)

#Ghz
df['ghz'] = df['Ghz'].apply(estrai_numero)

# RAM (GB)
df['ram_gb']   = df['RAM'].apply(estrai_numero)

# SSD (GB)
df['ssd_gb']   = df['SSD'].apply(
    lambda x: 0 if str(x).strip().lower() == 'no ssd'
              else estrai_numero(x)
)
# HDD (se presente)
df['hdd_gb']   = df['HDD'].apply(estrai_numero).fillna(0)

# Display size (inch)
df['display_inch'] = df['Display'].apply(estrai_numero)

# Adapter wattaggio
df['adapter_w'] = df['Adapter'].apply(
    lambda x: 0 if str(x).strip().lower() == 'no'
              else estrai_numero(x)
)

# Battery life (ore)
def estrai_ore(x):
    # es. “Upto 7.30 Hrs” → 7.5
    nums = re.findall(r'(\d+(?:\.\d+)?)', str(x))
    if not nums: return None
    h = float(nums[0])
    # se c’è .30, consideralo come decimale
    return h
df['battery_hrs'] = df['Battery_Life'].apply(estrai_ore)

df['ram_expandable_gb'] = df['RAM_Expandable'].apply(
    lambda x: 0 if 'Not' in str(x) else estrai_numero(x)
)

df.head()


rimozione informazioni ridondanti nel nome del modello

In [686]:
num_righe = df.shape[0]
print(f"Numero di righe nel dataset: {num_righe}")

count_laptop = df['Name'].str.contains('Laptop', case=False, na=False).sum()
print(f"Numero di laptop nel dataset: {count_laptop}")


Numero di righe nel dataset: 3976
Numero di laptop nel dataset: 3976


In [687]:
df['Name'] = df['Name'].str.split('Laptop', n=1).str[0]
df['Name'] = df['Name'].str.split('Ultrabook', n=1).str[0]

rimozione colonne inutili

In [688]:
df_clean = df.drop(['Unnamed: 0','Price','price_eur','Ghz','RAM','SSD','HDD','GPU','Display','Adapter','Battery_Life','RAM_Expandable'], axis=1)
df_clean.rename(columns={'gpu_standard': 'gpu',                     
                         'Processor_Brand': 'processor_brand',
                         'GPU_Brand': 'gpu_brand',
                         'RAM_TYPE': 'ram_type',
                         'Processor_Name': 'processor_name',
                         'Name': 'name',
                         'Brand': 'brand',
                         'Display_type': 'display_type',},
          inplace=True)
df_clean.head()

Unnamed: 0,brand,name,processor_name,processor_brand,ram_type,display_type,gpu_brand,gpu,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb
0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA),MediaTek Octa-core,MediaTek,DDR4 RAM,LED,MediaTek,Integrated,234.5,2.0,4.0,64.0,0.0,11.6,45.0,12.0,0.0
1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN),AMD Hexa-Core Ryzen 5,AMD,DDR4 RAM,LCD,AMD,Radeon,370.15,4.0,8.0,512.0,0.0,15.6,65.0,11.0,12.0
2,Dell,Dell G15-5520 (D560822WIN9B),Intel Core i5 (12th Gen),Intel,DDR5 RAM,LCD,NVIDIA,GeForce 3050,800.7,3.3,16.0,512.0,0.0,15.6,56.0,10.0,32.0
3,HP,HP 15s-fy5007TU (91R03PA),Intel Core i5 (12th Gen),Intel,DDR4 RAM,LCD,Intel,Iris Xe,566.0,4.2,8.0,512.0,0.0,15.6,0.0,7.3,8.0
4,Infinix,Infinix Inbook Y2 Plus XL29,Intel Core i3 (11th Gen),Intel,LPDDR4X RAM,LCD,Intel,Uhd,224.3,1.7,8.0,512.0,0.0,15.6,45.0,8.0,0.0


In [689]:
print(df_clean.isna().sum())



brand                0
name                 0
processor_name       0
processor_brand      0
ram_type             0
display_type         0
gpu_brand            0
gpu                  0
price                0
ghz                  0
ram_gb               0
ssd_gb               0
hdd_gb               0
display_inch         1
adapter_w            0
battery_hrs          0
ram_expandable_gb    0
dtype: int64


rimozione NaN per la colonna display_inch

In [690]:
df_clean[df_clean['display_inch'].isna()].head()

Unnamed: 0,brand,name,processor_name,processor_brand,ram_type,display_type,gpu_brand,gpu,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb
774,HP,HP ENVY 15 ep1087TX (54B88PA),Intel Core i9 (11th Gen),Intel,DDR4 RAM,LED,NVIDIA,GeForce 3060,1931.88,4.8,32.0,1024.0,0.0,,83.0,7.15,0.0


In [691]:
print(df_clean[df_clean['display_inch'].isna()].name.tolist())

['HP ENVY 15 ep1087TX (54B88PA) ']


In [692]:
indici_nan_display = df_clean[df_clean['name'] == 'HP ENVY 15 ep1087TX (54B88PA) Laptop (| Core i9 11th Gen | 32 GB | Windows 11 | 1 TB SSD)::594930::computer::laptops'].index.tolist()
print(indici_nan_display)


[]


trovato manualmente in internet ed inserito

In [693]:
df_clean.at[774,  'display_inch'] = 15.6

In [694]:
print(df_clean.isna().sum())

brand                0
name                 0
processor_name       0
processor_brand      0
ram_type             0
display_type         0
gpu_brand            0
gpu                  0
price                0
ghz                  0
ram_gb               0
ssd_gb               0
hdd_gb               0
display_inch         0
adapter_w            0
battery_hrs          0
ram_expandable_gb    0
dtype: int64


In [695]:
df_clean.head()

Unnamed: 0,brand,name,processor_name,processor_brand,ram_type,display_type,gpu_brand,gpu,price,ghz,ram_gb,ssd_gb,hdd_gb,display_inch,adapter_w,battery_hrs,ram_expandable_gb
0,HP,HP Chromebook 11A-NA0002MU (2E4N0PA),MediaTek Octa-core,MediaTek,DDR4 RAM,LED,MediaTek,Integrated,234.5,2.0,4.0,64.0,0.0,11.6,45.0,12.0,0.0
1,Lenovo,Lenovo Ideapad Slim 3 (82KU017KIN),AMD Hexa-Core Ryzen 5,AMD,DDR4 RAM,LCD,AMD,Radeon,370.15,4.0,8.0,512.0,0.0,15.6,65.0,11.0,12.0
2,Dell,Dell G15-5520 (D560822WIN9B),Intel Core i5 (12th Gen),Intel,DDR5 RAM,LCD,NVIDIA,GeForce 3050,800.7,3.3,16.0,512.0,0.0,15.6,56.0,10.0,32.0
3,HP,HP 15s-fy5007TU (91R03PA),Intel Core i5 (12th Gen),Intel,DDR4 RAM,LCD,Intel,Iris Xe,566.0,4.2,8.0,512.0,0.0,15.6,0.0,7.3,8.0
4,Infinix,Infinix Inbook Y2 Plus XL29,Intel Core i3 (11th Gen),Intel,LPDDR4X RAM,LCD,Intel,Uhd,224.3,1.7,8.0,512.0,0.0,15.6,45.0,8.0,0.0


In [696]:
df_clean.to_csv("laptop.csv", index=False)