# Chatbot sulla scelta del PC Portatile migliore per l'utente

## ETL

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import ast
import scipy.stats as stats
import plotly.express as px
import re

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [97]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

In [98]:
file = 'laptops.csv'
laptops = pd.read_csv(file)
laptops.head()

Unnamed: 0.1,Unnamed: 0,img_link,name,price(in Rs.),processor,ram,os,storage,display(in inch),rating,no_of_ratings,no_of_reviews
0,0,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo Intel Core i5 11th Gen,62990,Intel Core i5 Processor (11th Gen),16 GB DDR4 RAM,Windows 11 Operating System,512 GB SSD,15.6,4.5,14.0,1.0
1,1,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo V15 G2 Core i3 11th Gen,37500,Intel Core i3 Processor (11th Gen),8 GB DDR4 RAM,64 bit Windows 11 Operating System,1 TB HDD|256 GB SSD,15.6,4.4,53.0,3.0
2,2,https://rukminim1.flixcart.com/image/312/312/l...,ASUS TUF Gaming F15 Core i5 10th Gen,49990,Intel Core i5 Processor (10th Gen),8 GB DDR4 RAM,Windows 11 Operating System,512 GB SSD,15.6,4.4,4733.0,463.0
3,3,https://rukminim1.flixcart.com/image/312/312/x...,ASUS VivoBook 15 (2022) Core i3 10th Gen,33990,Intel Core i3 Processor (10th Gen),8 GB DDR4 RAM,64 bit Windows 11 Operating System,512 GB SSD,15.6,4.3,10406.0,1040.0
4,4,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo Athlon Dual Core,18990,AMD Athlon Dual Core Processor,4 GB DDR4 RAM,DOS Operating System,256 GB SSD,14.0,3.8,18.0,3.0


In [99]:
laptops.isnull().sum()

Unnamed: 0            0
img_link              0
name                  0
price(in Rs.)         0
processor             0
ram                   0
os                    0
storage               0
display(in inch)      0
rating              296
no_of_ratings       296
no_of_reviews       296
dtype: int64

In [100]:
laptops= laptops.dropna()
laptops.to_csv('laptops_cleaned.csv', index=False)
laptops.isnull().sum()

Unnamed: 0          0
img_link            0
name                0
price(in Rs.)       0
processor           0
ram                 0
os                  0
storage             0
display(in inch)    0
rating              0
no_of_ratings       0
no_of_reviews       0
dtype: int64

In [101]:
# Carica il dataset
df = pd.read_csv("laptops_cleaned.csv")

# Controlliamo che la colonna "price(in Rs.)" esista nel dataset
if "price(in Rs.)" in df.columns:
    # Conversione INR -> EUR (tasso di cambio approssimativo)
    exchange_rate = 0.011
    df["price(in EUR)"] = df["price(in Rs.)"] * exchange_rate

    # Rimuove la colonna con la valuta vecchia
    df.drop(columns=["price(in Rs.)"], inplace=True)

    # Salva il nuovo dataset con i prezzi aggiornati
    new_filename = "laptops_etl.csv"
    df.to_csv(new_filename, index=False)

    print(f"✅ Conversione completata! Dataset salvato come '{new_filename}'.")

    # Mostra un'anteprima per verifica
    display(df.head())
else:
    print("❌ Errore: la colonna 'price(in Rs.)' non è presente nel dataset.")

✅ Conversione completata! Dataset salvato come 'laptops_etl.csv'.


Unnamed: 0.1,Unnamed: 0,img_link,name,processor,ram,os,storage,display(in inch),rating,no_of_ratings,no_of_reviews,price(in EUR)
0,0,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo Intel Core i5 11th Gen,Intel Core i5 Processor (11th Gen),16 GB DDR4 RAM,Windows 11 Operating System,512 GB SSD,15.6,4.5,14.0,1.0,692.89
1,1,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo V15 G2 Core i3 11th Gen,Intel Core i3 Processor (11th Gen),8 GB DDR4 RAM,64 bit Windows 11 Operating System,1 TB HDD|256 GB SSD,15.6,4.4,53.0,3.0,412.5
2,2,https://rukminim1.flixcart.com/image/312/312/l...,ASUS TUF Gaming F15 Core i5 10th Gen,Intel Core i5 Processor (10th Gen),8 GB DDR4 RAM,Windows 11 Operating System,512 GB SSD,15.6,4.4,4733.0,463.0,549.89
3,3,https://rukminim1.flixcart.com/image/312/312/x...,ASUS VivoBook 15 (2022) Core i3 10th Gen,Intel Core i3 Processor (10th Gen),8 GB DDR4 RAM,64 bit Windows 11 Operating System,512 GB SSD,15.6,4.3,10406.0,1040.0,373.89
4,4,https://rukminim1.flixcart.com/image/312/312/x...,Lenovo Athlon Dual Core,AMD Athlon Dual Core Processor,4 GB DDR4 RAM,DOS Operating System,256 GB SSD,14.0,3.8,18.0,3.0,208.89


In [102]:
laptosp_etl = df
laptosp_etl.isnull().sum()

Unnamed: 0          0
img_link            0
name                0
processor           0
ram                 0
os                  0
storage             0
display(in inch)    0
rating              0
no_of_ratings       0
no_of_reviews       0
price(in EUR)       0
dtype: int64

In [103]:
laptosp_etl= laptosp_etl.dropna()
laptosp_etl.to_csv("laptops_etl_clean.csv", index=False)
laptops_etl_clean = pd.read_csv("laptops_etl_clean.csv")
laptops_etl_clean.isnull().sum()

Unnamed: 0          0
img_link            0
name                0
processor           0
ram                 0
os                  0
storage             0
display(in inch)    0
rating              0
no_of_ratings       0
no_of_reviews       0
price(in EUR)       0
dtype: int64

In [104]:
# Carica il dataset
df = pd.read_csv("laptops_etl_clean.csv")

# Rimuovi i duplicati basandoti sulle colonne "name", "ram", "processor" e "storage"
df = df.drop_duplicates(subset=["name", "ram", "processor", "storage"])
print("Duplicati rimossi.")

# Definizione delle keywords che indicano l'inizio delle specifiche tecniche
keywords = ["Intel Core", "Core", "Ryzen", "Pentium", "Celeron", "M1", "M2"]

def clean_name(name):
    """
    Restituisce la parte di 'name' precedente alla prima occorrenza di una keyword.
    Se non viene trovata nessuna keyword, restituisce il nome intero.
    """
    indices = []
    for kw in keywords:
        match = re.search(re.escape(kw), name, re.IGNORECASE)
        if match:
            indices.append(match.start())
    if indices:
        cutoff = min(indices)
        return name[:cutoff].strip()
    else:
        return name.strip()

# Sovrascrivi la colonna "name" con la versione pulita
df["name"] = df["name"].apply(clean_name)

# Salva il nuovo dataset in un file CSV (senza l'indice)
df.to_csv("laptops_etl_clean.csv", index=False)

print("Dataset aggiornato: duplicati rimossi e colonna 'name' pulita. Il nuovo file è stato salvato come 'laptops_etl_clean.csv'.")


Duplicati rimossi.
Dataset aggiornato: duplicati rimossi e colonna 'name' pulita. Il nuovo file è stato salvato come 'laptops_etl_clean.csv'.


In [106]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[2:3]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: name
['Lenovo' 'Lenovo V15 G2' 'ASUS TUF Gaming F15' 'ASUS VivoBook 15 (2022)'
 'Lenovo Athlon Dual' 'APPLE 2020 Macbook Air' 'ASUS VivoBook 14 (2021)'
 'RedmiBook Pro' 'acer Aspire 3' 'ASUS Vivobook 14 (2022)'
 'ASUS Vivobook 15' 'ASUS' 'DELL Vostro' 'realme Book (Slim)' 'HP 14s'
 'HP' 'MSI Bravo 15' 'ASUS Zenbook Flip 14 OLED (2022) Touch Panel'
 'HP 15s' 'Lenovo IdeaPad 3' 'HP Athlon Dual'
 'ASUS VivoBook K15 OLED (2022)' 'ASUS ROG Strix G15 (2022)'
 'ASUS Vivobook Pro 15' 'SAMSUNG Galaxy Book Go Snapdragon 7c Gen 2'
 'HP Pavilion' 'acer Swift 3'
 'ASUS TUF Gaming F17 (2022) with 90Whr Battery' 'ASUS VivoBook 14'
 'acer Extensa' 'ASUS VivoBook 14 (2022)' 'DELL'
 'ASUS Vivobook Ultra 14 (2022)' 'ASUS Zenbook 14X (2022) Space Edition'
 'MSI Alpha 15 AMD Advantage Edition' 'ASUS VivoBook K15 OLED'
 'acer Aspire 7' 'ASUS Chromebook' 'ASUS Chromebook Flip Touch'
 'ASUS Vivobook 16X' 'Infinix X1 Slim Series' 'Lenovo IdeaPad'
 'ASUS ZenBook Duo 14 (2021) Touch Panel' 'acer Aspire 

In [107]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[3:4]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: processor
['Intel Core i5 Processor (11th Gen)' 'Intel Core i3 Processor (11th Gen)'
 'Intel Core i5 Processor (10th Gen)' 'Intel Core i3 Processor (10th Gen)'
 'AMD Athlon Dual Core Processor' 'Apple M1 Processor'
 'Intel Celeron Dual Core Processor' 'AMD Ryzen 3 Dual Core Processor'
 'Intel Core i5 Processor (12th Gen)' 'Intel Core i7 Processor (11th Gen)'
 'AMD Ryzen 5 Hexa Core Processor' 'Intel Core i3 Processor (12th Gen)'
 'AMD Ryzen 3 Quad Core Processor' 'AMD Ryzen 7 Octa Core Processor'
 'Qualcomm Snapdragon 7c Gen 2 Processor'
 'Intel Core i7 Processor (12th Gen)' 'Intel Pentium Silver Processor'
 'AMD Ryzen 5 Quad Core Processor' 'Intel Core i9 Processor (12th Gen)'
 'AMD Dual Core Processor' 'Apple M2 Processor'
 'AMD Ryzen 9 Octa Core Processor' 'Apple M1 Max Processor'
 'Apple M1 Pro Processor' 'Intel Pentium Quad Core Processor'
 'Intel Core i7 Processor (10th Gen)'
 'AMD Ryzen 9 Octa Core Processor (5th Gen)'
 'Intel Core i9 Processor (10th Gen)' 'Intel Celero

In [108]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[4:5]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: ram
['16 GB DDR4 RAM' '8 GB DDR4 RAM' '4 GB DDR4 RAM' '16 GB LPDDR5 RAM'
 '16 GB DDR5 RAM' '4 GB LPDDR4X RAM' '8 GB LPDDR4X RAM' '32 GB LPDDR5 RAM'
 '4 GB LPDDR4 RAM' '16 GB LPDDR4X RAM' '8 GB DDR5 RAM'
 '8 GB Unified Memory RAM' '32 GB Unified Memory RAM'
 '16 GB Unified Memory RAM' '32 GB DDR5 RAM' '32 GB DDR4 RAM'
 '8 GB DDR3 RAM' '8 GB LPDDR3 RAM' '16 GB LPDDR3 RAM' '16 GB DDR3 RAM']



In [109]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[5:6]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: os
['Windows 11 Operating System' '64 bit Windows 11 Operating System'
 'DOS Operating System' 'Mac OS Operating System'
 '64 bit Windows 10 Operating System' '32 bit Windows 11 Operating System'
 'Chrome Operating System' '64 bit Chrome Operating System'
 'Windows 10 Operating System' '64 bit DOS Operating System'
 '64 bit Windows 8 Operating System']



In [110]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[6:7]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: storage
['512 GB SSD' '1 TB HDD|256 GB SSD' '256 GB SSD' '1 TB SSD' '2 TB SSD'
 '1 TB HDD|512 GB SSD' '1 TB HDD' '128 GB SSD' '256 GB HDD|256 GB SSD'
 '1 TB HDD|128 GB SSD'
 'PCI-e SSD (NVMe) ready,Silver-Lining Print Keyboard,Matrix Display (Extend),Cooler Boost 5,Hi-Res Audio,Nahimic 3,144Hz Panel,Thin Bezel,RGB Gaming Keyboard,Speaker Tuning Engine,MSI Center'
 'PCI-e Gen4 SSD?SHIFT?Matrix Display (Extend)?Cooler Boost 3?Thunderbolt 4?Finger Print Security?True Color 2.0?Hi-Res Audio?Nahimic 3? 4-Sided Thin bezel?MSI Center?Silky Smooth Touchpad?Military-Grade Durability'
 '2 TB HDD' '512 GB HDD|512 GB SSD' '256 GB HDD']



In [111]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[7:8]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: display(in inch)
[15.6  14.   13.3  17.3  11.6  16.   16.1  13.6  13.4  16.2  14.2  14.1
 14.96 35.   16.6  15.  ]



In [112]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[8:9]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: rating
[4.5 4.4 4.3 3.8 4.7 4.1 4.  4.2 5.  3.7 3.6 4.9 4.8 4.6 3.1 3.9 3.  3.5
 3.3 1.6]



In [113]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[9:10]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: no_of_ratings
[1.4000e+01 5.3000e+01 4.7330e+03 1.0406e+04 1.8000e+01 8.8650e+03
 7.9000e+02 1.1200e+02 3.9550e+03 3.8100e+02 4.0000e+00 2.5700e+02
 1.3700e+02 1.4300e+02 1.2584e+04 1.7790e+03 3.2800e+02 2.2390e+03
 9.0000e+00 9.3000e+01 6.8000e+01 1.9110e+03 8.7000e+02 1.9000e+01
 7.5400e+02 4.5300e+02 1.9790e+03 1.2330e+03 3.8800e+02 1.2700e+02
 2.0110e+03 4.6580e+03 1.0900e+02 1.2800e+02 3.2500e+02 2.8000e+02
 5.9000e+01 3.0100e+02 2.4700e+02 2.3300e+02 7.4100e+02 4.3100e+02
 1.6030e+03 2.8000e+01 3.8900e+02 2.0600e+02 6.0000e+00 4.1300e+02
 1.0870e+03 7.0000e+00 2.5000e+01 5.5330e+03 6.5200e+02 3.2700e+02
 1.6710e+03 1.8530e+03 5.0000e+00 4.3790e+03 5.7000e+01 6.9400e+02
 7.7800e+02 2.3000e+01 5.6400e+02 1.4200e+02 9.7000e+01 1.4600e+02
 9.4700e+02 6.2400e+02 3.0000e+00 7.1500e+02 6.2000e+01 3.2000e+01
 4.5000e+01 1.2660e+03 1.7400e+02 3.5720e+03 1.3000e+01 3.7400e+02
 7.1900e+02 4.1000e+01 1.8700e+02 1.1000e+01 3.0650e+03 1.3600e+02
 7.2000e+03 1.3460e+03 7.1300e+02 4.400

In [114]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[10:11]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: no_of_reviews
[1.000e+00 3.000e+00 4.630e+02 1.040e+03 7.950e+02 9.500e+01 8.000e+00
 4.620e+02 7.600e+01 0.000e+00 2.600e+01 1.800e+01 1.100e+01 1.870e+03
 1.600e+02 3.400e+01 2.990e+02 1.700e+01 7.000e+00 2.080e+02 6.600e+01
 7.900e+01 3.800e+01 2.020e+02 1.490e+02 4.000e+01 1.600e+01 1.050e+02
 1.670e+02 4.490e+02 2.100e+01 5.200e+01 2.200e+01 3.200e+01 8.400e+01
 3.900e+01 1.840e+02 4.400e+01 2.000e+01 2.000e+00 6.300e+01 1.310e+02
 6.740e+02 8.500e+01 5.100e+01 2.260e+02 2.870e+02 5.800e+02 1.000e+01
 6.700e+01 7.800e+01 5.000e+00 5.500e+01 1.400e+01 1.000e+02 9.000e+01
 1.120e+02 1.720e+02 3.270e+02 4.600e+01 8.300e+01 3.540e+02 9.670e+02
 1.240e+02 1.030e+02 7.100e+01 9.000e+00 3.190e+02 6.000e+00 2.280e+02
 1.300e+01 4.700e+01 8.000e+01 1.580e+02 1.200e+01 4.000e+00 4.800e+01
 5.600e+01 4.100e+02 4.300e+01 2.900e+01 8.700e+01 3.100e+01 2.054e+03
 2.480e+02 1.900e+01 2.500e+01 2.300e+01 1.260e+02 1.510e+02 2.700e+01
 4.100e+01 9.300e+01 6.900e+01 9.200e+01 1.190e+02 3.6

In [115]:
# Ottieni i nomi delle colonne
columns_to_check = df.columns[11:12]

# Crea un dizionario per memorizzare le istanze univoche per ogni colonna
unique_instances = {}

# Itera attraverso le colonne e ottieni le istanze univoche
for col in columns_to_check:
    unique_instances[col] = df[col].unique()

# Stampa le istanze univoche per ogni colonna
for col, unique_vals in unique_instances.items():
    print(f"Colonna: {col}")
    print(unique_vals)
    print()

Colonna: price(in EUR)
[ 692.89   412.5    549.89   373.89   208.89   956.89   263.89   373.989
  428.89   296.89   736.89   417.89   626.89   439.89   516.89   540.353
  527.89   951.5    494.89   582.89   372.79   423.5    504.999  450.45
 1099.89   659.89   638.198  425.689  362.89   593.89   714.89  1209.89
  749.254  616.539  285.89   517.429  747.89   450.89  1759.89   824.89
  681.89   637.89   571.89   197.89   175.89   472.89   604.89   340.89
  560.89   879.89   251.9    371.261  505.89   687.489  846.989  973.489
  725.89   945.89  1539.89  1248.39   989.89   461.89   703.89   978.89
  846.89  1154.89   868.89   758.89  1055.89   675.4    467.39   487.19
 1319.89  1429.89  1781.89   813.89  3387.89   307.89   682.55   453.09
  615.89   352.     977.9    802.89   381.59  2352.834  395.89   860.09
  478.39   406.89  1341.89   967.89   471.9    973.39  1550.89   384.89
  418.319  351.89   548.79   663.19   857.89  1039.5   1704.89  1264.89
  890.89   434.39  3101.89   347.171  