This cell will load and define the initial data.

In [22]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.neighbors import NearestNeighbors
from difflib import get_close_matches

In [23]:
# Step 1: Prepare the DataFrame with categorical CPU and GPU values
data_duy = pd.read_csv('laptop_duy_updated.csv',usecols=['Laptop_name','Price(VND)','GPU','CPU','RAM(GB)','Storage(GB)','Screen_size(inches)'])
data_huyen = pd.read_csv('clean_laptop.csv',usecols=['Laptop_name','Price(VND)','GPU','CPU','RAM(GB)','Storage(GB)','Screen_size(inches)'])

# Concatenate DataFrames vertically
data = pd.concat([data_duy, data_huyen])
# Save the DataFrame to a CSV file
data.to_csv('alldata.csv', index=False)

In [24]:
data.head(10)

Unnamed: 0,Laptop_name,Price(VND),GPU,CPU,RAM(GB),Storage(GB),Screen_size(inches)
0,Laptop HP Gaming Victus 16-E0170AX 4R0U7PA,22990000.0,NVIDIA GeForce RTX 3050,AMD Ryzen 7 5800H,8.0,512.0,16.1
1,Laptop Lenovo Yoga 7 2-in-1 14IML9 83DJ001FVN,28990000.0,INTEL Arc Graphics,Intel Core Ultra 7 155H,16.0,512.0,14.0
2,Laptop Lenovo Yoga Pro 7 14ASP9 83HN0022VN,40990000.0,AMD Radeon 880M Graphics,AMD Ryzen AI 9 365,32.0,1024.0,14.5
3,Laptop Lenovo Yoga Pro 7 14IMH9 83E2005DVN,43490000.0,NVIDIA GeForce RTX 4050,Intel Core Ultra 7 155H,32.0,1024.0,14.5
4,Laptop Lenovo LOQ 15IRX9 83DV00UGVN,30490000.0,NVIDIA GeForce RTX 4050,Intel Core i7 13650HX,24.0,512.0,15.6
5,Laptop Lenovo Ideapad 5 14ABA7 82SE007EVN,17490000.0,AMD Radeon Graphics,AMD Ryzen 7 5825U,16.0,512.0,14.0
6,Laptop ASUS Gaming ROG Strix SCAR 16 G634JZR-...,105990000.0,NVIDIA GeForce RTX 4080,Intel Raptor Lake i9 14900HX,64.0,2048.0,16.0
7,Laptop ASUS ExpertBook B1 B1402CVA-NK0104W,10690000.0,INTEL UHD Graphics,Intel Core i3 1315U,8.0,256.0,14.0
8,Laptop ASUS ExpertBook B1 B1502CVA-NJ0149W,10990000.0,INTEL UHD Graphics,Intel Core i3 1315U,8.0,512.0,15.6
9,Laptop Asus Zenbook 14 UX3402ZA-KM219W,23690000.0,INTEL Iris Xe Graphics,Intel Core i5 1240P,16.0,512.0,14.0


In [25]:
# Convert all values to lowercase
data = data.apply(lambda col: col.map(lambda x: x.lower() if isinstance(x, str) else x))
# Set 'laptop_name' as the index column
data.set_index('Laptop_name', inplace=True)

Label Encoding

In [26]:
# Initialize label encoders
cpu_encoder = LabelEncoder()
gpu_encoder = LabelEncoder()

# Encode CPU and GPU columns
data['CPU'] = cpu_encoder.fit_transform(data['CPU'])
data['GPU'] = gpu_encoder.fit_transform(data['GPU'])

Data Preprocessing

In [27]:
# Step 1: Preprocess the data

features = ['CPU', 'GPU', 'RAM(GB)', 'Storage(GB)', 'Screen_size(inches)','Price(VND)']

# Drop rows with missing values in relevant features
data = data.dropna(subset=features)

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[features])


Model Initialization

In [28]:
k = 7  # Adjustable 'k' value
knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
knn.fit(data_scaled)


User Input

In [29]:
def get_user_preferences():
    print("Enter your preferences for each feature:")


    # Collect CPU model name and handle unseen values
    cpu_name = input("CPU (e.g., amd ryzen 7 6800h): ")
    cpu = cpu_encoder.transform([cpu_name])[0]

    if cpu_name not in cpu_encoder.classes_:
        # Suggest the closest match
        closest_cpu = get_close_matches(cpu_name, cpu_encoder.classes_, n=1)
        if closest_cpu:
            print(f"'{cpu_name}' not found. Did you mean '{closest_cpu[0]}'? Using '{closest_cpu[0]}' as fallback.")
            cpu_name = closest_cpu[0]
        else:
            print(f"Error: '{cpu_name}' not found and no close matches are available.")
            return None, None, None  # Stop and prompt the user to try again
    
    # Collect GPU model name and handle unseen values
    gpu_name = input("Graphic Card (e.g., nvidia geforce rtx 3050): ")
    gpu = gpu_encoder.transform([gpu_name])[0]

    if gpu_name not in gpu_encoder.classes_:
        # Suggest the closest match
        closest_gpu = get_close_matches(gpu_name, gpu_encoder.classes_, n=1)
        if closest_gpu:
            print(f"'{gpu_name}' not found. Did you mean '{closest_gpu[0]}'? Using '{closest_gpu[0]}' as fallback.")
            gpu_name = closest_gpu[0]
        else:
            print(f"Error: '{gpu_name}' not found and no close matches are available.")
            return None, None, None  # Stop and prompt the user to try again


    
    # Collecting other preferences as before
    ram = int(input("RAM (e.g., 16 for 16 GB): "))
    storage = int(input("Storage (e.g., 512 for 512 GB): "))
    screen_size = float(input("Screen Size (e.g., 13.4 for 13.4 inch): "))
    min_price = float(input("Minimum Price (e.g., 6000000 for 6 mill(VND)): "))
    max_price = float(input("Maximum Price (e.g., 100000000 for 100 mill(VND)): "))
    
    return [[cpu, gpu, ram, storage, screen_size,(min_price+max_price)/2]], min_price, max_price


Recommendation Logic

In [30]:
def recommend_laptops(user_input):
    # Transform user input with the same scaler
    user_input_scaled = scaler.transform(user_input)
    # Find nearest neighbors
    distances, indices = knn.kneighbors(user_input_scaled)
    # Retrieve recommendations
    recommended_laptops = data.iloc[indices[0]].copy()
    recommended_laptops['Distance'] = distances[0]
    # Filter based on price range
    recommended_laptops = recommended_laptops[
        (recommended_laptops['Price'] >= min_price) & (recommended_laptops['Price'] <= max_price)
    ]
    return recommended_laptops


Execution

In [33]:
# Retrieve user preferences
user_input, min_price, max_price = get_user_preferences()
if user_input is None:
    print("User input was invalid.")
else:
    # Filter the dataset based on price range
    data_filtered = data[(data['Price(VND)'] >= min_price) & (data['Price(VND)'] <= max_price)]
    if data_filtered.empty:
        print("No laptops found within the specified price range.")
    else:
        # Scale the filtered data
        data_filtered_scaled = scaler.transform(data_filtered[features])

        # Transform and scale the user input
        user_input_scaled = scaler.transform(user_input)

        # Find nearest neighbors
        distances, indices = knn.kneighbors(user_input_scaled)

        # Map the recommended indices back to the original DataFrame
        recommended_indices = indices[0]
        recommended_laptops = data.iloc[recommended_indices]
        recommended_laptops['Distance'] = distances[0]


        # Filter the recommendations based on the price range again
        recommended_laptops = recommended_laptops[(recommended_laptops['Price(VND)'] >= min_price) & (recommended_laptops['Price(VND)'] <= max_price)]
        # Decode the CPU and GPU labels to display the original names
        recommended_laptops['CPU'] = cpu_encoder.inverse_transform(recommended_laptops['CPU'])
        recommended_laptops['GPU'] = gpu_encoder.inverse_transform(recommended_laptops['GPU'])
        
        print("Recommended Laptops with Full Specifications:\n", recommended_laptops[['CPU', 'GPU', 'RAM(GB)', 'Storage(GB)', 'Screen_size(inches)', 'Price(VND)','Distance']])
        

Enter your preferences for each feature:


Recommended Laptops with Full Specifications:
                                                                       CPU  \
Laptop_name                                                                 
laptop gaming hp omen 16-n0085ax                       amd ryzen 9 6900hx   
laptop ai hp omen 16-xf0071ax - 8w946pa                amd ryzen 7 7840hs   
laptop gaming hp omen 16-xf0070ax - 8w945pa            amd ryzen 9 7940hs   
 macbook pro 16 inch m1 max 10 cpu - 32 gpu 32g...           apple m1 max   
laptop ai hp victus 16-s0138ax - 9q985pa               amd ryzen 7 7840hs   
laptop gaming hp victus 16-s0142ax - 9q989pa           amd ryzen 5 7640hs   
laptop asus rog zephyrus g16 ga605wi 2024           amd ryzen ai 9 hx 370   

                                                                          GPU  \
Laptop_name                                                                     
laptop gaming hp omen 16-n0085ax                    nvidia geforce rtx 3070ti   
laptop ai hp ome

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_laptops['Distance'] = distances[0]


METRICS

Precision at K

In [38]:
# Precision at K (here, K is the number of neighbors specified in NearestNeighbors)
precision_at_k = (recommended_laptops['Price(VND)'] >= min_price) & (recommended_laptops['Price(VND)'] <= max_price)
precision_at_k = precision_at_k.sum() / len(recommended_laptops)
print(f"Precision at K: {precision_at_k:.2f}")

Precision at K: 1.00


Intra-cluster and Inter-cluster Distances

In [35]:
# Calculate the mean intra-cluster distance (average distance within the recommended laptops)
mean_intra_distance = distances[0].mean()
print("Mean Intra-cluster Distance:", mean_intra_distance)

# Calculate inter-cluster distance as average distance from the user input to non-recommended laptops
non_recommended_indices = [i for i in range(len(data)) if i not in recommended_indices]
non_recommended_distances = knn.kneighbors(user_input_scaled, n_neighbors=len(non_recommended_indices))[0]
mean_inter_distance = non_recommended_distances.mean()
print("Mean Inter-cluster Distance:", mean_inter_distance)


Mean Intra-cluster Distance: 1.0431447642187437
Mean Inter-cluster Distance: 3.2394509367968154


Mean Squared Error (MSE) for Distances between User input and the system output

In [39]:
from sklearn.metrics import root_mean_squared_error

# Calculate the MSE for distances
mse_distance = root_mean_squared_error([0] * len(distances[0]), distances[0])
print("RMSE for Distances:", mse_distance)


RMSE for Distances: 1.1009940938051528
