In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_colwidth', None)

> Preprocessing

# Key Points

1. Tags include:
  - brand
  - processor_brand           
  - processor_tier            
  - num_cores                   
  - ram_memory                  
  - primary_storage_type      
  - primary_storage_capacity    
  - gpu_brand                 
  - gpu_type                  
  - display_size   
2. Price will be used as a filter for the initial preprocessing and will be excluded from the tags. Will add in later stages.        

In [3]:
laptops = pd.read_csv("data.csv")

In [4]:
laptops.rename(columns={"Model": "model", "Price": "price", "Rating": "rating"}, inplace=True)


In [5]:
laptops.drop(columns=["secondary_storage_type"], inplace=True)


In [6]:
laptops.sort_values(by=["rating", "price"], ascending=[False, True], inplace=True)

In [7]:
laptops = laptops.astype(str)

In [8]:
laptops["tags"] = laptops[["brand", "processor_brand", "processor_tier", "num_cores", "ram_memory", "primary_storage_type",
                           "primary_storage_capacity", "gpu_brand", "gpu_type", "display_size"]].apply(lambda x: ' '.join(x), axis = 1)

In [9]:
laptops["id"] = laptops.index

In [10]:
new = laptops[["id", "model", "tags"]]
new.index = range(len(new))

> Vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [12]:
vector = cv.fit_transform(new["tags"]).toarray()

In [13]:
vector.shape

(991, 65)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
similarity = cosine_similarity(vector)

In [16]:
similarity

array([[1.        , 0.46225016, 0.83624201, ..., 0.        , 0.        ,
        0.        ],
       [0.46225016, 1.        , 0.70352647, ..., 0.        , 0.        ,
        0.        ],
       [0.83624201, 0.70352647, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.83333333],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.83333333],
       [0.        , 0.        , 0.        , ..., 0.83333333, 0.83333333,
        1.        ]])

In [27]:
def recommend(tags):
  new_df = new.copy()
  for tag in tags:
    filt = new_df["tags"].str.contains(tag)
    new_df = new_df[filt]
  index = new_df[new_df["model"] == new_df["model"].values[0]].index[0]
  distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
  for i in distances[1:6]:
    print(new.iloc[i[0]].model)

recommend(["core i9"])

Asus Vivobook Pro 16 OLED 2023 K6602VU-LZ952WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11/ 6GB Graph)
Asus Vivobook 16X 2023 K3605VC-MB951WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11 Home/ 4GB Graph)
Lenovo IdeaPad Pro 5 83AQ005SIN Gaming Laptop (13th Gen Core i7/ 16GB/ 1TB SSD/ Win11/ 6GB Graph)
Lenovo Legion Pro 5 16IRX8 82WK00MWIN Gaming Laptop (13th Gen Core i7/ 16GB/ 1TB SSD/ Win11/ 8GB Graph)
Asus ROG Zephyrus G16 2023 GU603ZU-N3023WS Laptop (12th Gen Core i7/ 16GB/ 1TB SSD/ Win11 Home/ 6GB Graph)


> Testing

In [18]:
# tags = []

In [19]:
tagNames = [
    "brand",
    "processor_brand",
    "processor_tier",
    "num_cores",
    "ram_memory",
    "primary_storage_type",
    "primary_storage_capacity",
    "gpu_brand",
    "gpu_type",
    "display_size"
]

# for i in range(10):
#   tags.append(input("Enter " + tagNames[i] + ": ").strip())
# tags = list(filter(lambda x: bool(len(x)), list(pd.unique(tags))))
# recommend(tags)

> Generating Unique Tags for Front-end Search

In [20]:
# for tag in tagNames:
#   with open(tag + ".txt", "w") as f:
#     f.write("\n".join(laptops[tag].unique().tolist()))