In [None]:
import pandas as pd
import numpy as np

In [None]:
# Disable column-width truncation so long cell values (e.g. full model names and tag strings) are shown in full.
pd.options.display.max_colwidth = None

> Preprocessing

# Key Points

1. Tags include:
  - brand
  - processor_brand           
  - processor_tier            
  - num_cores                   
  - ram_memory                  
  - primary_storage_type      
  - primary_storage_capacity    
  - gpu_brand                 
  - gpu_type                  
  - display_size   
2. Price will be used as a filter during the initial preprocessing and is excluded from the tags; it will be added in a later stage.

In [None]:
# Load the laptop dataset; the path is relative to the notebook's working directory.
laptops = pd.read_csv("data-cleaned.csv")

In [None]:
# Lowercase the three capitalised headers so they match the lowercase column names used in the cells below.
laptops = laptops.rename(
    columns={"Model": "model", "Price": "price", "Rating": "rating"}
)

In [None]:
# secondary_storage_type is not one of the tag columns, so it is removed up front.
laptops = laptops.drop(columns=["secondary_storage_type"])

In [None]:
# Order rows by rating, best first. This runs before the cast to str below,
# so the ordering uses the column's dtype as loaded from the CSV.
laptops = laptops.sort_values(by="rating", ascending=False)

In [None]:
# Cast every column to str so the spec columns can take unit suffixes and be joined into one tag string.
# NOTE(review): astype(str) also stringifies missing values (NaN becomes "nan") - confirm the cleaned data has none.
laptops = laptops.astype(str)

In [None]:
# Append the unit so the core count is a distinct token (e.g. "8" -> "8cores").
# The suffix is added unconditionally, so re-running this cell appends it again.
laptops["num_cores"] = laptops["num_cores"] + "cores"

In [None]:
# Append the unit so RAM size cannot be confused with storage size (e.g. "16" -> "16GB-RAM").
# The suffix is added unconditionally, so re-running this cell appends it again.
laptops["ram_memory"] = laptops["ram_memory"] + "GB-RAM"

In [None]:
# Append the unit to the storage capacity (e.g. "512" -> "512GB").
# The suffix is added unconditionally, so re-running this cell appends it again.
laptops["primary_storage_capacity"] = laptops["primary_storage_capacity"] + "GB"

In [None]:
# Append two apostrophes as an inch mark to the display size.
# The suffix is added unconditionally, so re-running this cell appends it again.
laptops["display_size"] = laptops["display_size"] + "''"

In [None]:
# Build one space-separated tag string per laptop from the ten descriptive columns.
# Price, rating and model are not part of the tags.
laptops["tags"] = laptops[
    [
        "brand",
        "processor_brand",
        "processor_tier",
        "num_cores",
        "ram_memory",
        "primary_storage_type",
        "primary_storage_capacity",
        "gpu_brand",
        "gpu_type",
        "display_size",
    ]
].apply(" ".join, axis=1)

In [None]:
# Keep each row's current index label as an `id` column. The frame was sorted by rating without
# resetting the index, so these labels are not in row order.
# NOTE(review): presumably the labels are the original CSV row numbers - confirm.
laptops["id"] = laptops.index

In [None]:
# Slim frame used for vectorisation and lookup. The index is renumbered 0..n-1 so that
# row labels coincide with row positions in the similarity matrix.
new = laptops[["id", "model", "tags"]].reset_index(drop=True)

> Vectorization (Bag of Words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser over the tag strings. max_features=5000 is only an upper bound;
# the recorded shape below shows 81 features in practice.
# NOTE(review): with scikit-learn's default token_pattern, punctuation splits tokens and
# single-character tokens are dropped, so a display size such as "15.6''" presumably
# contributes only "15" - confirm this is acceptable.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Learn the vocabulary from the tag strings and materialise the sparse count matrix as a dense array
# (one row per laptop, one column per vocabulary term).
vector = cv.fit_transform(new["tags"]).toarray()

In [None]:
# (number of laptops, vocabulary size)
vector.shape

(991, 81)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between every pair of laptops' count vectors.
# NOTE: the TF-IDF section further down assigns to `similarity` again, so after a full
# top-to-bottom run recommend() reads the TF-IDF matrix, not this one.
similarity = cosine_similarity(vector)

In [None]:
# Inspect the matrix: symmetric, with 1.0 on the diagonal (each laptop compared with itself).
similarity

array([[1.        , 0.52223297, 0.52223297, ..., 0.09622504, 0.09622504,
        0.09622504],
       [0.52223297, 1.        , 0.63636364, ..., 0.10050378, 0.10050378,
        0.10050378],
       [0.52223297, 0.63636364, 1.        , ..., 0.20100756, 0.20100756,
        0.20100756],
       ...,
       [0.09622504, 0.10050378, 0.20100756, ..., 1.        , 1.        ,
        0.88888889],
       [0.09622504, 0.10050378, 0.20100756, ..., 1.        , 1.        ,
        0.88888889],
       [0.09622504, 0.10050378, 0.20100756, ..., 0.88888889, 0.88888889,
        1.        ]])

> Vectorization (TF-IDF)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser over the same tag strings; terms shared by many laptops get lower weight
# than rare ones. max_features=5000 is only an upper bound (81 features per the recorded shape below).
# NOTE(review): the default token_pattern drops punctuation and single-character tokens,
# so a display size such as "15.6''" presumably contributes only "15" - confirm.
tfidf = TfidfVectorizer(max_features=5000,stop_words='english')

In [113]:
# Fit on the tag strings and keep the result as returned by fit_transform (no .toarray() here,
# unlike the bag-of-words cell above).
matrix = tfidf.fit_transform(new["tags"])

In [120]:
# (number of laptops, vocabulary size)
matrix.shape

(991, 81)

In [115]:
from sklearn.metrics.pairwise import linear_kernel
# from sklearn.metrics.pairwise import sigmoid_kernel

In [116]:
# Pairwise dot products of the TF-IDF rows. TfidfVectorizer L2-normalises rows by default,
# so this equals cosine similarity (the recorded output shows 1.0 on the diagonal).
# This assignment replaces the bag-of-words `similarity` computed earlier; recommend() reads this one.
similarity = linear_kernel(matrix, matrix)
# similarity = sigmoid_kernel(matrix, matrix)

In [117]:
# Inspect the TF-IDF similarity matrix (symmetric, 1.0 on the diagonal).
similarity

array([[1.        , 0.36417391, 0.43270721, ..., 0.00701301, 0.00701301,
        0.00691731],
       [0.36417391, 1.        , 0.48086605, ..., 0.00694693, 0.00694693,
        0.00685213],
       [0.43270721, 0.48086605, 1.        , ..., 0.07159633, 0.07159633,
        0.07061928],
       ...,
       [0.00701301, 0.00694693, 0.07159633, ..., 1.        , 1.        ,
        0.76253782],
       [0.00701301, 0.00694693, 0.07159633, ..., 1.        , 1.        ,
        0.76253782],
       [0.00691731, 0.00685213, 0.07061928, ..., 0.76253782, 0.76253782,
        1.        ]])

> Recommendation

In [118]:
def recommend(tags, catalog=None, scores=None, top_n=5):
  """Print the models most similar to the first laptop that carries every tag.

  The tags only select a *seed* laptop: the first row of the catalog whose tag
  string contains every requested tag. The printed models are the seed's
  highest-scoring neighbours, so they may not carry the requested tags themselves.

  Parameters
  ----------
  tags : iterable of str
      Tag values to require (e.g. "lenovo", "8cores"). Each one must appear in
      the tag string as a complete space-delimited value, so "4cores" does not
      match "14cores". Tags are matched literally (no regex) and case-sensitively.
      Blank tags are ignored; with no tags the first catalog row is the seed.
  catalog : pandas.DataFrame, optional
      Frame with "model" and "tags" columns, row-aligned with `scores`.
      Defaults to the notebook-level `new` frame.
  scores : 2-D array-like, optional
      Pairwise similarity matrix; `scores[i][j]` scores row i against row j.
      Defaults to the notebook-level `similarity` matrix.
  top_n : int, optional
      Number of models to print (default 5).

  Returns
  -------
  None
      Results are printed, one model per line. If no laptop carries every tag
      a short message is printed instead of raising.
  """
  catalog = new if catalog is None else catalog
  scores = similarity if scores is None else scores

  # Pad with spaces so a tag only matches a whole space-delimited value, never a
  # fragment of a longer one.
  padded = " " + catalog["tags"].astype(str) + " "
  mask = np.ones(len(catalog), dtype=bool)
  for tag in tags:
    needle = str(tag).strip()
    if not needle:
      continue
    hits = padded.str.contains(" " + needle + " ", regex=False)
    mask &= hits.to_numpy(dtype=bool)

  positions = np.flatnonzero(mask)
  if positions.size == 0:
    print("No laptops match the given tags.")
    return

  # Work with row positions throughout so the lookup does not depend on index labels.
  seed = int(positions[0])
  ranked = sorted(enumerate(scores[seed]), key=lambda pair: pair[1], reverse=True)
  # Exclude the seed explicitly: with tied scores it is not guaranteed to sort first.
  picks = [pos for pos, _ in ranked if pos != seed][:top_n]
  for pos in picks:
    print(catalog.iloc[pos]["model"])

# Seed on the first row of `new` (which is ordered by the earlier rating sort) whose tags contain "lenovo".
# The tag only selects the seed; the printed models are the seed's nearest neighbours and
# can belong to other brands, as the recorded output below shows.
recommend(["lenovo"])

Asus Vivobook Pro 16 OLED 2023 K6602VU-LZ952WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11/ 6GB Graph)
Asus Vivobook 16X 2023 K3605VC-MB951WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11 Home/ 4GB Graph)
Asus Vivobook Pro 15 OLED K6502VU-MA951WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11 Home/ 6GB Graph)
Asus TUF Gaming F15 2023 FX507VV-LP071WS Gaming Laptop (13th Gen Core i9/ 16GB/1TB SSD/ Win11 Home/ 8GB Graph)
Asus Vivobook 16 2023 X1605VA-MB957WS Laptop (13th Gen Core i9/ 16GB/ 1TB SSD/ Win11 Home)


> Testing

In [None]:
# tags = []

In [None]:
# Names of the ten columns that make up a laptop's tag string, in the same order
# used when the tags were assembled.
tagNames = [
    # manufacturer
    "brand",
    # processor
    "processor_brand", "processor_tier", "num_cores",
    # memory and storage
    "ram_memory", "primary_storage_type", "primary_storage_capacity",
    # graphics and screen
    "gpu_brand", "gpu_type", "display_size",
]

In [None]:
# for i in range(10):
#   tags.append(input("Enter " + tagNames[i] + ": ").strip())

In [None]:
# tags = list(filter(lambda x: bool(len(x)), list(pd.unique(tags))))

In [None]:
# recommend(tags)

> Generating Unique Tags for Front-end Search

In [None]:
# for tag in tagNames:
#   with open(tag + ".txt", "w") as f:
#     f.write("\n".join(laptops[tag].unique().tolist()))