In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw laptop listings; the path is relative to the notebook's working directory.
laptop = pd.read_csv('laptops.csv')

In [None]:
# Preview the first rows to sanity-check the load.
laptop.head()


In [None]:
# Summarise the columns, their dtypes and non-null counts.
laptop.info()

In [None]:
# Essential data
# brand 
# processor brand
# processor name
# processor generation 
# ram_gb
# ssd
# hdd 
# os
# weight
# display_size
# latest_price
# star_rating

In [3]:
# Keep only the columns used to describe and compare laptops.
laptop = laptop[[
    'model',
    'brand',
    'processor_brand',
    'processor_gnrtn',
    'processor_name',
    'ram_gb',
    'ssd',
    'hdd',
    'os',
    'weight',
    'display_size',
    'latest_price',
    'star_rating',
]]

In [None]:
# Count true missing values per column (placeholder strings such as 'Missing' are not counted).
laptop.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row.
laptop.duplicated().sum()

In [4]:
# Drop every row whose processor generation is the placeholder string 'Missing'.
laptop = laptop.drop(index=laptop.index[laptop['processor_gnrtn'].eq('Missing')])

In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each.
laptop = laptop.drop_duplicates(keep='first')

In [6]:
# Drop every row whose display size is the placeholder string 'Missing'.
laptop = laptop.drop(index=laptop.index[laptop['display_size'].eq('Missing')])

In [None]:
# Re-count duplicate rows after the clean-up steps above.
laptop.duplicated().sum()

In [None]:
# Preview the frame after removing duplicates and 'Missing' generation / display-size rows.
laptop.head()

In [8]:
# Drop every row whose model name is the placeholder string 'Missing'.
laptop = laptop.drop(index=laptop.index[laptop['model'].eq('Missing')])

In [None]:
# Render the frame as an interactive, paginated table. This relies on the
# `google.colab` package, so it only works when the notebook runs in Colab.
from google.colab import data_table
# NOTE(review): `data` is not referenced in any visible cell — confirm whether this import is needed.
from vega_datasets import data
# NOTE(review): 397 rows per page looks chosen to fit the whole frame on one page — confirm.
data_table.DataTable(laptop, include_index=False, num_rows_per_page=397)

In [None]:
# Preview the frame after removing rows whose model is 'Missing'.
laptop.head()

In [9]:
# Collapse the separate `ssd` and `hdd` columns into two tag-friendly columns:
#   storage_type  -> 'SSD' or 'HDD'
#   storage_space -> combined capacity of both drives
# Each value is a string whose first space-separated token is the capacity number.
# It is parsed once per column here instead of cell by cell in three Python loops.
ssd_size = laptop['ssd'].str.split(" ").str[0].astype(int)
hdd_size = laptop['hdd'].str.split(" ").str[0].astype(int)

# HDD is applied last, so a laptop that has both drive kinds is labelled 'HDD'.
# Rows with neither drive are left with a missing storage_type.
# NOTE(review): confirm that labelling dual-drive laptops as 'HDD' is intended.
laptop.loc[ssd_size > 0, 'storage_type'] = 'SSD'
laptop.loc[hdd_size > 0, 'storage_type'] = 'HDD'

# Whole-number capacity. Creating this column one row at a time would make it float,
# which later shows up in the tags as text like "512.0".
laptop['storage_space'] = ssd_size + hdd_size

In [10]:
# Drop the last space-separated token of each RAM string and glue the remaining tokens together.
laptop['ram_gb'] = laptop['ram_gb'].str.split(" ").str[:-1].str.join("")

In [11]:
# Append the "GB" suffix to every RAM value.
# Note: running this cell again appends the suffix a second time.
laptop['ram_gb'] = laptop['ram_gb'].add("GB")

In [12]:
# The raw drive columns are no longer needed once storage_type / storage_space exist.
laptop = laptop.drop(columns=['ssd', 'hdd'])

In [None]:
# Preview the frame with the new storage columns and the reformatted RAM values.
laptop.head()

In [13]:
# Build one tag list per laptop: join its descriptive columns into a single
# space-separated string, then split that string back into individual words.
# These tags are what the recommender later compares between laptops.
#
# storage_space is rendered as a whole number. Formatting a float value directly
# produces text such as "SSD512.0GB", where the ".0" is an artefact that word
# tokenizers split into extra, meaningless tokens.
storage_label = (
    laptop['storage_type']
    + laptop['storage_space'].astype(int).astype(str)
    + 'GB'
)

# A missing value in any source column makes that row's tags missing as well.
laptop['tags'] = (
    laptop['model'] + " "
    + laptop['brand'] + " "
    + laptop['processor_brand'] + " "
    + laptop['processor_name'] + " "
    + laptop['processor_gnrtn'] + " "
    + laptop['ram_gb'] + " "
    + storage_label + " "
    + laptop['os'] + " "
    + laptop['display_size'] + 'inch '
    + laptop['weight']
).str.split(" ")


In [14]:
# Work on an independent copy. A plain assignment would only alias `laptop`, so every
# later in-place change to `new_df` would also rewrite `laptop`; re-running the cell
# that joins `laptop['tags']` would then operate on already-joined strings.
new_df = laptop.copy()

In [15]:
# Turn each list of tag words back into one space-separated string.
new_df['tags'] = laptop['tags'].apply(' '.join)

In [16]:
# Lower-case the tags so that matching does not depend on capitalisation.
new_df['tags'] = new_df['tags'].apply(str.lower)

In [None]:
# Preview the frame with the joined, lower-cased tags.
new_df.head()

In [17]:
from sklearn.feature_extraction.text import CountVectorizer 
# Create the vectorizer object that is later fitted on the `tags` column.
# max_features=400 limits the vocabulary to at most 400 terms.
cv = CountVectorizer(max_features=400)

In [18]:
# Create the Porter stemmer used to stem each word of the tags.
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [19]:
def stem(text):
  """Stem every whitespace-separated word of `text` with `ps` and join the results with single spaces."""
  return ' '.join(ps.stem(word) for word in text.split())
  

In [20]:
# Replace every tag string with its stemmed form.
new_df['tags'] = new_df['tags'].map(stem)

In [21]:
# Give every row a 0-based sequential id in its current order. The id doubles as the
# row number into the similarity matrix built from this frame.
# Assigning the whole column at once keeps the ids as integers; filling a new column
# one row at a time would leave them as floats.
new_df['laptop_id'] = range(len(new_df))

In [22]:
# Move `laptop_id` to the front so it is the first column of the frame.
id_column = new_df.pop('laptop_id')
new_df.insert(loc=0, column='laptop_id', value=id_column)

In [None]:
# Preview the frame with the stemmed tags and the leading laptop_id column.
new_df.head()

In [23]:
# Fit the vectorizer on the tags, then convert the resulting count matrix to a dense array.
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

In [24]:
# Use the cosine similarity between tag vectors to measure how alike two laptops are for recommendation.
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares laptop row i with laptop row j.
similarity = cosine_similarity(vectors)

In [26]:
# Recommendation function:
# Input -> laptop_brand, laptop_model
# Processing -> look up the requested laptop, sort all laptops by their similarity to it, and keep the top 10.
# Output -> prints the 10 most similar laptops as [brand, model] pairs.
# for i in new_df.index:
#   new_df.loc[i,'laptop_id'] = int(new_df.loc[i,'laptop_id'])
def recommend_function(laptop_brand,laptop_model,top_n=10):
  """Print the laptops most similar to the one identified by brand and model.

  The first row of `new_df` matching both `laptop_brand` and `laptop_model` is the
  query. Its `laptop_id` selects a row of the precomputed `similarity` matrix, and
  the entries of that row are ranked from most to least similar.

  Parameters:
    laptop_brand: value to match against the `brand` column.
    laptop_model: value to match against the `model` column.
    top_n: how many ranked entries to print (default 10). The ranking is not
      filtered, so the query laptop itself can be among the entries.

  Prints:
    A list of [brand, model] pairs, best match first.

  Raises:
    IndexError: if no row matches the given brand and model.
  """
  matches = new_df[(new_df['brand']==laptop_brand) & (new_df['model']==laptop_model)]
  if matches.empty:
    # Same exception type a bare `.index[0]` lookup would raise, but with a usable message.
    raise IndexError(f"no laptop found for brand={laptop_brand!r}, model={laptop_model!r}")
  query_id = int(matches['laptop_id'].iloc[0])

  distances = similarity[query_id]
  # sorted() is stable, so equally similar laptops stay in ascending laptop_id order.
  ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)[:top_n]

  laptop_recommendation = []
  for candidate_id, _score in ranked:
    # Select the row by its laptop_id directly; no detour through the index label.
    row = new_df.loc[new_df['laptop_id'] == candidate_id, ['brand','model']].iloc[0]
    laptop_recommendation.append(row.tolist())
  print(laptop_recommendation)
# Example query: print the laptops most similar to the ASUS ExpertBook.
recommend_function("ASUS","ExpertBook")
# sorted(list(similarity[0]),reverse = True,key = lambda x:x[1])[1:6]

[['ASUS', 'ExpertBook'], ['ASUS', 'VivoBook'], ['Lenovo', 'v15'], ['ASUS', 'VivoBook'], ['ASUS', 'VivoBook'], ['ASUS', 'VivoBook'], ['HP', '15q'], ['ASUS', 'VivoBook'], ['Lenovo', 'IdeaPad'], ['ASUS', 'VivoBook']]


In [28]:
# Distinct brand names present in the cleaned data.
set(new_df['brand'].unique())

{'APPLE',
 'ASUS',
 'Avita',
 'DELL',
 'HP',
 'Infinix',
 'Lenovo',
 'MSI',
 'Mi',
 'Nokia',
 'RedmiBook',
 'Smartron',
 'acer',
 'realme'}

In [38]:
import pickle 

In [41]:
# Save the cleaned laptop table as a plain dict of columns.
# The `with` block closes the file, so the pickle is fully flushed to disk.
with open('laptop_dict.pkl','wb') as laptop_file:
  pickle.dump(new_df.to_dict(), laptop_file)

{'brand': {5: 'Avita',
  6: 'HP',
  8: 'HP',
  10: 'ASUS',
  12: 'Lenovo',
  17: 'ASUS',
  26: 'HP',
  27: 'HP',
  33: 'HP',
  34: 'HP',
  36: 'Lenovo',
  37: 'ASUS',
  38: 'ASUS',
  40: 'DELL',
  41: 'RedmiBook',
  42: 'HP',
  43: 'acer',
  44: 'realme',
  46: 'ASUS',
  47: 'Lenovo',
  48: 'Lenovo',
  49: 'DELL',
  50: 'ASUS',
  51: 'Lenovo',
  52: 'DELL',
  53: 'Infinix',
  54: 'DELL',
  55: 'Lenovo',
  56: 'ASUS',
  57: 'Lenovo',
  58: 'DELL',
  60: 'Lenovo',
  61: 'DELL',
  62: 'DELL',
  63: 'DELL',
  64: 'DELL',
  65: 'Lenovo',
  67: 'ASUS',
  69: 'Lenovo',
  70: 'Lenovo',
  99: 'ASUS',
  101: 'HP',
  102: 'ASUS',
  105: 'ASUS',
  106: 'RedmiBook',
  107: 'ASUS',
  108: 'ASUS',
  109: 'ASUS',
  110: 'ASUS',
  112: 'DELL',
  117: 'HP',
  119: 'ASUS',
  120: 'Lenovo',
  121: 'ASUS',
  124: 'ASUS',
  127: 'DELL',
  128: 'DELL',
  129: 'ASUS',
  130: 'Lenovo',
  145: 'DELL',
  146: 'HP',
  147: 'ASUS',
  148: 'Lenovo',
  149: 'DELL',
  150: 'DELL',
  151: 'DELL',
  152: 'HP',
  153: '

In [44]:
# Save the similarity matrix next to the laptop table.
# The `with` block closes the file, so the pickle is fully flushed to disk.
with open('similarity.pkl','wb') as similarity_file:
  pickle.dump(similarity, similarity_file)