In [1]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



In [2]:
# Download NLTK resources (only required once)
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

data = pd.read_csv('amazon_co-ecommerce_sample.csv') 

print(data.head())

                            uniq_id  \
0  eac7efa5dbd3d667f26eb3d3ab504464   
1  b17540ef7e86e461d37f3ae58b7b72ac   
2  348f344247b0c1a935b1223072ef9d8a   
3  e12b92dbb8eaee78b22965d2a9bbbd9f   
4  e33a9adeed5f36840ccc227db4682a36   

                                        product_name manufacturer   price  \
0                              Hornby 2014 Catalogue       Hornby   £3.42   
1  FunkyBuys® Large Christmas Holiday Express Fes...    FunkyBuys  £16.99   
2  CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...          ccf   £9.99   
3     HORNBY Coach R4410A BR Hawksworth Corridor 3rd       Hornby  £39.99   
4  Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...       Hornby  £32.19   

  number_available_in_stock number_of_reviews  number_of_answered_questions  \
0                     5 new                15                           1.0   
1                       NaN                 2                           1.0   
2                     2 new                17                    

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   uniq_id                                      10000 non-null  object 
 1   product_name                                 10000 non-null  object 
 2   manufacturer                                 9993 non-null   object 
 3   price                                        8565 non-null   object 
 4   number_available_in_stock                    7500 non-null   object 
 5   number_of_reviews                            9982 non-null   object 
 6   number_of_answered_questions                 9235 non-null   float64
 7   average_review_rating                        9982 non-null   object 
 8   amazon_category_and_sub_category             9310 non-null   object 
 9   customers_who_bought_this_item_also_bought   8938 non-null   object 
 10 

In [5]:
print(data.columns)

Index(['uniq_id', 'product_name', 'manufacturer', 'price',
       'number_available_in_stock', 'number_of_reviews',
       'number_of_answered_questions', 'average_review_rating',
       'amazon_category_and_sub_category',
       'customers_who_bought_this_item_also_bought', 'description',
       'product_information', 'product_description',
       'items_customers_buy_after_viewing_this_item',
       'customer_questions_and_answers', 'customer_reviews', 'sellers'],
      dtype='object')


In [6]:
def preprocess_text(text):
    
    if pd.isna(text):
        return ""
    
    text = text.lower()
   
    tokens = word_tokenize(text)
   
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(word) for word in tokens]
   
    preprocessed_text = ' '.join(stemmed_tokens)
    return preprocessed_text


In [7]:
data['cleaned_description'] = data['product_description'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_description'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)



In [8]:
def get_recommendations(product_name, cosine_sim=cosine_sim):
    idx = data[data['product_name'] == product_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    product_indices = [i[0] for i in sim_scores]
    return data['product_name'].iloc[product_indices]

product_name = "Hornby 2014 Catalogue"
recommendations = get_recommendations(product_name)
print(recommendations)

64      Hornby R3064 RailRoad BR Smokey Joe 00 Gauge S...
5557    Magic The Gathering MTG Holiday Gift Box 2014 ...
5707                                Hornby - Track Rubber
5722    Hornby R2675 00 Gauge LNER Flying Scotsman Rai...
3339    Rubie's - Toy Story - I-883770S -Buzz Lightyea...
9772     Hama - 205-67 - Bag of 6000 Beads 22 Colours Mix
52                  Hornby Digital 15 V 4 Amp Transformer
2150    Hot Wheels Monster Jam HW OFF Road Series Brut...
5701    Jadlam Racing New HORNBY TRACK R607 8 x DOUBLE...
4851            Disney Frozen Magical Sticker Book Scenes
Name: product_name, dtype: object


In [9]:
def get_recommendations_for_keyword(keyword, data):
    recommendations = []
    keyword_tokens = word_tokenize(keyword.lower())
    print("Keyword tokens:", keyword_tokens)
    for index, row in data.iterrows():
        product_name_tokens = word_tokenize(row['product_name'].lower())
        print("Product name tokens:", product_name_tokens)
        if all(word in product_name_tokens for word in keyword_tokens):
            recommendations.append(row['product_name'])
    return recommendations


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel



In [11]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['product_name'])



In [13]:
# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)




In [14]:
def get_recommendations_for_keyword(keyword, data, cosine_sim):
    keyword_vector = tfidf_vectorizer.transform([keyword])
    sim_scores = linear_kernel(keyword_vector, tfidf_matrix).flatten()
    sim_indices = sim_scores.argsort()[::-1]
    recommendations = [data.iloc[idx]['product_name'] for idx in sim_indices]
    return recommendations



In [15]:

def get_recommendations_from_user_input():
    
    user_input = input("Enter a product name: ")
    recommendations = get_recommendations_for_keyword(user_input, data, cosine_sim)
    print("Similar products for '{}' are:".format(user_input))
    for recommendation in recommendations:
        print(recommendation)


get_recommendations_from_user_input()

Enter a product name: train
Similar products for 'train' are:
SL Train (4-Car Set) (Model Train)
Dinosaur Train Conductor with Train Car
Commuter Train Series 103 (Blue) (3-Car Set) (Model Train)
Commuter Train Series 103 (Orange) (3-Car Set) (Model Train)
Plarail - S-24 Series 485 Limited Express Train (Model Train)
Kato N Gauge Train Set Case (Kato PlaRail Model Train) [Toy]
Mexican Train Accessory Set
Toy Train Set High Speed EMU Toy Train with Flashing Light & Music Kids Toys Gift
B Train Shorty DD51 Diesel Locomotive Renewal Car A(Blue) & B(Red) (2-Car Set) (Model Train)
CAT Construction Express Train
Hornby Santa's Express Train Set
Hornby Gauge Western Express Digital Train Set with eLink and TTS Loco Train Set
Plan Toys Cargo Train
Plan Toys Modern Train
Melissa & Doug Passenger Train
Rail King Intelligent Classical Train Track Set With Light ,Smoke & Train Sound Huge Size XMAS GIFT
Plan Toys Fuel Train
Bigjigs Rail Freight Train Set
Carousel Wooden Train Set (100 Piece)
Track 