# Next24 Technology
## Data Analyst
## Task 3: E-commerce Product Recommendation System
## Arti Gupta


# Description of the Dataset
It includes 5 variables.These variables are described below:

 1. id: A unique identifier for each record in the dataset.
 2. product_uid: This ID is associated with specific products listed on the platform.
 3. product_title: The title or name of the product.
 4. search_term:The search query entered by the user.
 5. relevance: A relevance score indicating how well the product matches the search term. 

# Goal of the Project
The goal of this project is to build an E-commerce Product Recommendation System. The purpose of this recommendation engine is to suggest personalized product recommendations to users based on their search behavior and other interactions on the e-commerce platform.

In [4]:
import pandas as pd
import numpy as np

# Load the Dataset

In [5]:
df = pd.read_csv("/kaggle/input/recommendation/recommendation.csv",encoding='ISO-8859-1')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74067 entries, 0 to 74066
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             74067 non-null  int64  
 1   product_uid    74067 non-null  int64  
 2   product_title  74067 non-null  object 
 3   search_term    74067 non-null  object 
 4   relevance      74067 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.8+ MB


In [7]:
df.isnull().sum()

id               0
product_uid      0
product_title    0
search_term      0
relevance        0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,id,product_uid,relevance
count,74067.0,74067.0,74067.0
mean,112385.709223,142331.911553,2.381634
std,64016.57365,30770.774864,0.533984
min,2.0,100001.0,1.0
25%,57163.5,115128.5,2.0
50%,113228.0,137334.0,2.33
75%,168275.5,166883.5,3.0
max,221473.0,206650.0,3.0


In [9]:
df

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67
...,...,...,...,...,...
74062,221457,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...,tv riser glass,1.00
74063,221458,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.00
74064,221463,206641,Schlage Camelot In-Active Aged Bronze Handlese...,schlage lock siena half dummy knob with,2.33
74065,221471,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...,zen garden decor,3.00


In [10]:
df.shape

(74067, 5)

In [11]:
df["product_title"].value_counts()

product_title
Lithonia Lighting All Season 4 ft. 2-Light Grey T8 Strip Fluorescent Shop Light                                          21
Pressure-Treated Timber #2 Southern Yellow Pine (Common: 4 in. x 4 in. x 8 ft.; Actual: 3.56 in. x 3.56 in. x 96 in.)    21
2 in. x 4 in. x 96 in. Premium Kiln-Dried Whitewood Stud                                                                 18
Custom Building Products VersaBond Gray 50 lb. Fortified Thin-Set Mortar                                                 17
Ryobi ONE+ 18-Volt Lithium-Ion Cordless Drill/Driver and Impact Driver Kit (2-Tool)                                      17
                                                                                                                         ..
Con-Tact Creative Covering 18 in. x 24 ft. Granite Multipurpose Shelf Liner, 6 Per Pack                                   1
IDEAL Security Garage Door Safety Cables (2-Pack)                                                                     

In [12]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
import nltk
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [13]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['product_title'] = df['product_title'].apply(preprocess_text)
df['search_term'] = df['search_term'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df[["search_term","product_title"]]

Unnamed: 0,search_term,product_title
0,angl bracket,simpson strong-ti 12-gaug angl
1,l bracket,simpson strong-ti 12-gaug angl
2,deck,behr premium textur deckov 1-gal. #sc-141 tugb...
3,rain shower head,delta vero 1-handl shower faucet trim kit chro...
4,shower faucet,delta vero 1-handl shower faucet trim kit chro...
...,...,...
74062,tv riser glass,atlant windowpan 576 cd 192 dvd blu-ray game m...
74063,r20 halogen light,philip 40-watt halogen r20 flood light bulb (1...
74064,schlage lock siena half dummi knob,schlage camelot in-act age bronz handleset lef...
74065,zen garden decor,plastec 11 in. x 24 in. rose garden wall decor...


In [15]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=df['product_title'].tolist() + df['search_term'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

In [18]:
def get_avg_word2vec(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return [0] * vector_size
    return np.mean(vectors, axis=0)

df['product_title'] = df['product_title'].apply(lambda x: get_avg_word2vec(x, w2v_model, 100))
df['search_term'] = df['search_term'].apply(lambda x: get_avg_word2vec(x, w2v_model, 100))


In [20]:
df['combined_features'] = df.apply(lambda x: x['product_title'] + x['search_term'], axis=1)


In [21]:
x = pd.DataFrame(df['combined_features'].tolist())
y = df['relevance']

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [23]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
9558,0.240725,-0.020447,0.026145,0.419725,-0.140946,-0.018750,0.604147,-0.247286,-0.221174,-0.064689,...,-0.281688,-0.404865,0.175736,-0.151630,0.162907,0.099145,-0.057786,0.059900,0.158184,0.031869
35894,0.217974,0.075786,0.009554,0.498836,-0.347510,-0.067776,0.650645,-0.338188,-0.146677,-0.060369,...,-0.220396,-0.274217,0.007262,-0.048489,0.167759,0.266011,-0.058468,-0.163944,0.224356,0.188719
47152,0.541874,0.599247,0.565298,1.496762,-0.605328,-0.960368,0.962538,-0.018066,-0.340479,0.082298,...,-0.965731,-0.489256,0.169695,-1.199982,0.729075,0.324044,0.129125,0.029952,0.326338,1.225768
37545,0.520136,-0.089263,0.137114,0.424368,0.011843,0.193087,0.364267,-0.240751,-0.081578,-0.095955,...,-0.119733,-0.354986,0.103930,0.196998,0.189558,0.206632,-0.413354,-0.135289,0.079998,0.010418
9548,0.426314,-0.343330,0.023209,0.326426,-0.111864,0.130467,0.370927,-0.107079,-0.128394,-0.241619,...,-0.210412,-0.387611,0.096750,0.167616,0.054713,0.293424,-0.350400,-0.408843,0.055688,0.047956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,0.347858,-0.033246,-0.099464,0.491577,-0.060838,0.025191,0.525625,-0.268064,-0.136552,-0.067861,...,-0.272922,-0.312547,0.045399,0.131525,0.195984,0.234092,-0.188982,-0.201468,0.192405,0.135968
6265,0.284751,0.149775,0.117719,0.615242,-0.079216,-0.044265,0.491153,-0.145957,-0.160882,-0.067659,...,-0.246629,-0.420237,0.021610,-0.068738,0.197555,0.387766,-0.420131,-0.190022,-0.082274,0.223058
54886,0.320173,-0.201390,-0.165704,0.358504,0.098345,0.246419,0.464988,-0.172609,0.005041,-0.139270,...,0.006645,-0.335409,0.078840,0.289310,0.071750,0.295422,-0.424577,-0.277194,0.050513,-0.129904
860,0.295149,0.008632,0.062520,0.350078,-0.082020,0.112299,0.429389,-0.101355,-0.125956,-0.103019,...,-0.115867,-0.241121,0.033284,0.218100,0.344008,0.351764,-0.312395,-0.227636,0.080559,0.156232


In [24]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

In [25]:
predictions = model.predict(x_test)

In [28]:
def recommend_products(search_term, df, model, w2v_model, top_n=5):
    search_term_tokens = preprocess_text(search_term)
    search_term_vec = get_avg_word2vec(search_term_tokens, w2v_model, 100)
    df['predicted_relevance'] = model.predict(pd.DataFrame(df['product_title'].tolist()) + search_term_vec)
    recommendations = df.sort_values(by='predicted_relevance', ascending=False).head(top_n)
    return recommendations[['product_uid', 'product_title', 'predicted_relevance']]

In [29]:
recommendations = recommend_products("angl bracket", df, model, w2v_model)
print(recommendations)

      product_uid                                      product_title  \
2624       101370  [0.1366478, 0.04278717, 0.021695744, 0.2783467...   
5818       103250  [0.12955265, 0.13811608, 0.12461213, 0.4573598...   
0          100001  [0.09494834, 0.0019198467, 0.061701015, 0.2859...   
1          100001  [0.09494834, 0.0019198467, 0.061701015, 0.2859...   
1212       100664  [0.09494834, 0.0019198467, 0.061701015, 0.2859...   

      predicted_relevance  
2624             2.809900  
5818             2.786400  
0                2.764185  
1                2.764185  
1212             2.764185  
