In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
# Read the assignment workbook into a DataFrame and preview its first rows
df = pd.read_excel(io="Assignment - ML - Supporting Data.xlsx")
df.head(n=5)

Unnamed: 0,item_id,tags,hashtags,interacted,interactions
0,64253c1f8b8de00a9072b3a6,Topwear | Men T-Shirts | white | printed | reg...,#tshirt #topwear #menswear #gandhinagar #ludhiana,No,0
1,6423c06e9ef5a81769053d77,Topwear | Men T-Shirts | maroon | cotton | pri...,#tshirt #topwear #menswear #gandhinagar #ludhiana,No,0
2,6424f5cd3a36c8270f258de8,Women Topwear | pink | regular | casual | 3/4 ...,#kidswear #ladieswear #tops #fancytops #plazzo,No,0
3,64251ffe9ef5a8176905defe,Indian & Fusionwear | Kurtis | green | festive...,#Designerkurti #wholesale #surat #womenwears,No,0
4,6425ac9d6fea7019f37f03cf,Indian & Fusionwear | Kurtis | yellow | casual...,#Designerkurti #wholesale #surat #womenwears,No,0


In [19]:
# (n_rows, n_columns) of the loaded frame
df.shape

(998, 5)

In [20]:
# Per-column dtypes and non-null counts, to spot missing values before building features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   item_id       998 non-null    object
 1   tags          998 non-null    object
 2   hashtags      997 non-null    object
 3   interacted    998 non-null    object
 4   interactions  998 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 39.1+ KB


In [21]:
def important_features(dataset):
    """Return a copy of ``dataset`` with a combined text column named ``feature``.

    Missing values in the ``tags`` and ``hashtags`` columns are replaced with
    empty strings, and ``feature`` is built as ``tags + ' ' + hashtags``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing string columns ``tags`` and ``hashtags``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataset`` itself is left unmodified.

    Raises
    ------
    KeyError
        If ``tags`` or ``hashtags`` is not a column of ``dataset``.
    """
    data = dataset.copy()
    # Column-wise operations already cover every row, so no per-row loop is
    # needed; this also guarantees `feature` exists for an empty frame.
    # Assigning the filled column back (rather than chained `inplace=True`)
    # keeps working under pandas Copy-on-Write.
    data["tags"] = data["tags"].fillna("")
    data["hashtags"] = data["hashtags"].fillna("")
    data["feature"] = data["tags"] + " " + data["hashtags"]
    return data

In [22]:
# Rebuild df with the combined text column, then display that column
df = important_features(dataset=df)
df.loc[:, "feature"]

0      Topwear | Men T-Shirts | white | printed | reg...
1      Topwear | Men T-Shirts | maroon | cotton | pri...
2      Women Topwear | pink | regular | casual | 3/4 ...
3      Indian & Fusionwear | Kurtis | green | festive...
4      Indian & Fusionwear | Kurtis | yellow | casual...
                             ...                        
993    Topwear | Sweatshirts | multi color | regular ...
994    Boys Clothing | Kids Jeans #jeans #boysjeans #...
995    Bottomwear | Men Jeans #menswear #mensjeans #j...
996    Bottomwear | Sports & Activewear | Men Jeans |...
997    Topwear | Sweatshirts | black | regular | wedd...
Name: feature, Length: 998, dtype: object

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Instantiate the TF-IDF vectoriser with its default settings
# (the class itself is imported in the first cell)
vec=TfidfVectorizer()

In [25]:
# Fit the vocabulary and IDF weights on the training text only,
# then project the held-out text with that same fitted vectoriser
tfidf_matrix_train = vec.fit_transform(raw_documents=train_data["feature"])
tfidf_matrix_test = vec.transform(raw_documents=test_data["feature"])


In [26]:
# Pairwise cosine similarity: one row per test item, one column per training item
similarity = cosine_similarity(X=tfidf_matrix_test, Y=tfidf_matrix_train)
similarity

array([[0.02935089, 0.45771933, 0.04202034, ..., 0.        , 0.02670476,
        0.06381131],
       [0.08461481, 0.0314042 , 0.03983023, ..., 0.        , 0.01992073,
        0.06580773],
       [0.09715775, 0.01429901, 0.91373989, ..., 0.0869278 , 0.20426244,
        0.34952908],
       ...,
       [0.01343592, 0.04011363, 0.03851834, ..., 0.        , 0.03635961,
        0.05753952],
       [0.09703048, 0.11594089, 0.25135105, ..., 0.0437215 , 0.47877051,
        0.34901675],
       [0.03170018, 0.05931547, 0.05510418, ..., 0.35420955, 0.05201591,
        0.06156474]])

In [27]:
# Sanity check: the similarity column count must equal the length of the
# training interaction vector it is multiplied with in the next step
print(f"sim shape: {similarity.shape}")
print(f"interactions shape: {train_data['interactions'].shape}")

sim shape: (300, 698)
interactions shape: (698,)


In [28]:
# 1 for every training item that has at least one interaction, else 0
user_interactions = (train_data['interactions'] > 0).astype(int).values

# Hyperparameter1
similarity_threshold = 0.3

# For each test item (row), mark the training items (columns) considered similar
neighbour_mask = similarity > similarity_threshold
neighbour_counts = neighbour_mask.sum(axis=1)
interacted_neighbour_counts = (user_interactions * neighbour_mask).sum(axis=1)

# Interaction rate = share of similar training items that were interacted with.
# A test item with no neighbour above the threshold would be 0/0 (NaN plus a
# RuntimeWarning); the guarded divide leaves its rate at 0.0 instead.
interaction_rates = np.divide(
    interacted_neighbour_counts,
    neighbour_counts,
    out=np.zeros(neighbour_counts.shape, dtype=float),
    where=neighbour_counts > 0,
)

interaction_rates

array([0.        , 0.        , 0.00689655, 0.        , 0.        ,
       0.        , 1.        , 0.02      , 0.        , 0.        ,
       1.        , 0.01145038, 0.86666667, 0.        , 0.84210526,
       0.01219512, 0.9625    , 0.        , 0.        , 0.        ,
       0.01111111, 0.        , 1.        , 1.        , 1.        ,
       0.        , 0.02631579, 0.        , 0.91836735, 0.11363636,
       0.        , 0.11111111, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.0212766 , 0.        , 0.        ,
       0.01724138, 0.        , 1.        , 0.        , 0.04      ,
       0.        , 0.        , 0.        , 0.19047619, 0.58333333,
       0.07142857, 0.        , 0.        , 0.00980392, 0.01      ,
       0.015625  , 0.00425532, 0.        , 0.02439024, 0.        ,
       0.00873362, 1.        , 1.        , 0.02272727, 0.01970443,
       0.41666667, 0.01449275, 0.        , 0.00598802, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.125  

In [29]:
# Hyperparameter2: minimum interaction rate among similar training items
# for a test item to be predicted as interacted
interaction_threshold=0.3
# Predict 1 where the rate exceeds the threshold. The comparison uses the
# variable rather than a repeated literal, so tuning it actually takes effect.
predicted_interactions = (interaction_rates > interaction_threshold).astype(int)
predicted_interactions

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1])

In [30]:
# Ground truth for the held-out items: 1 if the item has at least one interaction, else 0
actual_labels = test_data['interactions'].gt(0).astype(int).to_numpy()
actual_labels

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1])

In [31]:
# Accuracy = fraction of held-out items whose predicted label equals the actual label
accuracy = (predicted_interactions == actual_labels).mean()
accuracy

0.98