# Import Library

In [64]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import numpy as np
from sklearn.model_selection import train_test_split


# Import dataset

In [65]:
df=pd.read_csv(r"..\DATASET\recommendation system\1000_dummy_product.csv")
df.head()

Unnamed: 0,ID,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
0,1,Kamera Mirrorless,Sony,Camera,200000,"Fotografi, Ringan, Kinerja",5
1,2,Speaker Bluetooth,JBL,Speaker,100000,"Bergaya, Portabel",3
2,3,Proyektor Mini,Epson,Proyektor,150000,"Kinerja, Mudah Digunakan",2
3,4,Playstation 5,Sony,Playstation,250000,"Mewah, Gaming",4
4,5,Tenda Camping 4 Orang,Coleman,Tenda,50000,"Nyaman, Ringan, Tahan Lama",6


In [66]:
df.tail()

Unnamed: 0,ID,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
995,996,Tas Camping Deuter Aircontact Lite 40+10,Deuter,Tas Camping,1700000,"Tahan Air,Tahan Lama",5
996,997,Kamera Mirrorless Fujifilm X-T4,Fujifilm,Camera,8000000,"Fotografi,Kualitas Tinggi",3
997,998,Speaker Bluetooth Sony SRS-XB43,Sony,Speaker,3000000,"Portabel,Suara Kualitas Tinggi",4
998,999,Proyektor HD ViewSonic PX747-4K,ViewSonic,Proyektor,6500000,"Kualitas Gambar Tinggi, Terang",3
999,1000,PlayStation 4 Pro,Sony,Playstation,4000000,"Gaming,Kualitas Grafis Tinggi",3


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   1000 non-null   int64 
 1   Deskripsi Produk     1000 non-null   object
 2   Merek                1000 non-null   object
 3   Kategori             1000 non-null   object
 4   Harga Sewa           1000 non-null   int64 
 5   Preferensi Pengguna  1000 non-null   object
 6   Jumlah Stok          1000 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 54.8+ KB


# Prepocessing

### Data Cleaning

In [68]:
df.Kategori.unique()

array(['Camera', 'Speaker', 'Proyektor', 'Playstation', 'Tenda', 'Sepatu',
       'Matras', 'Tas', 'Furniture', 'Headset', 'Laptop', 'Mesin Cuci',
       'Smartphone', 'Sepeda', 'TV', 'Soundbar', 'Alat Musik',
       'Smartwatch', 'Perkakas', 'Dapur', 'Outdoor', 'Videocam', 'Audio',
       'Koper', 'Perangkat Keamanan', 'Printer', 'Monitor', 'Drone',
       'Alat Fitness', 'Perangkat Streaming', 'Mikrofon', 'Kamera Aksi',
       'Keyboard', 'Smart TV', 'Mesin Jahit', 'Kulkas', 'Kamera',
       'Tas Camping'], dtype=object)

In [69]:
df.Kategori.value_counts()

Speaker                140
Proyektor              105
Tenda                  104
Sepatu                 104
Matras                 103
Playstation            101
Kamera                  81
Tas                     67
Camera                  42
Tas Camping             36
Dapur                   22
Alat Musik              12
Laptop                  12
Perkakas                 8
Furniture                7
Smartphone               7
Outdoor                  7
Smart TV                 5
Monitor                  4
Drone                    4
Alat Fitness             4
Kulkas                   3
Mesin Jahit              3
Printer                  2
Perangkat Streaming      2
Headset                  2
Mikrofon                 2
Keyboard                 1
Kamera Aksi              1
Sepeda                   1
TV                       1
Mesin Cuci               1
Koper                    1
Audio                    1
Videocam                 1
Smartwatch               1
Soundbar                 1
P

In the "category" attribute there are several values that have similarities so that they can be combined into one value such as:
- 'camera' , 'Kamera Aksi' rename to 'Kamera'
- 'Tas Camping' rename to 'Tas'

In [70]:
for column in df.Kategori:
    df['Kategori'] = df['Kategori'].replace('Camera', 'Kamera')
    df['Kategori'] = df['Kategori'].replace('Kamera Aksi', 'Kamera')
    df['Kategori'] = df['Kategori'].replace('Tas Camping', 'Tas')


In [71]:
df.Kategori.value_counts()

Speaker                140
Kamera                 124
Proyektor              105
Tenda                  104
Sepatu                 104
Matras                 103
Tas                    103
Playstation            101
Dapur                   22
Alat Musik              12
Laptop                  12
Perkakas                 8
Outdoor                  7
Smartphone               7
Furniture                7
Smart TV                 5
Alat Fitness             4
Drone                    4
Monitor                  4
Mesin Jahit              3
Kulkas                   3
Printer                  2
Perangkat Streaming      2
Mikrofon                 2
Headset                  2
Videocam                 1
Audio                    1
Koper                    1
Perangkat Keamanan       1
Soundbar                 1
TV                       1
Sepeda                   1
Keyboard                 1
Mesin Cuci               1
Smartwatch               1
Name: Kategori, dtype: int64

### Feature Selection

In [72]:
# DROP "ID" 
df1 = df.drop(['ID'], axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Deskripsi Produk     1000 non-null   object
 1   Merek                1000 non-null   object
 2   Kategori             1000 non-null   object
 3   Harga Sewa           1000 non-null   int64 
 4   Preferensi Pengguna  1000 non-null   object
 5   Jumlah Stok          1000 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 47.0+ KB


### Normalization

In [73]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Take the "Harga Sewa" column and reshape it into an array
harga_sewa = df1['Harga Sewa'].values.reshape(-1, 1)

# Perform Min-Max Scaling on the "Harga Sewa" column
harga_sewa_scaled = scaler.fit_transform(harga_sewa)

# Replace the values in the "Harga Sewa" column with the scaled values
df1['Harga Sewa'] = harga_sewa_scaled

df1.head()


Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
0,Kamera Mirrorless,Sony,Kamera,0.008513,"Fotografi, Ringan, Kinerja",5
1,Speaker Bluetooth,JBL,Speaker,0.003505,"Bergaya, Portabel",3
2,Proyektor Mini,Epson,Proyektor,0.006009,"Kinerja, Mudah Digunakan",2
3,Playstation 5,Sony,Playstation,0.011017,"Mewah, Gaming",4
4,Tenda Camping 4 Orang,Coleman,Tenda,0.001002,"Nyaman, Ringan, Tahan Lama",6


### Text Standardization

In [74]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [75]:
# Initialize WordNetLemmatizers
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text into words
    tokens = nltk.word_tokenize(text)
    # Lemmatize words
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

# Standardize text in "Deskripsi Produk" attribute
df1['Deskripsi Produk'] = df1['Deskripsi Produk'].apply(preprocess_text)

# Standardize text in "Preferensi Pengguna" attribute
df1['Preferensi Pengguna'] = df1['Preferensi Pengguna'].apply(preprocess_text)


In [76]:
df1.head()

Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
0,kamera mirrorless,Sony,Kamera,0.008513,fotografi ringan kinerja,5
1,speaker bluetooth,JBL,Speaker,0.003505,bergaya portabel,3
2,proyektor mini,Epson,Proyektor,0.006009,kinerja mudah digunakan,2
3,playstation 5,Sony,Playstation,0.011017,mewah gaming,4
4,tenda camping 4 orang,Coleman,Tenda,0.001002,nyaman ringan tahan lama,6


### Label Encoding

In [77]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encoding attribute "Merek"
df1['Merek'] = label_encoder.fit_transform(df1['Merek'])

# Encoding attribute "Kategori"
df1['Kategori'] = label_encoder.fit_transform(df1['Kategori'])

df1.head()

Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
0,kamera mirrorless,119,7,0.008513,fotografi ringan kinerja,5
1,speaker bluetooth,58,30,0.003505,bergaya portabel,3
2,proyektor mini,37,23,0.006009,kinerja mudah digunakan,2
3,playstation 5,119,21,0.011017,mewah gaming,4
4,tenda camping 4 orang,26,33,0.001002,nyaman ringan tahan lama,6


# Feature Representation

In [78]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# TF-IDF representation for "Deskripsi Produk" feature
deskripsi_produk_tfidf = tfidf_vectorizer.fit_transform(df1['Deskripsi Produk'])

# TF-IDF representation for "Preferensi Pengguna" feature
preferensi_pengguna_tfidf = tfidf_vectorizer.fit_transform(df1['Preferensi Pengguna'])

# Display TF-IDF representation
print("TF-IDF representation for Deskripsi Produk:")
print(deskripsi_produk_tfidf)


TF-IDF representation for Deskripsi Produk:
  (0, 258)	0.7638715396014307
  (0, 201)	0.6453683219580426
  (1, 71)	0.7311768492744607
  (1, 362)	0.6821879616975607
  (2, 257)	0.8269669002582273
  (2, 318)	0.562250607716256
  (3, 310)	1.0
  (4, 287)	0.5929281422476479
  (4, 83)	0.5410735530002388
  (4, 389)	0.5963854696204887
  (5, 279)	0.9115537607917187
  (5, 344)	0.4111809105326683
  (6, 396)	0.5773502691896257
  (6, 47)	0.5773502691896257
  (6, 243)	0.5773502691896257
  (7, 39)	0.3973893769189186
  (7, 49)	0.5833773282542822
  (7, 322)	0.6453733120327367
  (7, 384)	0.2919689437368037
  (8, 121)	0.809367920986093
  (8, 201)	0.5873019397879164
  (9, 313)	0.8358853756860039
  (9, 362)	0.5489040341574
  (10, 226)	0.7380596781032934
  (10, 216)	0.6747354381963814
  :	:
  (993, 344)	0.25639102281719106
  (994, 350)	0.5893423610719554
  (994, 396)	0.4664316961235706
  (994, 47)	0.4664316961235706
  (994, 243)	0.4664316961235706
  (995, 15)	0.4692441679620214
  (995, 228)	0.4692441679620214


In [79]:
print("TF-IDF representation for Preferensi Pengguna:")
print(preferensi_pengguna_tfidf)

TF-IDF representation for Preferensi Pengguna:
  (0, 62)	0.6360067288303902
  (0, 118)	0.5950587820898929
  (0, 34)	0.4913252351957509
  (1, 99)	0.5031899935419127
  (1, 11)	0.8641758098901462
  (2, 20)	0.5702079123319302
  (2, 80)	0.4200176127639481
  (2, 62)	0.7060085988726594
  (3, 38)	0.6760467647511214
  (3, 75)	0.7368587190700414
  (4, 69)	0.38168181274084656
  (4, 128)	0.432993950517232
  (4, 87)	0.4303434400617174
  (4, 118)	0.694002706215321
  (5, 87)	0.45293490364750666
  (5, 11)	0.8915435901052869
  (6, 102)	0.8570041022030946
  (6, 87)	0.5153095854018899
  (7, 1)	0.622684584389083
  (7, 128)	0.3561410399153048
  (7, 11)	0.6967262504400421
  (8, 37)	0.5307903723431686
  (8, 67)	0.3863357883134183
  (8, 34)	0.7543250223186619
  (9, 12)	0.6629284444215602
  :	:
  (990, 37)	0.5484802110220538
  (990, 67)	0.3992113379225319
  (991, 39)	0.6868814219576492
  (991, 44)	0.6412146413461756
  (991, 135)	0.3421077255671571
  (992, 90)	0.7396710664782948
  (992, 23)	0.6729685827844134
 

# Cosine Similarity

In [80]:
# Combine the two representation matrices into one matrix
feature_matrix = sp.hstack((deskripsi_produk_tfidf, preferensi_pengguna_tfidf))

# Compute similarities between items using cosine similarity
similarities = cosine_similarity(feature_matrix)

# Display the similarity matrix
print("Similarity Matrix:")
print(similarities)

Similarity Matrix:
[[1.         0.         0.22451311 ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.27803423 0.         0.        ]
 [0.22451311 0.         1.         ... 0.         0.10028602 0.        ]
 ...
 [0.         0.27803423 0.         ... 1.         0.16215878 0.06896961]
 [0.         0.         0.10028602 ... 0.16215878 1.         0.06045683]
 [0.         0.         0.         ... 0.06896961 0.06045683 1.        ]]


# Similarity Clustering

In [81]:
# Grouping Items Based on Similarity
def group_similarities(similarity_matrix, k):
    n_items = similarity_matrix.shape[0]
    similar_items = []

    for i in range(n_items):
        # Get the similarities of item i with other items
        similarities = similarity_matrix[i]

        # Sort the indices of items based on similarity values
        sorted_indices = np.argsort(similarities)[::-1]

        # Get the top k items with the highest similarity
        top_k_indices = sorted_indices[1:k+1]  # Exclude similarity with itself

        # Store the items with the highest similarity in a list
        similar_items.append(top_k_indices)

    return similar_items

# Example usage
k = 5  # Number of top items to be retrieved
similar_items = group_similarities(similarities, k)

# Display the results of similarity grouping
for i, items in enumerate(similar_items):
    print("Item", i+1, ":")
    for item in items:
        print("  - Item", item+1)


Item 1 :
  - Item 408
  - Item 176
  - Item 109
  - Item 435
  - Item 471
Item 2 :
  - Item 178
  - Item 111
  - Item 146
  - Item 157
  - Item 84
Item 3 :
  - Item 401
  - Item 173
  - Item 106
  - Item 437
  - Item 43
Item 4 :
  - Item 403
  - Item 412
  - Item 512
  - Item 430
  - Item 421
Item 5 :
  - Item 91
  - Item 147
  - Item 193
  - Item 126
  - Item 431
Item 6 :
  - Item 2
  - Item 8
  - Item 405
  - Item 468
  - Item 414
Item 7 :
  - Item 515
  - Item 563
  - Item 160
  - Item 539
  - Item 587
Item 8 :
  - Item 94
  - Item 161
  - Item 6
  - Item 2
  - Item 27
Item 9 :
  - Item 144
  - Item 190
  - Item 123
  - Item 83
  - Item 155
Item 10 :
  - Item 139
  - Item 90
  - Item 125
  - Item 192
  - Item 438
Item 11 :
  - Item 41
  - Item 16
  - Item 189
  - Item 122
  - Item 7
Item 12 :
  - Item 59
  - Item 42
  - Item 10
  - Item 96
  - Item 66
Item 13 :
  - Item 89
  - Item 55
  - Item 145
  - Item 152
  - Item 191
Item 14 :
  - Item 416
  - Item 46
  - Item 200
  - Item 133

# Content Based Filtering 

In [82]:
def content_based_filtering(item_id, similar_items, df):
    recommendations = []

    for item in similar_items[item_id]:
        recommendations.append(item)

    # Remove the item itself from the recommendations list
    recommendations = list(set(recommendations) - set([item_id]))

    # Get the product data based on the IDs if the IDs exist in the DataFrame index
    recommended_products = df[df.index.isin(recommendations)]

    return recommended_products


### Example Usage: Content-Based Filtering for Recommending Products".

In [83]:
print(df1.loc[1])

# Example usage
item_id = 1  # ID of the item to be recommended
recommended_products = content_based_filtering(item_id, similar_items, df1)

# Displaying recommended products
print("\n Recommended Products for Item", item_id+1, ":")
recommended_products



Deskripsi Produk       speaker bluetooth
Merek                                 58
Kategori                              30
Harga Sewa                      0.003505
Preferensi Pengguna     bergaya portabel
Jumlah Stok                            3
Name: 1, dtype: object

 Recommended Products for Item 2 :


Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
83,speaker bluetooth sony,119,30,0.006009,portabel suara jernih,5
110,speaker bluetooth jbl,58,30,0.008513,portabel suara kualitas tinggi,5
145,speaker bluetooth jbl,58,30,0.011017,portabel suara kualitas tinggi,5
156,speaker bluetooth sony,119,30,0.01352,portabel suara jernih,5
177,speaker bluetooth jbl,58,30,0.008513,portabel suara kualitas tinggi,5


In [84]:

print(df1.loc[562])

# Example usage
item_id = 562  # ID of the item to be recommended
recommended_products = content_based_filtering(item_id, similar_items, df1)

# Displaying recommended products
print("\n Recommended Products for Item", item_id+1, ":")
recommended_products




Deskripsi Produk       matras angin tidur double
Merek                                         13
Kategori                                      12
Harga Sewa                              0.008513
Preferensi Pengguna            nyaman tahan lama
Jumlah Stok                                    6
Name: 562, dtype: object

 Recommended Products for Item 563 :


Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
180,matras angin tidur queen,117,12,0.008513,nyaman tahan lama,6
514,matras angin tidur double,13,12,0.008513,nyaman tahan lama,6
530,matras angin tidur queen,13,12,0.011017,nyaman tahan lama,6
538,matras angin tidur double,57,12,0.008513,nyaman tahan lama,7


In [85]:
print(df1.loc[31])

# Example usage
item_id = 31  # ID of the item to be recommended
recommended_products = content_based_filtering(item_id, similar_items, df1)

# Displaying recommended products
print("\n Recommended Products for Item", item_id+1, ":")
recommended_products

Deskripsi Produk                   drone dji
Merek                                     30
Kategori                                   4
Harga Sewa                           0.01352
Preferensi Pengguna    kamera terbang stabil
Jumlah Stok                                4
Name: 31, dtype: object

 Recommended Products for Item 32 :


Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
61,drone dji mini,30,4,0.011017,portabel kualitas rekaman tinggi,3
103,drone dji phantom,30,4,0.023535,kamera terbang stabil,4
119,smartphone samsung galaxy,112,27,0.043565,performa tinggi kualitas kamera,3
170,drone dji phantom,30,4,0.023535,kamera terbang stabil,4
186,smartphone samsung galaxy,112,27,0.043565,performa tinggi kualitas kamera,3


In [86]:
print(df1.loc[900])

# Example usage
item_id = 900  # ID of the item to be recommended
recommended_products = content_based_filtering(item_id, similar_items, df1)

# Displaying recommended products
print("\n Recommended Products for Item", item_id+1, ":")
recommended_products

Deskripsi Produk       kamera mirrorless canon eos r6
Merek                                              21
Kategori                                            7
Harga Sewa                                   0.499249
Preferensi Pengguna          fotografikualitas tinggi
Jumlah Stok                                         2
Name: 900, dtype: object

 Recommended Products for Item 901 :


Unnamed: 0,Deskripsi Produk,Merek,Kategori,Harga Sewa,Preferensi Pengguna,Jumlah Stok
732,kamera mirrorless canon eos r6,21,7,0.599399,fotografikualitas tinggi,2
764,kamera mirrorless canon eos r6,21,7,0.599399,fotografikualitas tinggi,2
864,kamera mirrorless canon eos r6,21,7,0.499249,fotografikualitas tinggi,2
916,kamera mirrorless canon eos r6,21,7,0.499249,fotografikualitas tinggi,2
980,kamera mirrorless canon eos r6,21,7,0.499249,fotografikualitas tinggi,2


# Evaluate Model

In [87]:
def evaluate_model(actual_recommendations, predicted_recommendations):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for i in range(len(actual_recommendations)):
        actual = set(actual_recommendations[i])
        if i < len(predicted_recommendations):  
            predicted = set(predicted_recommendations[i])
            true_positives += len(actual.intersection(predicted))
            false_positives += len(predicted.difference(actual))
            false_negatives += len(actual.difference(predicted))
        else:
            false_negatives += len(actual)  

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Perhitungan akurasi
    total_predictions = true_positives + false_positives
    accuracy = true_positives / total_predictions if total_predictions != 0 else 0

    return precision, recall, f1_score, accuracy


In [88]:
#SPLIT DATA
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)


# Use the model to make recommendations on the test data
recommendations = []
for item_id in test_data.index:
    if any(item_id in items for items in similar_items):
        recommended_products = content_based_filtering(item_id, similar_items, train_data)
        recommendations.append(recommended_products)

# Convert the model's recommendation data to a list
predicted_recommendations = recommended_products['Deskripsi Produk'].tolist()
# Evaluate the model's performance with the test data
actual_recommendations = test_data['Deskripsi Produk'].tolist()
precision, recall, f1_score, accuracy = evaluate_model(actual_recommendations, predicted_recommendations)

# Display the evaluation results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)



Accuracy: 0.84
Precision: 0.84
Recall: 0.014588398749565822
F1-score: 0.02867872994195971
