In [1]:
import pandas as pd
import pyodbc
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import math
from sklearn.neighbors import NearestNeighbors
from statistics import mode

# Display options of pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Database Connection

In [2]:
class DBConnector:
    def __init__(self, server_name, username, password, db_name, connection_str):
        self.sever_name = server_name
        self.username = username
        self.password = password
        self.db_name = db_name
        self.connection_str = connection_str
        self.connection = pyodbc.connect(self.connection_str)

    def test_connection(self):
        try:
            db = self.connection
            cursor = db.cursor()
            cursor.execute("SELECT @@VERSION")
            results = cursor.fetchone()
            # Check if anything at all is returned
            if results:
                return True
            else:
                return False
        except pyodbc.Error as ex:
            print(ex)
            print("Error in connection!")
            sqlstate = ex.args[0]
            if sqlstate == '28000':
                print("LDAP Connection failed: check password")
        return False

    def query(self, query_str):
        """This function returns a dataframe of result list"""
        return pd.read_sql_query(query_str, self.connection)

    def all_columns_name(self, table_name):
        """This function returns a dataframe of result list"""
        query_str = f"select COLUMN_NAME from INFORMATION_SCHEMA.COLUMNS where TABLE_NAME='{table_name}'"
        print("All columns names: ", query_str)
        return pd.read_sql_query(query_str, self.connection)

In [3]:
connection_str = "Driver={SQL Server};Server={server_name};UID={username};PWD={password};Database={db_name};"
driver="SQL Server"
servername = 'QUOC-CUONG'
username = 'sa'
password = 'cuong300599'
db_name = 'OnlinePhoneShopJoin'
# str_for_connection = "Driver={SQL Server};Server=QUOC-CUONG;UID=sa;PWD=cuong300599;Database=OnlinePhoneShopJoin;"
str_for_connection = "Driver={driver};Server={servername};UID={username};PWD={password};Database={db_name};"\
    .format(driver=driver, servername=servername, username=username, password=password, db_name=db_name)

connector = DBConnector(servername, username, password, db_name, str_for_connection)
connect_success = connector.test_connection()
if (connect_success):
    print('Connect successfully!')
else:
    print('Connect failed!')

Connect successfully!


# Preprocessing data

In [4]:
query_str = "SELECT * FROM dbo.all_products"
df_all_products = connector.query(query_str)
# print("All product list:\n", df_all_products)
df_all_products.head()

Unnamed: 0,product_id,product_name,unit_price,quantity,discount,description,image,available,special,view_count,warranty,brand_id,manufacturer_id,category_id,common_coef,entertain_coef,gaming_coef,created_date,updated_date,imei_no,model,battery_power,bluetooth,clock_speed,front_cam,in_memory,n_cores,n_sim,other_specification,px_height,px_width,ram,refresh_rate,screen_height,screen_width,support_3g,support_4g,support_5g,touch_screen,wifi,compatible_devices,functions,label
0,PD041020210001,Máy tính bảng OPPO đời mới,2000000.0,88,0.0,Đời mới hiện đại,"b'RIFF\xc2\x03\x01\x00WEBPVP8X\n\x00\x00\x00,\...",True,False,36,24,OPPO,1,TABLET,,,,2021-10-04 14:46:31.8130000,2021-10-16 16:04:58.3610000,A234567890BCD34,Q123456,,,,,,,,,,,,,,,,,,,,,,1.0
1,PD041020210002,Tay nghe Bluetooth,2000000.0,80,0.0,"Đời mới hiện đại, đem lại trải nghiệm âm thanh...",,True,False,42,24,OPPO,1,ACCE,,,,2021-10-04 14:46:41.7080000,2021-10-16 16:04:58.6830000,,,,,,,,,,,,,,,,,,,,,,"Laptop, máy tính bảng SAMSUNG",Hỗ trợ nghe nhạc chất lượng cao,1.0
2,PD041020210003,Điện Thoại Vsmart Joy 4 - Hàng Chính Hãng,3059000.0,200,0.0,<p>Màn hình: Full HD+ Hệ điều hành: Android 10...,b'RIFF\xbc1\x00\x00WEBPVP8X\n\x00\x00\x00\x08\...,True,False,56,12,SAMSUNG,1,SMPH,1.0,0.6,0.3,2021-10-04 14:47:12.8600000,2021-10-16 16:04:58.9120000,ABCDE1234567867,A1234567,5000.0,True,2.0,13.0,256.0,4.0,2.0,"<figure class=""table""><table><tbody><tr><td>Bl...",1080.0,2340.0,4.0,120.0,6.7,6.7,True,True,,True,True,,,1.0
3,PD041020210004,Tay nghe Bluetooth 1,2000000.0,88,0.0,"Đời mới hiện đại, đem lại trải nghiệm âm thanh...",,True,False,34,24,OPPO,1,ACCE,,,,2021-10-04 14:47:19.7510000,2021-10-16 16:04:58.6860000,,,,,,,,,,,,,,,,,,,,,,"Laptop, máy tính bảng SAMSUNG",Hỗ trợ nghe nhạc chất lượng cao,1.0
4,PD041020210005,Máy tính bảng OPPO 1,10000000.0,200,0.12,"Máy tính bảng, RAM 4GB, CPU chip SnapDragon",,True,False,21,12,OPPO,1,TABLET,,,,2021-10-04 14:47:27.9030000,2021-10-16 16:04:58.9120000,ABCDE1234567864,A1234567,,,,,,,,,,,,,,,,,,,,,,


In [5]:
exclude_cols = ['quantity', 'description', 'image', 'available', 'special', 'view_count', 'brand_id', 'category_id', 'manufacturer_id', 'created_date', 'updated_date']
df_features = df_all_products[df_all_products.columns.difference(exclude_cols)]
# df_features['battery_power'] = df_features['battery_power'].fillna(0)
# df_features['clock_speed'] = df_features['clock_speed'].fillna(0)
df_features = df_features.fillna(0)
print("Dataframe feaatures:\n")
print("All colums: ", df_features.columns)
print("Number of columns: ", len(df_features.columns))
df_features.head()

Dataframe feaatures:

All colums:  Index(['battery_power', 'bluetooth', 'clock_speed', 'common_coef', 'compatible_devices', 'discount', 'entertain_coef', 'front_cam', 'functions', 'gaming_coef', 'imei_no', 'in_memory', 'label', 'model', 'n_cores', 'n_sim', 'other_specification', 'product_id', 'product_name', 'px_height', 'px_width', 'ram', 'refresh_rate', 'screen_height', 'screen_width', 'support_3g', 'support_4g', 'support_5g', 'touch_screen', 'unit_price', 'warranty', 'wifi'], dtype='object')
Number of columns:  32


Unnamed: 0,battery_power,bluetooth,clock_speed,common_coef,compatible_devices,discount,entertain_coef,front_cam,functions,gaming_coef,imei_no,in_memory,label,model,n_cores,n_sim,other_specification,product_id,product_name,px_height,px_width,ram,refresh_rate,screen_height,screen_width,support_3g,support_4g,support_5g,touch_screen,unit_price,warranty,wifi
0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,A234567890BCD34,0.0,1.0,Q123456,0.0,0.0,0,PD041020210001,Máy tính bảng OPPO đời mới,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
1,0.0,0,0.0,0.0,"Laptop, máy tính bảng SAMSUNG",0.0,0.0,0.0,Hỗ trợ nghe nhạc chất lượng cao,0.0,0,0.0,1.0,0,0.0,0.0,0,PD041020210002,Tay nghe Bluetooth,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
2,5000.0,True,2.0,1.0,0,0.0,0.6,13.0,0,0.3,ABCDE1234567867,256.0,1.0,A1234567,4.0,2.0,"<figure class=""table""><table><tbody><tr><td>Bl...",PD041020210003,Điện Thoại Vsmart Joy 4 - Hàng Chính Hãng,1080.0,2340.0,4.0,120.0,6.7,6.7,True,True,0,True,3059000.0,12,True
3,0.0,0,0.0,0.0,"Laptop, máy tính bảng SAMSUNG",0.0,0.0,0.0,Hỗ trợ nghe nhạc chất lượng cao,0.0,0,0.0,1.0,0,0.0,0.0,0,PD041020210004,Tay nghe Bluetooth 1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
4,0.0,0,0.0,0.0,0,0.12,0.0,0.0,0,0.0,ABCDE1234567864,0.0,0.0,A1234567,0.0,0.0,0,PD041020210005,Máy tính bảng OPPO 1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10000000.0,12,0


In [6]:
# Get the values of these text columns
# df_features[['compatible_devices', 'functions']].head()

# list_compatibles = df_features['compatible_devices'].tolist()
# list_compatibles = ['' if x == 0 else x for x in list_compatibles]
# print("List  of column values: ", list_compatibles)
# print("Length of compatible devices list: ", len(list_compatibles))

# vectorizer = TfidfVectorizer()
# vectorizer_Tf = vectorizer.fit_transform(list_compatibles)
# print("Vectorizer: ", vectorizer_Tf)

# dense_matrix = vectorizer_Tf.todense()
# print("\nDense Array:\n",  dense_matrix)
# df_features[['compatible_devices']].head(10)

In [7]:
# Print the shape of dense matrix
# print("Dense matrix's shape: ", dense_matrix.shape)
# list_compatible_scores=[]
# for row in dense_matrix:
#     representative_vector =  row.tolist()
#     print("Row: ", representative_vector[0])
#     print("Type: ", type(representative_vector[0]))
#     score = round(sum(representative_vector[0])/len(representative_vector[0]), 3) 
#     print("Average score: ", round(score, 3))
#     list_compatible_scores.append(score)
    
# print("List of avarage scores: ", list_compatible_scores)

In [8]:
df_features_non_text = df_features

# # Replace the text value by list of scores
# print("After replacing text value:\n")
# df_features_non_text['compatible_devices'] = np.array(list_compatible_scores)
# df_features_non_text.head(10)

In [9]:
def convert_str_to_avg_score(list_str):
    list_feature = ['' if x == 0 else x for x in list_str]
    vectorizer = TfidfVectorizer()
    vectorizer_Tf = vectorizer.fit_transform(list_feature)
    dense_matrix = vectorizer_Tf.todense()
    print("Dense matrix: \n", dense_matrix)
#     print("Vector to dense: ", vectorizer_Tf, " and its shape: ", vectorizer_Tf.shape)
    
    list_avg_scores = []
    for row in dense_matrix:
        representative_vector =  row.tolist()
        score =  round(sum(representative_vector[0])/ len(representative_vector[0]), 3)
        list_avg_scores.append(score)
        
    return list_avg_scores

In [10]:
list_compatibles = df_features['compatible_devices'].tolist()
list_compatibles = ['' if x == 0 else x for x in list_compatibles]

list_avg_compatible_scores = convert_str_to_avg_score(list_compatibles)
print("List of average scores of compatible: ", list_avg_compatible_scores)

list_functions = df_features['functions'].tolist()
list_functions = ['' if x == 0 else x for x in list_functions]
print("List of functions: ", list_functions)
list_avg_functions_score = convert_str_to_avg_score(list_functions)
print("List of average scores of functions: ", list_avg_functions_score)

df_features_non_text = df_features
print("Before replacing text value:\n")
df_features_non_text.head()

# Replace the text value by list of scores
print("After replacing text value:\n")
df_features_non_text['compatible_devices'] = np.array(list_avg_compatible_scores)
df_features_non_text['functions'] = np.array(list_avg_functions_score)

df_features_non_text.head(10)

Dense matrix: 
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.4472136  0.         0.4472136  0.
  0.         0.4472136  0.4472136  0.4472136  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.4472136  0.         0.4472136  0.
  0.         0.4472136  0.4472136  0.4472136  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.5        0.         0.         0.         0.         0.5
  0.5        0.         0.         0.         0.         0.5       ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.     

Unnamed: 0,battery_power,bluetooth,clock_speed,common_coef,compatible_devices,discount,entertain_coef,front_cam,functions,gaming_coef,imei_no,in_memory,label,model,n_cores,n_sim,other_specification,product_id,product_name,px_height,px_width,ram,refresh_rate,screen_height,screen_width,support_3g,support_4g,support_5g,touch_screen,unit_price,warranty,wifi
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A234567890BCD34,0.0,1.0,Q123456,0.0,0.0,0,PD041020210001,Máy tính bảng OPPO đời mới,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
1,0.0,0,0.0,0.0,0.186,0.0,0.0,0.0,0.203,0.0,0,0.0,1.0,0,0.0,0.0,0,PD041020210002,Tay nghe Bluetooth,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
2,5000.0,True,2.0,1.0,0.0,0.0,0.6,13.0,0.0,0.3,ABCDE1234567867,256.0,1.0,A1234567,4.0,2.0,"<figure class=""table""><table><tbody><tr><td>Bl...",PD041020210003,Điện Thoại Vsmart Joy 4 - Hàng Chính Hãng,1080.0,2340.0,4.0,120.0,6.7,6.7,True,True,0,True,3059000.0,12,True
3,0.0,0,0.0,0.0,0.186,0.0,0.0,0.0,0.203,0.0,0,0.0,1.0,0,0.0,0.0,0,PD041020210004,Tay nghe Bluetooth 1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
4,0.0,0,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,ABCDE1234567864,0.0,0.0,A1234567,0.0,0.0,0,PD041020210005,Máy tính bảng OPPO 1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10000000.0,12,0
5,0.0,0,0.0,0.0,0.167,0.003,0.0,0.0,0.188,0.0,0,0.0,1.0,0,0.0,0.0,0,PD081020210006,Tai nghe bluetooth hiện đại,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,50000.0,2,0
6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,445564760391019,0.0,1.0,A12,0.0,0.0,0,PD081020210007,Phone 23,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1000000.0,24,0
7,3500.0,True,1.2,0.0,0.0,0.0,0.0,14.0,0.0,0.0,449264287097621,256.0,1.0,iphone 12 Pro Max 256g,4.0,1.0,"<figure class=""table""><table><tbody><tr><td>Bă...",PD151020210001,Điện Thoại iPhone 12 Pro Max 256GB - Hàng Chín...,1284.0,2778.0,16.0,0.0,6.7,6.7,True,False,False,True,30990000.0,23,True
8,5000.0,True,2.0,1.0,0.0,0.0,0.7,16.0,0.0,0.4,998640240981396,128.0,1.0,A1234567,3.0,2.0,"<figure class=""table""><table><tbody><tr><td>Bl...",PD161020210001,Điện Thoại Oppo A54 - Hàng Chính Hãng,720.0,1600.0,6.0,124.0,6.5,6.5,True,False,False,True,4290000.0,15,True
9,0.0,0,0.0,0.1,0.144,0.0,0.5,0.0,0.188,0.4,0,0.0,2.0,0,0.0,0.0,"<figure class=""table""><table><tbody><tr><td>Th...",PD161020210008,Tai Nghe Nhét Tai JBL C150SI - Hàng Chính Hãng,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,203000.0,1,0


# KNN Algorithm

In [11]:
def mean(labels):
    return sum(labels) / len(labels)

def mode(labels):
    return mode(labels)
#     return Counter(labels).most_common(1)[0][0]

def euclidean_distance(point1, point2):
    sum_squared_distance = 0
    print("point 1: " , point1, " and its shape: " , len(point1))
    print("point 2: " , point2, " and its shape: " , len(point2))
    for i in range(len(point1)):
        sum_squared_distance += math.pow(point1[i] - point2[i], 2)
    return math.sqrt(sum_squared_distance)

In [12]:
def knn(data, query, k, distance_fn, choice_fn):
    neighbor_distances_and_indices = []
    
    # 3. For each example in the data
    for index, example in enumerate(data):
        # 3.1 Calculate the distance between the query example and the current
        # example from the data.
        distance = distance_fn(example[:-1], query)
        
        # 3.2 Add the distance and the index of the example to an ordered collection
        neighbor_distances_and_indices.append((distance, index))
    
    # 4. Sort the ordered collection of distances and indices from
    # smallest to largest (in ascending order) by the distances
    sorted_neighbor_distances_and_indices = sorted(neighbor_distances_and_indices)
    
    # 5. Pick the first K entries from the sorted collection
    k_nearest_distances_and_indices = sorted_neighbor_distances_and_indices[:k]
    
    # 6. Get the labels of the selected K entries
    k_nearest_labels = [data[i][-1] for distance, i in k_nearest_distances_and_indices]

    # 7. If regression (choice_fn = mean), return the average of the K labels
    # 8. If classification (choice_fn = mode), return the mode of the K labels
    return k_nearest_distances_and_indices , choice_fn(k_nearest_labels)

## Fed data into algorithm

In [13]:
exclude_cols_for_knn = ['imei_no', 'product_id', 'product_name', 'other_specification', 'model']
data = df_features_non_text[df_features_non_text.columns.difference(exclude_cols_for_knn)]
data.head(10)

Unnamed: 0,battery_power,bluetooth,clock_speed,common_coef,compatible_devices,discount,entertain_coef,front_cam,functions,gaming_coef,in_memory,label,n_cores,n_sim,px_height,px_width,ram,refresh_rate,screen_height,screen_width,support_3g,support_4g,support_5g,touch_screen,unit_price,warranty,wifi
0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
1,0.0,0,0.0,0.0,0.186,0.0,0.0,0.0,0.203,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
2,5000.0,True,2.0,1.0,0.0,0.0,0.6,13.0,0.0,0.3,256.0,1.0,4.0,2.0,1080.0,2340.0,4.0,120.0,6.7,6.7,True,True,0,True,3059000.0,12,True
3,0.0,0,0.0,0.0,0.186,0.0,0.0,0.0,0.203,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,2000000.0,24,0
4,0.0,0,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10000000.0,12,0
5,0.0,0,0.0,0.0,0.167,0.003,0.0,0.0,0.188,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,50000.0,2,0
6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1000000.0,24,0
7,3500.0,True,1.2,0.0,0.0,0.0,0.0,14.0,0.0,0.0,256.0,1.0,4.0,1.0,1284.0,2778.0,16.0,0.0,6.7,6.7,True,False,False,True,30990000.0,23,True
8,5000.0,True,2.0,1.0,0.0,0.0,0.7,16.0,0.0,0.4,128.0,1.0,3.0,2.0,720.0,1600.0,6.0,124.0,6.5,6.5,True,False,False,True,4290000.0,15,True
9,0.0,0,0.0,0.1,0.144,0.0,0.5,0.0,0.188,0.4,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,203000.0,1,0


In [14]:
data = data.values.tolist()
print("data length: ", len(data))

data length:  10


In [15]:
# query_item = [4000, True, 1.0,  0.8, 0.000, 0.000, 0.4, 8, 0.000, 0.3, 128, 1.0, 4.0, 1.0, 1200, 2400, 4.0, 120.0, 6.2, 6.2, True, True, False, True, 30100000.0, 12]
query_item = [0.0, True, 1.0,  0.8, 0.000, 0.000, 0.4, 8, 0.000, 0.3, 128, 1.0, 4.0, 1.0, 1200, 2400, 4.0, 120.0, 6.2, 6.2, True, True, False, True, 30100000.0, 12]
k_nearest_neighbors, _ = knn(
        data, query_item, k=5, distance_fn=euclidean_distance, choice_fn=lambda x: None
)

print("Nearest neighbors: ",  k_nearest_neighbors)

point 1:  [0.0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 2000000.0, 24]  and its shape:  26
point 2:  [4000, True, 1.0, 0.8, 0.0, 0.0, 0.4, 8, 0.0, 0.3, 128, 1.0, 4.0, 1.0, 1200, 2400, 4.0, 120.0, 6.2, 6.2, True, True, False, True, 30100000.0, 12]  and its shape:  26
point 1:  [0.0, 0, 0.0, 0.0, 0.186, 0.0, 0.0, 0.0, 0.203, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, 0, 2000000.0, 24]  and its shape:  26
point 2:  [4000, True, 1.0, 0.8, 0.0, 0.0, 0.4, 8, 0.0, 0.3, 128, 1.0, 4.0, 1.0, 1200, 2400, 4.0, 120.0, 6.2, 6.2, True, True, False, True, 30100000.0, 12]  and its shape:  26
point 1:  [5000.0, True, 2.0, 1.0, 0.0, 0.0, 0.6000000238418579, 13.0, 0.0, 0.30000001192092896, 256.0, 1.0, 4.0, 2.0, 1080.0, 2340.0, 4.0, 120.0, 6.699999809265137, 6.699999809265137, True, True, 0, True, 3059000.0, 12]  and its shape:  26
point 2:  [4000, True, 1.0, 0.8, 0.0, 0.0, 0.4, 8, 0.0, 0.3, 128, 1.0, 4.0, 1.0, 1200, 2400

In [16]:
list_name_info = df_all_products['product_name'].tolist()

print("Recommendations: \n")
for _, index in k_nearest_neighbors:
    print(list_name_info[index])

Recommendations: 

Điện Thoại iPhone 12 Pro Max 256GB - Hàng Chính Hãng
Máy tính bảng OPPO 1
Điện Thoại Oppo A54 - Hàng Chính Hãng
Điện Thoại Vsmart Joy 4 - Hàng Chính Hãng
Máy tính bảng OPPO đời mới
