In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import warnings
from sklearn.cluster import KMeans
from kneed import KneeLocator
import collections
import auxiliary_functions
import pprint
import json
import random
from itertools import combinations
from itertools import chain

In [2]:
data = pd.read_csv("./data_house/database.csv", sep = ',') 
column_names = data.columns
n = len(data.columns)
print("Dataset shape:", data.shape)

# Generate a random query   
m = random.randint(1,4)
df = data.sample(n = m, axis = 'columns').sample()
row = []
df_fake_queries = pd.DataFrame(index = range(1), columns = column_names)
df_fake_queries.drop(df_fake_queries.columns[df_fake_queries.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

for j in range(len(df.columns)):
    row.append(''.join((str(df.columns[j]),'=',str(df.iloc[0][j]))))
    df_fake_queries[str(df.columns[j])].iloc[0] = df.iloc[0][j]

print('Query: ', row)
print('Dataframe of query: ', df_fake_queries)

Dataset shape: (10000, 13)
Query:  ['price=23', 'windows=13']
Dataframe of query:    nrooms nbedrooms nbath   sm garden_sm floors gargae_sm price year windows  \
0    NaN       NaN   NaN  NaN       NaN    NaN       NaN    23  NaN      13   

  dist_city doors  
0       NaN   NaN  


In [3]:
query = df_fake_queries.dropna(axis = 1)
query_columns = query.columns
query_values = query.values[0]
print(query)

  price windows
0    23      13


In [21]:
# We have to check if the query already exists in our query database
queries =  pd.read_csv("./data_house/queries_labels.csv", sep = ',', index_col = 0)

In [5]:
resData = queries.merge(df_fake_queries, how = 'inner' ,indicator=False)
print("Common rows between two DataFrames...\n",resData)

Common rows between two DataFrames...
 Empty DataFrame
Columns: [query_id, nrooms, nbedrooms, nbath, sm, garden_sm, floors, gargae_sm, price, year, windows, dist_city, doors]
Index: []


In [6]:
comb, length = auxiliary_functions.combination(query_columns)
comb, length

(['price', 'windows', ('price', 'windows')], 3)

In [7]:
dict_query = {}
# We create a dictionary with the possible combinations:
comb, l = auxiliary_functions.combination(query_columns)
for i in range(l):
    dict_query.update({str(comb[i]) : []} )
print(dict_query)   
        

{'price': [], 'windows': [], "('price', 'windows')": []}


In [8]:
# We can look for queries that share some of the values
# We create a dictionary to update the repeated values:
length = len(query_columns)
dict_query = auxiliary_functions.matching_queries(length, query_columns, query, dict_query, queries)

Case 2: up to 2 common value
Dictionary:  {'price': [15, 687, 796, 1094, 1415], 'windows': [58, 70, 74, 390, 553, 765, 888, 1053, 1059, 1111, 1302, 1343, 1362, 1537, 1564, 1800, 1899], "('price', 'windows')": []}


  idx = list(queries[queries[str(query_columns[0])] == query.iloc[0,0]][queries[str(query_columns[1])] == query.iloc[0,1]].index)


In [9]:
# For the queries in the dictionary we create the query set to calculate similaritis later
all_values = dict_query.values()
index_values_dict = list(chain.from_iterable(list(all_values)))
sim_queries = queries.iloc[index_values_dict]
sim_queries

Unnamed: 0,query_id,nrooms,nbedrooms,nbath,sm,garden_sm,floors,gargae_sm,price,year,windows,dist_city,doors
15,15,3.0,,,,,,4.0,23.0,,,,2.0
687,687,,,,,,,,23.0,,,3.0,10.0
796,796,,,2.0,,,,,23.0,,,,2.0
1094,1094,5.0,,,,,3.0,,23.0,16.0,,,
1415,1415,,,,,,,5.0,23.0,16.0,,,
58,58,,,,,4.0,1.0,,6.0,,13.0,,
70,70,,,2.0,,,1.0,,,,13.0,,
74,74,,1.0,,,,,,,,13.0,,
390,390,,3.0,,,,2.0,,,,13.0,,
553,553,,,,,4.0,,,,,13.0,,


In [10]:
dict_queries_sim = auxiliary_functions.queries_as_sets(sim_queries, 'partb_query_set.json')

We can base the importance or relevance of a query in different aspects:
<ul>
    <li>  The ranking of queries with similar value of Jaccard similarity</li>
    <li> The ranking of queries that share some of the condition and their values </li>
    <li> The number of queries that were already posed by the user and have similar conditions </li> 
</ul>

We are now going to try for a certain user, a random one

In [11]:
user_queries =  pd.read_csv("./data_house/user_queries.csv", sep = ',')
user = user_queries.sample(n = 1)
print(user)

        user_id   0     1   2     3   4     5     6     7   8  ...  1990  \
48  773-15-6228 NaN  12.0 NaN  31.0 NaN  92.0  75.0  37.0 NaN  ...  70.0   

    1991  1992  1993  1994  1995  1996  1997  1998  1999  
48  92.0   NaN  23.0   NaN   NaN   NaN   2.0  11.0   NaN  

[1 rows x 2001 columns]


In [12]:
# We look at the originally ranked queries by the user
df_user_no_nan = user.dropna(axis = 1)
user_no_nan = list(df_user_no_nan.columns[1:])
user_nan = list(user.columns[user.isna().any()])
print('Visited queries: ', len(user_no_nan))
print('Not visited queries: ', len(user_nan))

Visited queries:  732
Not visited queries:  1268


In [13]:
similar_visited_queries = []
similar_no_visited_queries = []
for i in range(len(user_no_nan)):
    if user_no_nan[i] in index_values_dict:
        similar_visited_queries.append(user_no_nan[i])

for i in range(len(user_nan)):
    if user_nan[i] in index_values_dict:
        similar_no_visited_queries.append(user_nan[i])   

print(similar_no_visited_queries)  
print(similar_visited_queries)

[]
[]


In [None]:
#completre

In [23]:
gvn_jsonfile = open("./partb_query_set.json")
json_data = json.load(gvn_jsonfile)

gvn_jsonfile = open("./data_house/query_set.json")
json_data_queries = json.load(gvn_jsonfile)

gvn_jsonfile = open("./dict_cluster.json")
dict_cluster = json.load(gvn_jsonfile)


recomendations_index = pd.DataFrame(0, index = range(len(user_queries)), columns =['user_id','top1', 'top2', 'top3', 'top4', 'top5'])
recomendations_value = pd.DataFrame(0, index = range(len(user_queries)), columns =['user_id','top1', 'top2', 'top3', 'top4', 'top5'])
 
 
for item in user_no_nan:
    key = str(queries['kmeans_label_id'].iloc[int(item)])
    set_query = json_data_queries[str(item)]
    
    key = str(queries['kmeans_label_id'].iloc[int(item)])
    similarity = []
    index_top_3 = [0,0,0]
    value_top_3 =[0,0,0]
    
    for i in range(2):
        if sim_queries['query_id'].iloc[i] in dict_cluster[key]:
            set_query_sim = json_data_queries[str(sim_queries['query_id'].iloc[i])]
            similarity_value = auxiliary_functions.jaccard_similarity(set_query_sim, set_query)
            
            if similarity_value > min(value_top_3):
                min_index = value_top_3.index(min(value_top_3))
                index_top_3[min_index] = int(sim_queries['query_id'].iloc[i])
                value_top_3[min_index] = similarity_value 
        
