In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
import warnings
from sklearn.cluster import KMeans
from kneed import KneeLocator
import collections
import auxiliary_functions
import pprint
import json
import random
from itertools import combinations
from itertools import chain
from sklearn.preprocessing import MinMaxScaler
from statistics import mean 

In [2]:
data = pd.read_csv("./data_house/database.csv", sep = ',') 
column_names = data.columns
n = len(data.columns)
print("Dataset shape:", data.shape)

# Generate a random query   
m = random.randint(2,4)
df = data.sample(n = m, axis = 'columns').sample()
row = []
df_fake_queries = pd.DataFrame(index = range(1), columns = column_names)
df_fake_queries.drop(df_fake_queries.columns[df_fake_queries.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

for j in range(len(df.columns)):
    row.append(''.join((str(df.columns[j]),'=',str(df.iloc[0][j]))))
    df_fake_queries[str(df.columns[j])].iloc[0] = df.iloc[0][j]

print('Query: ', row)

Dataset shape: (10000, 13)
Query:  ['dist_city=7', 'nrooms=1']


In [3]:
query = df_fake_queries.dropna(axis = 1)
query_columns = query.columns
query_values = query.values[0]
random_query = df_fake_queries
random_query.insert(0,'query_id',10000)
random_query

Unnamed: 0,query_id,nrooms,nbedrooms,nbath,sm,garden_sm,floors,gargae_sm,price,year,windows,dist_city,doors
0,10000,1,,,,,,,,,,7,


In [4]:
# We have to check if the query already exists in our query database
queries_temp =  pd.read_csv("./data_house/queries_labels.csv", sep = ',', index_col = 0)
queries =  pd.read_csv("./data_house/queries_labels.csv", sep = ',')
resData = queries.merge(df_fake_queries, how = 'inner' , on=['nrooms','nbedrooms','nbath','sm','garden_sm','floors','gargae_sm','price','year','windows','dist_city','doors'])
if resData.empty:
    print('The query does not exist in the database')
    case = 1
else:
    print('The query already exists')
    case = 0

The query does not exist in the database


If the query exists in the dataframe, we can simply base the importance of that query in the ranking that we have provided in part A. In case it does not exist, we cna look for queries with common conditions and find the similarity between these found queries and the top 5 ranked queries for each user, ranked by themselves and not "by us" in part A.

In [5]:
user_queries =  pd.read_csv("./data_house/user_queries.csv", sep = ',')
user_queries_fill = pd.read_csv("./data_house/user_queries_fill.csv", sep = ',')

In [6]:
seq = user_queries_fill['user_id']

In [7]:
x = user_queries_fill.iloc[:,2:].values #returns a numpy array
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x.transpose())
user_queries_fill_minmax = pd.DataFrame(x_scaled.transpose()*100)

user_queries_fill_minmax.insert(0,'user_id',seq)

In [8]:
if case == 0:
    importance = pd.DataFrame(0, index = range(len(user_queries)), columns =['user_id','importance_value'])
    resData_idx = resData['query_id_x'].tolist()
    for i in range(len(user_queries)):
        rank = []
        for j in range(len(resData_idx)):
            rank.append(user_queries_fill_minmax.loc[i,str(j)])
            
        importance['user_id'].iloc[i] = user_queries['user_id'].iloc[i]
        importance['importance_value'].iloc[i] = np.mean(rank)

    print(importance[:5])

In [9]:
random_query

Unnamed: 0,query_id,nrooms,nbedrooms,nbath,sm,garden_sm,floors,gargae_sm,price,year,windows,dist_city,doors
0,10000,1,,,,,,,,,,7,


In [15]:
if case == 1:
    importance = pd.DataFrame(0, index = range(len(user_queries)), columns =['user_id','importance_value'])
    dict_query = {}
    # We create a dictionary with the possible combinations:
    comb, l = auxiliary_functions.combination(query_columns)
    for i in range(l):
        dict_query.update({str(comb[i]) : []} )
    length = len(query_columns)
    dict_query = auxiliary_functions.matching_queries(length, query_columns, query, dict_query, queries)
    
    # For the queries in the dictionary we create the query set to calculate similaritis later
    all_values = dict_query.values()
    index_values_dict = list(chain.from_iterable(list(all_values)))
    sim_queries = queries.iloc[index_values_dict]
    
    # We can calculate the similarity between the original query and the ones that share some conditions
    random_query_set = list(auxiliary_functions.queries_as_sets(random_query, filename='random_query.json').values())[0]
    print(random_query_set)
    similarity_value_query = pd.DataFrame(0, index = range(len(sim_queries)), columns = ['query_id', 'sim_value'])
    similarity_value_query['query_id'] = sim_queries['query_id'].tolist()
    similarity = []

    for query_id in sim_queries['query_id']:
        gvn_jsonfile = open("query_set.json")
        json_data = json.load(gvn_jsonfile)
        set_query = json_data[str(query_id)]
        
        similarity.append(auxiliary_functions.jaccard_similarity(random_query_set, set_query))

    similarity_value_query['sim_value'] = similarity
    
    imp = []
    for i in range(len(user_queries)):
        gvn_jsonfile = open("query_set.json")
        json_data = json.load(gvn_jsonfile)

        print("---------------user {}------------\n ".format(i+1))
        dict_cluster_sim = {}
        user_queries_non_nan = []
        user_queries_non_nan_rank = []
        user_queries_nan = []
        
        # We create lists containing the indexes of no ranked queries and ranked queries
        for t,j in user_queries.iloc[i][1:].items():           
            if (np.isnan(j)):
                user_queries_nan.append(t)
            else:
                user_queries_non_nan.append(t)
                user_queries_non_nan_rank.append(round(user_queries_fill_minmax.iloc[i][int(t)],2))
        n_nan_queries = len(user_queries_nan)
        
        # We look at the 5 highest ranked queries
        top_5_index_queries = auxiliary_functions.sort_by_indexes(user_queries_non_nan, user_queries_non_nan_rank, True)[:5]
        top_5_rank_queries = auxiliary_functions.find_highest_values(user_queries_non_nan_rank, ordered_nums_to_return=5)
        #print('Top 5 ranked queries: ', top_5_index_queries)
        #print('Rank of top 5 ranked queries (normalized): ',top_5_rank_queries )
                
        #print('Visited queries: ', len(user_queries_non_nan))
        #print('Not visited queries: ', len(user_queries_nan))

        # Create a dictionary
        for j in range(len(np.unique(queries['kmeans_label_id']))):
            dict_cluster_sim.update({str(np.unique(queries['kmeans_label_id'])[j]) : []})
        
        for k in range(len(sim_queries)):
            dict_cluster_sim[str(queries['kmeans_label_id'].iloc[k])].append(sim_queries['query_id'].iloc[k])

        index_top_sim = [0,0,0,0,0]
        value_top_sim = [0,0,0,0,0]
        sim_item_original = []
        for t, item in enumerate(top_5_index_queries):
            set_query_non_nan = json_data[str(item)]
            key = str(queries['kmeans_label_id'].iloc[int(item)]) 
            # We calculate  the similarity between the top ranked and the original query
            sim_item_original.append(round(auxiliary_functions.jaccard_similarity(set_query_non_nan, random_query_set),2))
            
            sim_value = 0
            for query_id in dict_cluster_sim[key]:
                gvn_jsonfile = open("query_set.json")
                json_data = json.load(gvn_jsonfile)
                set_query = json_data[str(query_id)]
                similarity_value = auxiliary_functions.jaccard_similarity(set_query_non_nan, set_query)
                
                if similarity_value > sim_value:
                    sim_value = round(similarity_value,2)
                    value_top_sim[t] = sim_value
                    index_top_sim[t] = query_id

    
        #print('Most similar queries to each of the top ranked: ', index_top_sim)
        #print('Similarity of most similar queries to each of the top ranked: ', value_top_sim)
        #print('Similarity of top ranked by user and original query: ', sim_item_original)
        
        similarity_query_random = []
        for l in range(len(index_top_sim)):
            similarity_query_random.append(round(similarity_value_query['sim_value'][similarity_value_query['query_id'] ==index_top_sim[l]].tolist()[0],2))
            
        print('similarity of most similar wrt original query: ', similarity_query_random)
        
        weights = [round(i*j,2) for i,j in zip(similarity_query_random, value_top_sim)]
        print(weights)
        
        importance_user = mean([i*j for i,j in zip(top_5_rank_queries, weights)])
        imp.append(importance_user)
        #print('imortance of the query: ', importance_user)
    
    importance['user_id'] = list(user_queries['user_id'])
    importance['importance_value'] = imp
    

Case 2: up to 2 common value
['nrooms', 'dist_city', 'dist_city', 'dist_city', 'dist_city', 'dist_city', 'dist_city', 'dist_city']


  idx = list(queries[queries[str(query_columns[0])] == query.iloc[0,0]][queries[str(query_columns[1])] == query.iloc[0,1]].index)


---------------user 1------------
 
Top 5 ranked queries:  ['18', '481', '629', '647', '885']
Rank of top 5 ranked queries (normalized):  [100.0, 98.55, 97.1, 95.65, 94.2]
Most similar queries to each of the top ranked:  [1258, 667, 1237, 441, 1352]
Similarity of most similar queries to each of the top ranked:  [0.67, 0.46, 0.9, 0.38, 0.43]
Similarity of top ranked by user and original query:  [0.14, 0.02, 0.0, 0.0, 0.0]
similarity of most similar wrt original query:  [0.02, 0.02, 0.04, 0.57, 0.07]
[0.01, 0.01, 0.04, 0.22, 0.03]
---------------user 2------------
 
Top 5 ranked queries:  ['26', '169', '206', '324', '388']
Rank of top 5 ranked queries (normalized):  [100.0, 97.44, 94.87, 92.31, 89.74]
Most similar queries to each of the top ranked:  [984, 584, 984, 1338, 584]
Similarity of most similar queries to each of the top ranked:  [0.36, 0.28, 0.5, 0.78, 0.44]
Similarity of top ranked by user and original query:  [0.0, 0.0, 0.0, 0.0, 0.0]
similarity of most similar wrt original qu

In [17]:
importance

Unnamed: 0,user_id,importance_value
0,158-86-3113,5.94770
1,241-10-5564,14.63620
2,338-27-8772,1.31282
3,495-50-0640,4.68204
4,710-35-1509,9.28992
...,...,...
95,869-47-5400,3.06156
96,899-29-8267,6.08780
97,370-58-6360,1.14872
98,757-91-9133,4.05008


We can base the importance or relevance of a query in different aspects:
<ul>
    <li>  The ranking of queries with similar value of Jaccard similarity</li>
    <li> The ranking of queries that share some of the condition and their values </li>
    <li> The number of queries that were already posed by the user and have similar conditions </li> 
</ul>