In [1]:
from similarity_fn import*
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recordlinkage as rl 
from recordlinkage.index import Block
from recordlinkage.algorithms.distance import _haversine_distance
from recordlinkage.algorithms.numeric import _linear_sim
import inspect
from sklearn.feature_extraction.text import TfidfVectorizer

import re

In [2]:
#Reading Data
df_clean = pd.read_csv(r'C:\Users\Invitado\Documents\Python\DS_MASTER\City_Adam_Intern\df_red.csv')

#There was a duplicated entry in ListingId so delete
df_clean[df_clean.duplicated(subset=['ListingId']) == True]
df_clean.drop(132,axis=0,inplace=True)
df_clean[df_clean['ListingId']==10217568]
#Clean descriptions for tf-idf similarity
df_clean['Descrp_tfidf']= df_clean['Description'].map(clean_tfidf)

#To have tuples with LitingsId
df_tuple = df_clean.set_index('ListingId')

In [3]:
# To compare Harversine
df_clean['Lat_Lng'] =list(zip(df_clean.Lat, df_clean.Lng))

In [4]:
#Blocking of data , feature Neighborhood

indexer = rl.BlockIndex(on='Neighborhood')
candidate_links = indexer.index(df_tuple)

print ('Candidate links:',len(candidate_links))

Candidate links: 3470454


In [5]:
#Further blocking include longitude and latitude
compare_cl = rl.Compare()

compare_cl.exact('Neighborhood','Neighborhood', label='Neighborhood')
compare_cl.geo('Lat','Lng','Lat','Lng',method='linear',label='Distance')

features = compare_cl.compute(candidate_links, df_tuple)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood,Distance
ListingId_1,ListingId_2,Unnamed: 2_level_1,Unnamed: 3_level_1
10025641,10008030,1,0.000000
10027284,10008030,1,0.331520
10027284,10025641,1,0.000000
10043980,10008030,1,0.000000
10043980,10025641,1,0.972364
...,...,...,...
42112141,39267402,1,0.000000
49161472,30077792,1,0.000000
49161472,39267402,1,0.000000
49161472,42112141,1,0.000000


In [6]:
#Harversine formula in record linkgae uses km then we want to check at what values is 150 meters (Airbnb approx ccordinates range)
offset = 0.0
scale = 1.0
origin = 0.0
x= np.float64(0.15)
num_sim_alg = partial(_linear_sim, x,scale,offset,origin)
c = num_sim_alg()
print('Set threshold at this level',c)
#lines = inspect.getsource(_haversine_distance)
#print(lines)  

Set threshold at this level 0.925


In [7]:
#Select possible matches
matches_blocking = features[features['Distance'] > 0.925] # Threshold used 
pairs_tuples =list(matches_blocking.index) #list with all the possible matching pairs 150k
len(matches_blocking)

44337

In [None]:
#Create an matrix with tf-idf vectors 
corpus = list(df_clean['Descrp_tfidf'])
tfidfvectorizer = TfidfVectorizer(analyzer='word')
tfidfvectorizer.fit(corpus)
tfidf_train = tfidfvectorizer.transform(corpus)
tf_idf_matrix = tfidf_train.todense()

In [140]:
# Do not use all features , not links, etc
not_used_var = ['site','host_start_month', 'first_bookable_date','listing_url','Lat','Lng','Description','postalcode',
                'Picture_url','Host_thumbnail_url','Xl_picture_url', 'Picture_url1','Picture_url2', 
            'Picture_url3', 'Picture_url4', 'Picture_url5','Picture_url6']
images = ['Picture_url','Host_thumbnail_url','Xl_picture_url', 'Picture_url1','Picture_url2', 
            'Picture_url3', 'Picture_url4', 'Picture_url5','Picture_url6']
df_cleaned_1 = df_clean.loc[:, ~df_clean.columns.isin(not_used_var)]

def create_vector_embedding(df,tuples):
    
    #Create a dictionary with the types of every column, dont use listing as it will not be used
    types = { column: df[column].dtypes for column in df.columns[1:]}

    #All Similarity metrics that are gonn abe used  
    similarity_metrics={
        'str':[('jaro_winker',get_jaro_winker),('levenshtein_sim',get_levenshtein_sim), ('jaccard_sim',get_jaccard_sim), ('relaxed_jaccard_sim',get_relaxed_jaccard_sim), ('overlap_sim',get_overlap_sim), ('containment_sim',get_containment_sim)],
        'str_un':[('overlap_sim',get_overlap_sim)],
        'geo':[('geo',get_geo)],
        'numeric': [('binary',get_binary), ('numerical',get_numerical)],
        'numeric_un':[('binary',get_binary)],
        'descrip':[('LaBSE_cos',cos_similarity), ('tf_idf_cos',cos_similarity)],
        'postal' :[]
    }
    #Create lists that will be used for creating the dataframe
    columns_names=['ListingId_1','ListingId_2']
    features_row = []
    # There may be tuples (1,2) or (2,1) which are the same and shoul not be consider different so with this list we check for this 
    duplicates = []
    
    #Columns which are categorical or binary threfore it makes sense to only use 1 if the same 0 if not
    cat_str = ['Neighborhood','Room_type_category', 'Room_type', 'Space_type']
    bin_int = ['Instant_bookable','Is_new_listing','kpi_capacity_more_than_4','kpi_capacity_more_than_6','kpi_host_lists_multiple',
                'kpi_host_lists_10_or_more','kpi_instant_bookable','host_start_year','Property_type_id','Host_is_superhost',
                'Host_has_profile_pic']


    # Create the column names for the new daraframe
    for column,type in types.items():
        # (*) Check below for explanation the same just with the column names 
        if column == 'Lat_Lng':
            for similarity_fn in similarity_metrics['geo']:
                columns_names.append(column + '_'+ str(similarity_fn[0]))
        elif column == 'Descrp_tfidf':
            columns_names.append(column + '_'+ str(similarity_metrics['descrip'][1][0]))
        elif column == 'LaBSE_emb':
            columns_names.append(column + '_'+ str(similarity_metrics['descrip'][0][0]))
        elif type == object and column not in cat_str :
            for similarity_fn in similarity_metrics['str']:
                columns_names.append(column + '_'+ str(similarity_fn[0]))
        elif type == object and column in cat_str :
            for similarity_fn in similarity_metrics['str_un']:
                columns_names.append(column + '_'+ str(similarity_fn[0]))
        elif (type == np.float64 or type == np.int64) and column not in bin_int:
            for similarity_fn in similarity_metrics['numeric']:
                columns_names.append(column + '_'+ str(similarity_fn[0]))
        elif (type == np.float64 or type == np.int64) and column in bin_int:
            for similarity_fn in similarity_metrics['numeric_un']:
                columns_names.append(column + '_'+ str(similarity_fn[0]))
    
    
    #Go trough the listing in pool of possible matches 
    for listing_1,listing_2 in tuples:
    
        #Check if the listings have already been added (1,2) or (2,1) are the same
        unique_term = (listing_1*listing_2 + (listing_1+listing_2))
        if unique_term not in duplicates:
            duplicates.append(unique_term)

            #List for each row 
            listing_row= []
            listing_row.append(listing_1)
            listing_row.append(listing_2)

            #Make the entries into one array which has both values of a column  like [1,2]
            a= df[df['ListingId']==listing_1].values
            b= df[df['ListingId']==listing_2].values
            join= np.concatenate((a, b), axis=0).T
            
            # The step will help finding the desired values in the array 
            step =1
            
            #loop through the columns to create the array with the similarity score of each function
            for column,type in types.items():
                # (*) Column which is LatLng in format (4.566,78,9228) needs to be accesed for both lat and lng 
                if column == 'Lat_Lng':
                    for similarity_fn in similarity_metrics['geo']:
                        listing_row.append(similarity_fn[1](join[step][0][0],join[step][0][1],join[step][1][0],join[step][1][1]))
                elif column == 'Descrp_tfidf':
                    #Get the index of the listings, then retreive the array created by tfidf vectorizer
                    idx_1 = df_clean.index[df_clean['ListingId'] == join[0][0]][0]
                    idx_2 = df_clean.index[df_clean['ListingId'] == join[0][1]][0]
                    listing_1, listing_2 = torch.Tensor(tf_idf_matrix[idx_1]),torch.Tensor(tf_idf_matrix[idx_2])
                    listing_row.append(similarity_metrics['descrip'][1][1](listing_1,listing_2))
                elif column == 'LaBSE_emb':
                    listing_1 = transform_torch(join[step][0])
                    listing_2 = transform_torch(join[step][1])
                    listing_row.append(similarity_metrics['descrip'][0][1](listing_1,listing_2,norm=True))
                # Check if it is an str and not a categorical string 
                elif type == object and column not in cat_str :
                    for similarity_fn in similarity_metrics['str']:
                        listing_row.append(similarity_fn[1](join[step][0],join[step][1]))
                # Check if it is an str and a categorical string 
                elif type == object and column in cat_str :
                    for similarity_fn in similarity_metrics['str_un']:
                        listing_row.append(similarity_fn[1](join[step][0],join[step][1]))
                # Check if it is a number  and not a binary 
                elif (type == np.float64 or type == np.int64) and column not in bin_int :
                    for similarity_fn in similarity_metrics['numeric']:
                        listing_row.append(similarity_fn[1](join[step][0],join[step][1]))
                # Check if it is a number  and  a binary 
                elif (type == np.float64 or type == np.int64) and column in bin_int:
                    for similarity_fn in similarity_metrics['numeric_un']:
                        listing_row.append(similarity_fn[1](join[step][0],join[step][1]))
                        
                step +=1
            features_row.append(listing_row)
        
        break
    #Create DataFrame
    df_embeddings = pd.DataFrame(features_row, columns = columns_names)

    return df_embeddings

df_sim = create_vector_embedding(df_cleaned_1,pairs_tuples)

In [141]:
df_sim

Unnamed: 0,ListingId_1,ListingId_2,Name_jaro_winker,Name_levenshtein_sim,Name_jaccard_sim,Name_relaxed_jaccard_sim,Name_overlap_sim,Name_containment_sim,Bedrooms_binary,Bedrooms_numerical,...,Host_has_profile_pic_binary,Host_is_superhost_binary,Property_type_id_binary,Reviews_count_binary,Reviews_count_numerical,Star_rating_binary,Star_rating_numerical,LaBSE_emb_LaBSE_cos,Lat_Lng_geo,Descrp_tfidf_tf_idf_cos
0,10043980,10025641,0.976471,0.941176,0.714286,0.714286,0.0,0.833333,1.0,1.0,...,1.0,1.0,1.0,0.0,0.116162,1.0,1.0,0.936309,0.972364,0.988256


In [10]:
df_sim.to_csv('df_sim.csv',index=False)

In [11]:
df_sim.to_pickle('df_sim.pkl')

In [12]:
df = pd.read_pickle('df_sim.pkl')
df

Unnamed: 0,ListingId_1,ListingId_2,Name_jaro_winker,Name_levenshtein_sim,Name_jaccard_sim,Name_relaxed_jaccard_sim,Name_overlap_sim,Name_containment_sim,postalcode_overlap_sim,Bedrooms_binary,...,Picture_count_binary,Picture_count_numerical,Host_has_profile_pic_binary,Host_is_superhost_binary,Property_type_id_binary,Reviews_count_binary,Reviews_count_numerical,Star_rating_binary,Star_rating_numerical,Lat_Lng_geo
0,10161202,10161074,0.988235,0.970588,0.666667,1.000000,0.0,0.8,-1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.991550
1,10182011,10044434,0.988235,0.970588,0.666667,1.000000,0.0,0.8,-1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.964122
2,10182011,10161074,0.976471,0.941176,0.666667,0.666667,0.0,0.8,-1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.980465
3,10182011,10161202,0.976471,0.941176,0.666667,0.666667,0.0,0.8,-1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.972735
4,10182371,10182237,0.976471,0.941176,0.666667,0.666667,0.0,0.8,-1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.956225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150593,350271,30198278,0.365293,0.138889,0.000000,0.000000,0.0,0.0,1.0,0.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.970741
150594,43919608,350271,0.409594,0.083333,0.000000,0.000000,0.0,0.0,1.0,1.0,...,0.0,0.013390,1.0,1.0,0.0,0.0,0.0,0.0,0.393939,0.956785
150595,45237781,29271750,0.470947,0.163265,0.000000,0.000000,0.0,0.0,1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.986317
150596,45237781,30198278,0.493056,0.222222,0.000000,0.000000,0.0,0.0,1.0,1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,0.971189
