In [4]:
%cd ..
%matplotlib inline

C:\Users\usuario\Desktop\New_Work


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import unicodedata

In [31]:
from src.common_paths import get_data_path, get_output_path
from src.common_paths import get_data_path, get_output_path
from src.utilities import *

In [22]:
symbols_to_space = re.compile(u"[/\|\n(\; )|(\: )|( \()|(\) )|( \")|(\" )|( \')|(\' )]")
symbols_to_remove = re.compile(u"[\"\'\$\€\£\(\)\:\[\]\.\,\-]")
space_repetition = re.compile(u" {2,}")
key_words_to_remove = re.compile(u"gmbh")

def canonize_language(df, text_var):
    text_var_clean= text_var+'_clean'
    cleaned_var=df[text_var].apply(lambda x: re.sub(symbols_to_remove, "", x))
    cleaned_var=cleaned_var.apply(lambda x: x.lower())
    cleaned_var=cleaned_var.apply(lambda x: re.sub(key_words_to_remove, "", x))
    cleaned_var=cleaned_var.apply(lambda x: unicodedata.normalize('NFKD', x))
    cleaned_var=cleaned_var.apply(lambda x: re.sub(space_repetition, "", x))
    cleaned_var=cleaned_var.apply(lambda x: str.strip(x))
    cleaned_var=cleaned_var.apply(lambda x: re.sub(u"\s", "_", x))
    return cleaned_var


def match_checker(df1, df2, key, aux_how):
    df2["aux"] = 1
    aux_df = pd.merge(df1, df2, on = key, how = aux_how)
    aux_df["aux"] = aux_df.aux.fillna(0)
    num_ids_match= np.sum(aux_df.aux)
    print("{} var has {} unique values".format(key, len(np.unique(df1[key]))))
    print("{} var has {} unique values".format(key, len(np.unique(df2[key]))))
    print("Matches {} id rows of entities in profiles".format(num_ids_match))
    pass



def direct_matcher(df1, df2, key, aux_how):
    aux_df = pd.merge(df1, df2, on = key, how = aux_how)
    aux_df=aux_df[["id_x", "id_y"]]
    aux_df.rename(columns={"id_y": "id_profiles", "id_x": "id_entities"}, inplace=True)
    return(aux_df[["id_profiles", "id_entities"]])


def eval_fun(profile_ids, aux_vars_ids, matched_df):
    # The evaluation will count if all of the profiles have at least 1 entity
    # Global and local mean of % matching entitites with profile.
    gt_df = pd.read_csv(os.path.join(get_data_path(), "ground_truth.tsv"), 
                                names=aux_vars_ids,
                                encoding="utf-8", 
                                sep="\t") # .fillna({"text": "empty"})


    # First check: ¿At least all the profiles have one entity?
    at_least_all=len(np.unique(gt_df[aux_vars_ids[0]])) == len(np.unique(matched_df[aux_vars_ids[0]]))
    if at_least_all:
        print("1. All profiles have at least one entity")
    else:
        print("1. Not all profiles have one entity")
    # Second is to measure the percentages:
    res_df = pd.DataFrame() # DF with a tuple [a, b, c] where a= # of entitites assigned, b= # of real assigned, c= #matches 
    for i_id in profile_ids:
        entities_assigned=matched_df.loc[matched_df["id_profiles"]==i_id].id_entities.values
        entities_assigned = entities_assigned[~np.isnan(entities_assigned)]
        num_assigned= len(entities_assigned)
        real_entities_assigned=gt_df.loc[gt_df["id_profiles"]==i_id].id_entities.values
        real_entities_assigned = real_entities_assigned[~np.isnan(real_entities_assigned)]
        real_num_assigned=len(real_entities_assigned)
        num_matches=len(set(entities_assigned) & set(real_entities_assigned))
        per_match= round(num_matches/real_num_assigned, 2)*100
        aux_tuple=[i_id, num_assigned, real_num_assigned, per_match]
        aux_df=pd.DataFrame([aux_tuple], columns=["id_profile", "num_entities_matched", "real_entitites_matched", "per_match"])
        res_df=res_df.append(aux_df) 
    return(res_df)

In [23]:
entities_df = pd.read_csv(os.path.join(get_output_path(), "enti_data.csv"), 
                            encoding="utf-8", 
                            sep=";") # .fillna({"text": "empty"})

profiles_df = pd.read_csv(os.path.join(get_output_path(), "prof_data.csv"), 
                            encoding="utf-8", 
                            sep=";") # .fillna({"text": "empty"})

In [24]:
# Standarized ID company_name 
entities_df['company_name_clean']=canonize_language(df=entities_df, text_var="company_name")
profiles_df['company_name_clean']=canonize_language(df=profiles_df, text_var="company_name")

In [25]:
# How improves the join based on the cleaning variable:
print("-------- Before --------")
match_checker(df1=entities_df, df2=profiles_df, key="company_name", aux_how="right")
print("-------- Now --------")
match_checker(df1=entities_df, df2=profiles_df, key="company_name_clean", aux_how="right")

-------- Before --------
company_name var has 48311 unique values
company_name var has 10000 unique values
Matches 11013 id rows of entities in profiles
-------- Now --------
company_name_clean var has 48163 unique values
company_name_clean var has 9984 unique values
Matches 11296 id rows of entities in profiles


In [26]:
# There is only an improvement of 3% and its completely unsignificant.
round(((11296-11013)/11013), 2)

0.03

In [27]:
# Do the mathing and measure performance:
matched_df=direct_matcher(df1=entities_df, df2=profiles_df, key="company_name_clean", aux_how="right")
print("Profiles={}".format(len(np.unique(matched_df["id_profiles"]))))
print("Entities={}".format(len(np.unique(matched_df["id_entities"]))))
print("Dimension={}".format(matched_df.shape))

Profiles=10000
Entities=11265
Dimension=(11296, 2)


In [29]:
profile_ids=np.unique(profiles_df.id)
aux_vars_ids = ["id_profiles", "id_entities"] #First has to be the profiles id and the second the entitites id name
result_proto1_df=eval_fun(profile_ids, aux_vars_ids, matched_df)

1. All profiles have at least one entity


In [30]:
print("--Recall--")# From the true positives - how many positives
print("Mean average of entity match {} ".format(np.mean(result_proto1_df["per_match"])))
print("Median average of entity match {} ".format(np.median(result_proto1_df["per_match"])))
result_proto1_df.describe()

--Recall--
Mean average of entity match 60.1314 
Median average of entity match 100.0 


Unnamed: 0,id_profile,num_entities_matched,real_entitites_matched,per_match
count,10000.0,10000.0,10000.0,10000.0
mean,425235.2,0.7564,1.4799,60.1314
std,484765.0,1.243066,2.682781,47.897753
min,403.0,0.0,1.0,0.0
25%,66235.0,0.0,1.0,0.0
50%,191056.0,1.0,1.0,100.0
75%,804856.0,1.0,1.0,100.0
max,1868021.0,45.0,89.0,100.0


In [None]:
#Save matches
matched_df.to_csv(os.path.join(get_output_path(), "results_proto_1.csv"), sep=";", index=False)