In [1]:
%cd ..
%matplotlib inline

C:\Users\usuario\Desktop\New_Work


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import unicodedata
import gensim
from scipy.spatial.distance import cdist

In [3]:
from src.common_paths import get_data_path, get_output_path
from src.utilities import *

In [48]:

def canonize_language_2(df, text_var):
    symbols_to_space = re.compile(u"[/\|\n(\; )|(\: )|( \()|(\) )|( \")|(\" )|( \')|(\' )]")
    symbols_to_remove = re.compile(u"[\"\'\$\€\£\(\)\:\[\]\.\,\-]\&")
    space_repetition = re.compile(u" {2,}")
    key_words_to_remove = re.compile(u"gmbh")
    cleaned_var=df[text_var].apply(lambda x: re.sub(symbols_to_remove, "", x))
    cleaned_var=df[text_var].apply(lambda x: re.sub("GmbH|gmbh|mbH|mbh|&|und|\-|an|der|co|Co|\.|\-", "", x))
    cleaned_var=cleaned_var.apply(lambda x: re.sub(space_repetition, "", x))
    cleaned_var=cleaned_var.apply(lambda x: str.strip(x))
    return cleaned_var


def load_german_model():
    return(gensim.models.KeyedVectors.load_word2vec_format(os.path.join(get_data_path(), "german.model") ,binary=True))

def put_space(aux_input): 
    words = re.findall('[A-Z][a-z]*', aux_input) 
    result = [] 
    for word in words: 
        word = chr( ord (word[0]) + 32) + word[1:] 
        result.append(word) 
    return(result)

def remove_word_elem(aux_input):
    result=[i_element for i_element in aux_input if len(i_element)>1]
    return(result)
    
def convert_list_to_string(org_list, seperator=' '):
    """ Convert list to string, by joining all item in list with given separator.
        Returns the concatenated string """
    return seperator.join(org_list)


# Generate annotation weight representation based on german pre-trained word2vec model 
def annotation_weight_representation(value_annotation):
    value_vectors = []
    count_model_included = 0
    count_model_nonincluded = 0
    idx_token_vectors = 0
    tags = value_annotation.split()
    word_vectors = np.empty(shape=(len(tags), 300))
    idx_word_vectors = 0
    for tag in tags:
        tag = tag.replace('_', ' ')
        if tag in model.vocab:
            word_vectors[idx_word_vectors] = model[tag]
            idx_word_vectors += 1
        else:
            tokens = tag.split()
            token_vectors = np.empty(shape=(len(tokens), 300))
            idx_token_vectors = 0
            for token in tokens:
                if token in model.vocab:
                    token_vectors[idx_token_vectors] = model[token]
                    idx_token_vectors += 1
                else:
                    continue
            if idx_token_vectors > 0:
                word_vectors[idx_word_vectors] = np.average(token_vectors[:idx_token_vectors], axis=0)
                idx_word_vectors += 1
    
    if idx_word_vectors != 0 or idx_token_vectors != 0:
        count_model_included += 1
        value_vectors.append(np.average(word_vectors[:idx_word_vectors], axis=0))
    else:
        count_model_nonincluded += 1
        value_vectors.append(np.nan)

    return value_vectors[0]


def eval_fun(profile_ids, aux_vars_ids, matched_df):
    # The evaluation will count if all of the profiles have at least 1 entity
    # Global and local mean of % matching entitites with profile.
    gt_df = pd.read_csv(os.path.join(get_data_path(), "ground_truth.tsv"), 
                                names=aux_vars_ids,
                                encoding="utf-8", 
                                sep="\t") # .fillna({"text": "empty"})


    # First check: ¿At least all the profiles have one entity?
    at_least_all=len(np.unique(gt_df[aux_vars_ids[0]])) == len(np.unique(matched_df[aux_vars_ids[0]]))
    if at_least_all:
        print("1. All profiles have at least one entity")
    else:
        print("1. Not all profiles have one entity")
    # Second is to measure the percentages:
    res_df = pd.DataFrame() # DF with a tuple [a, b, c] where a= # of entitites assigned, b= # of real assigned, c= #matches 
    for i_id in profile_ids:
        entities_assigned=matched_df.loc[matched_df["id_profiles"]==i_id].id_entities.values
        entities_assigned = entities_assigned[~np.isnan(np.array(entities_assigned, dtype=np.float64))]
        num_assigned= len(entities_assigned)
        real_entities_assigned=gt_df.loc[gt_df["id_profiles"]==i_id].id_entities.values
        real_entities_assigned = real_entities_assigned[~np.isnan(np.array(real_entities_assigned, dtype=np.float64))]
        real_num_assigned=len(real_entities_assigned)
        num_matches=len(set(entities_assigned) & set(real_entities_assigned))
        per_match= round(num_matches/real_num_assigned, 2)*100
        aux_tuple=[i_id, num_assigned, real_num_assigned, per_match]
        aux_df=pd.DataFrame([aux_tuple], columns=["id_profile", "num_entities_matched", "real_entitites_matched", "per_match"])
        res_df=res_df.append(aux_df) 
    return(res_df)


In [6]:
entities_df = pd.read_csv(os.path.join(get_output_path(), "enti_data.csv"), 
                            encoding="utf-8", 
                            sep=";") # .fillna({"text": "empty"})

profiles_df = pd.read_csv(os.path.join(get_output_path(), "prof_data.csv"), 
                            encoding="utf-8", 
                            sep=";") # .fillna({"text": "empty"})

In [73]:
#We have categorical variables as information to infer one profile to different entitites:
#- We could map categorical variables to one hot vectors for each variable but there are variables 
#  (like company_name) that has lot of unique values. And we would over dimension our problem to then apply
#  dimmensionality reduction , i.e PCA or SVD...
#- Will convert categories in pre-trained embeddings, using a german word2vec model 
#- Then summarize each profile and entity as the weighted avg of those embeddings.
#- Will measure similarity of each profile VS all entities.(Cosine or Mahalanobis distance)
#- Assign entities to that profile if similarity result > param_gamma within top X

In [7]:
#Create annotations column for each table
#Entitites
entities_df["annotations"]=canonize_language_2(df=entities_df, text_var="company_name")
entities_df["annotations"]=entities_df["annotations"].apply(lambda x: put_space(x))
entities_df["annotations"]=entities_df["annotations"].apply(lambda x: remove_word_elem(x))
entities_df["annotations"]=entities_df["annotations"].apply(lambda x: convert_list_to_string(x))
entities_df["annotations"]=entities_df["annotations"]+' '+entities_df["city"]+' '+entities_df["country"]+' '+entities_df["foundation_year_cat"]
#Profiles
profiles_df["annotations"]=canonize_language_2(df=profiles_df, text_var="company_name")
profiles_df["annotations"]=profiles_df["annotations"].apply(lambda x: put_space(x))
profiles_df["annotations"]=profiles_df["annotations"].apply(lambda x: remove_word_elem(x))
profiles_df["annotations"]=profiles_df["annotations"].apply(lambda x: convert_list_to_string(x))
profiles_df["annotations"]=profiles_df["annotations"]+' '+profiles_df["city"]+' '+profiles_df["country"]+' '+profiles_df["foundation_year_cat"]

In [8]:
#Word2vec based dimensionality reduction
model=load_german_model()
profiles_df['vector_rep'] = profiles_df['annotations'].apply(lambda x: annotation_weight_representation(x))
entities_df['vector_rep'] = entities_df['annotations'].apply(lambda x: annotation_weight_representation(x))
#profiles_df = profiles_df.dropna(subset=['vector_rep'], how='any')
#entities_df = entities_df.dropna(subset=['vector_rep'], how='any')

In [9]:
#In matrix format and adding 0 in case there would be NaN´s:
profiles_mtx = np.matrix(profiles_df['vector_rep'].tolist())
entities_mtx = np.matrix(entities_df['vector_rep'].tolist())
profiles_mtx[np.isnan(profiles_mtx)] = 0
entities_mtx[np.isnan(entities_mtx)] = 0

In [67]:
%%time 
# Calculate distance: [takes for the 10k profiles ~10minutes]
gamma= 0.60 #should be optimized (ie different trial and error)
n=10 #Top 5 to avoid fail matches
result=pd.DataFrame()
for i in np.arange(0, profiles_mtx.shape[0]):
    print(i)
    XA=profiles_mtx[i]
    cosine_distances=cdist(entities_mtx, XA, 'cosine')
    cosine_distances_df=pd.DataFrame(cosine_distances, columns=["distance"])
    aux_df=pd.merge(entities_df, cosine_distances_df, left_index=True, right_index=True)
    aux_df=aux_df[aux_df.distance>=gamma]
    aux_df=aux_df.sort_values(by='distance', ascending=False)
    assigned_entitites_id=list(aux_df.sort_values(by='distance', ascending=False)[0:n].id)
    aux_df=pd.DataFrame(assigned_entitites_id, columns=["id_entities"])
    aux_df["id_profiles"]=profiles_df.iloc[i].id
    result=result.append(aux_df)
    print("DONE")

0
DONE
1
DONE
2
DONE
3
DONE
4
DONE
5
DONE
6
DONE
7
DONE
8
DONE
9
DONE
10
DONE
11
DONE
12
DONE
13
DONE
14
DONE
15
DONE
16
DONE
17
DONE
18
DONE
19
DONE
20
DONE
21
DONE
22
DONE
23
DONE
24
DONE
25
DONE
26
DONE
27
DONE
28
DONE
29
DONE
30
DONE
31
DONE
32
DONE
33
DONE
34
DONE
35
DONE
36
DONE
37
DONE
38
DONE
39
DONE
40
DONE
41
DONE
42
DONE
43
DONE
44
DONE
45
DONE
46
DONE
47
DONE
48
DONE
49
DONE
50
DONE
51
DONE
52
DONE
53
DONE
54
DONE
55
DONE
56
DONE
57
DONE
58
DONE
59
DONE
60
DONE
61
DONE
62
DONE
63
DONE
64
DONE
65
DONE
66
DONE
67
DONE
68
DONE
69
DONE
70
DONE
71
DONE
72
DONE
73
DONE
74
DONE
75
DONE
76
DONE
77
DONE
78
DONE
79
DONE
80
DONE
81
DONE
82
DONE
83
DONE
84
DONE
85
DONE
86
DONE
87
DONE
88
DONE
89
DONE
90
DONE
91
DONE
92
DONE
93
DONE
94
DONE
95
DONE
96
DONE
97
DONE
98
DONE
99
DONE
100
DONE
101
DONE
102
DONE
103
DONE
104
DONE
105
DONE
106
DONE
107
DONE
108
DONE
109
DONE
110
DONE
111
DONE
112
DONE
113
DONE
114
DONE
115
DONE
116
DONE
117
DONE
118
DONE
119
DONE
120
DONE
121
DONE
122
DONE
123

DONE
924
DONE
925
DONE
926
DONE
927
DONE
928
DONE
929
DONE
930
DONE
931
DONE
932
DONE
933
DONE
934
DONE
935
DONE
936
DONE
937
DONE
938
DONE
939
DONE
940
DONE
941
DONE
942
DONE
943
DONE
944
DONE
945
DONE
946
DONE
947
DONE
948
DONE
949
DONE
950
DONE
951
DONE
952
DONE
953
DONE
954
DONE
955
DONE
956
DONE
957
DONE
958
DONE
959
DONE
960
DONE
961
DONE
962
DONE
963
DONE
964
DONE
965
DONE
966
DONE
967
DONE
968
DONE
969
DONE
970
DONE
971
DONE
972
DONE
973
DONE
974
DONE
975
DONE
976
DONE
977
DONE
978
DONE
979
DONE
980
DONE
981
DONE
982
DONE
983
DONE
984
DONE
985
DONE
986
DONE
987
DONE
988
DONE
989
DONE
990
DONE
991
DONE
992
DONE
993
DONE
994
DONE
995
DONE
996
DONE
997
DONE
998
DONE
999
DONE
1000
DONE
1001
DONE
1002
DONE
1003
DONE
1004
DONE
1005
DONE
1006
DONE
1007
DONE
1008
DONE
1009
DONE
1010
DONE
1011
DONE
1012
DONE
1013
DONE
1014
DONE
1015
DONE
1016
DONE
1017
DONE
1018
DONE
1019
DONE
1020
DONE
1021
DONE
1022
DONE
1023
DONE
1024
DONE
1025
DONE
1026
DONE
1027
DONE
1028
DONE
1029
DONE
1030
DONE
1

DONE
1753
DONE
1754
DONE
1755
DONE
1756
DONE
1757
DONE
1758
DONE
1759
DONE
1760
DONE
1761
DONE
1762
DONE
1763
DONE
1764
DONE
1765
DONE
1766
DONE
1767
DONE
1768
DONE
1769
DONE
1770
DONE
1771
DONE
1772
DONE
1773
DONE
1774
DONE
1775
DONE
1776
DONE
1777
DONE
1778
DONE
1779
DONE
1780
DONE
1781
DONE
1782
DONE
1783
DONE
1784
DONE
1785
DONE
1786
DONE
1787
DONE
1788
DONE
1789
DONE
1790
DONE
1791
DONE
1792
DONE
1793
DONE
1794
DONE
1795
DONE
1796
DONE
1797
DONE
1798
DONE
1799
DONE
1800
DONE
1801
DONE
1802
DONE
1803
DONE
1804
DONE
1805
DONE
1806
DONE
1807
DONE
1808
DONE
1809
DONE
1810
DONE
1811
DONE
1812
DONE
1813
DONE
1814
DONE
1815
DONE
1816
DONE
1817
DONE
1818
DONE
1819
DONE
1820
DONE
1821
DONE
1822
DONE
1823
DONE
1824
DONE
1825
DONE
1826
DONE
1827
DONE
1828
DONE
1829
DONE
1830
DONE
1831
DONE
1832
DONE
1833
DONE
1834
DONE
1835
DONE
1836
DONE
1837
DONE
1838
DONE
1839
DONE
1840
DONE
1841
DONE
1842
DONE
1843
DONE
1844
DONE
1845
DONE
1846
DONE
1847
DONE
1848
DONE
1849
DONE
1850
DONE
1851
DONE
1852


DONE
2574
DONE
2575
DONE
2576
DONE
2577
DONE
2578
DONE
2579
DONE
2580
DONE
2581
DONE
2582
DONE
2583
DONE
2584
DONE
2585
DONE
2586
DONE
2587
DONE
2588
DONE
2589
DONE
2590
DONE
2591
DONE
2592
DONE
2593
DONE
2594
DONE
2595
DONE
2596
DONE
2597
DONE
2598
DONE
2599
DONE
2600
DONE
2601
DONE
2602
DONE
2603
DONE
2604
DONE
2605
DONE
2606
DONE
2607
DONE
2608
DONE
2609
DONE
2610
DONE
2611
DONE
2612
DONE
2613
DONE
2614
DONE
2615
DONE
2616
DONE
2617
DONE
2618
DONE
2619
DONE
2620
DONE
2621
DONE
2622
DONE
2623
DONE
2624
DONE
2625
DONE
2626
DONE
2627
DONE
2628
DONE
2629
DONE
2630
DONE
2631
DONE
2632
DONE
2633
DONE
2634
DONE
2635
DONE
2636
DONE
2637
DONE
2638
DONE
2639
DONE
2640
DONE
2641
DONE
2642
DONE
2643
DONE
2644
DONE
2645
DONE
2646
DONE
2647
DONE
2648
DONE
2649
DONE
2650
DONE
2651
DONE
2652
DONE
2653
DONE
2654
DONE
2655
DONE
2656
DONE
2657
DONE
2658
DONE
2659
DONE
2660
DONE
2661
DONE
2662
DONE
2663
DONE
2664
DONE
2665
DONE
2666
DONE
2667
DONE
2668
DONE
2669
DONE
2670
DONE
2671
DONE
2672
DONE
2673


DONE
3394
DONE
3395
DONE
3396
DONE
3397
DONE
3398
DONE
3399
DONE
3400
DONE
3401
DONE
3402
DONE
3403
DONE
3404
DONE
3405
DONE
3406
DONE
3407
DONE
3408
DONE
3409
DONE
3410
DONE
3411
DONE
3412
DONE
3413
DONE
3414
DONE
3415
DONE
3416
DONE
3417
DONE
3418
DONE
3419
DONE
3420
DONE
3421
DONE
3422
DONE
3423
DONE
3424
DONE
3425
DONE
3426
DONE
3427
DONE
3428
DONE
3429
DONE
3430
DONE
3431
DONE
3432
DONE
3433
DONE
3434
DONE
3435
DONE
3436
DONE
3437
DONE
3438
DONE
3439
DONE
3440
DONE
3441
DONE
3442
DONE
3443
DONE
3444
DONE
3445
DONE
3446
DONE
3447
DONE
3448
DONE
3449
DONE
3450
DONE
3451
DONE
3452
DONE
3453
DONE
3454
DONE
3455
DONE
3456
DONE
3457
DONE
3458
DONE
3459
DONE
3460
DONE
3461
DONE
3462
DONE
3463
DONE
3464
DONE
3465
DONE
3466
DONE
3467
DONE
3468
DONE
3469
DONE
3470
DONE
3471
DONE
3472
DONE
3473
DONE
3474
DONE
3475
DONE
3476
DONE
3477
DONE
3478
DONE
3479
DONE
3480
DONE
3481
DONE
3482
DONE
3483
DONE
3484
DONE
3485
DONE
3486
DONE
3487
DONE
3488
DONE
3489
DONE
3490
DONE
3491
DONE
3492
DONE
3493


DONE
4214
DONE
4215
DONE
4216
DONE
4217
DONE
4218
DONE
4219
DONE
4220
DONE
4221
DONE
4222
DONE
4223
DONE
4224
DONE
4225
DONE
4226
DONE
4227
DONE
4228
DONE
4229
DONE
4230
DONE
4231
DONE
4232
DONE
4233
DONE
4234
DONE
4235
DONE
4236
DONE
4237
DONE
4238
DONE
4239
DONE
4240
DONE
4241
DONE
4242
DONE
4243
DONE
4244
DONE
4245
DONE
4246
DONE
4247
DONE
4248
DONE
4249
DONE
4250
DONE
4251
DONE
4252
DONE
4253
DONE
4254
DONE
4255
DONE
4256
DONE
4257
DONE
4258
DONE
4259
DONE
4260
DONE
4261
DONE
4262
DONE
4263
DONE
4264
DONE
4265
DONE
4266
DONE
4267
DONE
4268
DONE
4269
DONE
4270
DONE
4271
DONE
4272
DONE
4273
DONE
4274
DONE
4275
DONE
4276
DONE
4277
DONE
4278
DONE
4279
DONE
4280
DONE
4281
DONE
4282
DONE
4283
DONE
4284
DONE
4285
DONE
4286
DONE
4287
DONE
4288
DONE
4289
DONE
4290
DONE
4291
DONE
4292
DONE
4293
DONE
4294
DONE
4295
DONE
4296
DONE
4297
DONE
4298
DONE
4299
DONE
4300
DONE
4301
DONE
4302
DONE
4303
DONE
4304
DONE
4305
DONE
4306
DONE
4307
DONE
4308
DONE
4309
DONE
4310
DONE
4311
DONE
4312
DONE
4313


DONE
5035
DONE
5036
DONE
5037
DONE
5038
DONE
5039
DONE
5040
DONE
5041
DONE
5042
DONE
5043
DONE
5044
DONE
5045
DONE
5046
DONE
5047
DONE
5048
DONE
5049
DONE
5050
DONE
5051
DONE
5052
DONE
5053
DONE
5054
DONE
5055
DONE
5056
DONE
5057
DONE
5058
DONE
5059
DONE
5060
DONE
5061
DONE
5062
DONE
5063
DONE
5064
DONE
5065
DONE
5066
DONE
5067
DONE
5068
DONE
5069
DONE
5070
DONE
5071
DONE
5072
DONE
5073
DONE
5074
DONE
5075
DONE
5076
DONE
5077
DONE
5078
DONE
5079
DONE
5080
DONE
5081
DONE
5082
DONE
5083
DONE
5084
DONE
5085
DONE
5086
DONE
5087
DONE
5088
DONE
5089
DONE
5090
DONE
5091
DONE
5092
DONE
5093
DONE
5094
DONE
5095
DONE
5096
DONE
5097
DONE
5098
DONE
5099
DONE
5100
DONE
5101
DONE
5102
DONE
5103
DONE
5104
DONE
5105
DONE
5106
DONE
5107
DONE
5108
DONE
5109
DONE
5110
DONE
5111
DONE
5112
DONE
5113
DONE
5114
DONE
5115
DONE
5116
DONE
5117
DONE
5118
DONE
5119
DONE
5120
DONE
5121
DONE
5122
DONE
5123
DONE
5124
DONE
5125
DONE
5126
DONE
5127
DONE
5128
DONE
5129
DONE
5130
DONE
5131
DONE
5132
DONE
5133
DONE
5134


DONE
5855
DONE
5856
DONE
5857
DONE
5858
DONE
5859
DONE
5860
DONE
5861
DONE
5862
DONE
5863
DONE
5864
DONE
5865
DONE
5866
DONE
5867
DONE
5868
DONE
5869
DONE
5870
DONE
5871
DONE
5872
DONE
5873
DONE
5874
DONE
5875
DONE
5876
DONE
5877
DONE
5878
DONE
5879
DONE
5880
DONE
5881
DONE
5882
DONE
5883
DONE
5884
DONE
5885
DONE
5886
DONE
5887
DONE
5888
DONE
5889
DONE
5890
DONE
5891
DONE
5892
DONE
5893
DONE
5894
DONE
5895
DONE
5896
DONE
5897
DONE
5898
DONE
5899
DONE
5900
DONE
5901
DONE
5902
DONE
5903
DONE
5904
DONE
5905
DONE
5906
DONE
5907
DONE
5908
DONE
5909
DONE
5910
DONE
5911
DONE
5912
DONE
5913
DONE
5914
DONE
5915
DONE
5916
DONE
5917
DONE
5918
DONE
5919
DONE
5920
DONE
5921
DONE
5922
DONE
5923
DONE
5924
DONE
5925
DONE
5926
DONE
5927
DONE
5928
DONE
5929
DONE
5930
DONE
5931
DONE
5932
DONE
5933
DONE
5934
DONE
5935
DONE
5936
DONE
5937
DONE
5938
DONE
5939
DONE
5940
DONE
5941
DONE
5942
DONE
5943
DONE
5944
DONE
5945
DONE
5946
DONE
5947
DONE
5948
DONE
5949
DONE
5950
DONE
5951
DONE
5952
DONE
5953
DONE
5954


DONE
6676
DONE
6677
DONE
6678
DONE
6679
DONE
6680
DONE
6681
DONE
6682
DONE
6683
DONE
6684
DONE
6685
DONE
6686
DONE
6687
DONE
6688
DONE
6689
DONE
6690
DONE
6691
DONE
6692
DONE
6693
DONE
6694
DONE
6695
DONE
6696
DONE
6697
DONE
6698
DONE
6699
DONE
6700
DONE
6701
DONE
6702
DONE
6703
DONE
6704
DONE
6705
DONE
6706
DONE
6707
DONE
6708
DONE
6709
DONE
6710
DONE
6711
DONE
6712
DONE
6713
DONE
6714
DONE
6715
DONE
6716
DONE
6717
DONE
6718
DONE
6719
DONE
6720
DONE
6721
DONE
6722
DONE
6723
DONE
6724
DONE
6725
DONE
6726
DONE
6727
DONE
6728
DONE
6729
DONE
6730
DONE
6731
DONE
6732
DONE
6733
DONE
6734
DONE
6735
DONE
6736
DONE
6737
DONE
6738
DONE
6739
DONE
6740
DONE
6741
DONE
6742
DONE
6743
DONE
6744
DONE
6745
DONE
6746
DONE
6747
DONE
6748
DONE
6749
DONE
6750
DONE
6751
DONE
6752
DONE
6753
DONE
6754
DONE
6755
DONE
6756
DONE
6757
DONE
6758
DONE
6759
DONE
6760
DONE
6761
DONE
6762
DONE
6763
DONE
6764
DONE
6765
DONE
6766
DONE
6767
DONE
6768
DONE
6769
DONE
6770
DONE
6771
DONE
6772
DONE
6773
DONE
6774
DONE
6775


DONE
7496
DONE
7497
DONE
7498
DONE
7499
DONE
7500
DONE
7501
DONE
7502
DONE
7503
DONE
7504
DONE
7505
DONE
7506
DONE
7507
DONE
7508
DONE
7509
DONE
7510
DONE
7511
DONE
7512
DONE
7513
DONE
7514
DONE
7515
DONE
7516
DONE
7517
DONE
7518
DONE
7519
DONE
7520
DONE
7521
DONE
7522
DONE
7523
DONE
7524
DONE
7525
DONE
7526
DONE
7527
DONE
7528
DONE
7529
DONE
7530
DONE
7531
DONE
7532
DONE
7533
DONE
7534
DONE
7535
DONE
7536
DONE
7537
DONE
7538
DONE
7539
DONE
7540
DONE
7541
DONE
7542
DONE
7543
DONE
7544
DONE
7545
DONE
7546
DONE
7547
DONE
7548
DONE
7549
DONE
7550
DONE
7551
DONE
7552
DONE
7553
DONE
7554
DONE
7555
DONE
7556
DONE
7557
DONE
7558
DONE
7559
DONE
7560
DONE
7561
DONE
7562
DONE
7563
DONE
7564
DONE
7565
DONE
7566
DONE
7567
DONE
7568
DONE
7569
DONE
7570
DONE
7571
DONE
7572
DONE
7573
DONE
7574
DONE
7575
DONE
7576
DONE
7577
DONE
7578
DONE
7579
DONE
7580
DONE
7581
DONE
7582
DONE
7583
DONE
7584
DONE
7585
DONE
7586
DONE
7587
DONE
7588
DONE
7589
DONE
7590
DONE
7591
DONE
7592
DONE
7593
DONE
7594
DONE
7595


DONE
8318
DONE
8319
DONE
8320
DONE
8321
DONE
8322
DONE
8323
DONE
8324
DONE
8325
DONE
8326
DONE
8327
DONE
8328
DONE
8329
DONE
8330
DONE
8331
DONE
8332
DONE
8333
DONE
8334
DONE
8335
DONE
8336
DONE
8337
DONE
8338
DONE
8339
DONE
8340
DONE
8341
DONE
8342
DONE
8343
DONE
8344
DONE
8345
DONE
8346
DONE
8347
DONE
8348
DONE
8349
DONE
8350
DONE
8351
DONE
8352
DONE
8353
DONE
8354
DONE
8355
DONE
8356
DONE
8357
DONE
8358
DONE
8359
DONE
8360
DONE
8361
DONE
8362
DONE
8363
DONE
8364
DONE
8365
DONE
8366
DONE
8367
DONE
8368
DONE
8369
DONE
8370
DONE
8371
DONE
8372
DONE
8373
DONE
8374
DONE
8375
DONE
8376
DONE
8377
DONE
8378
DONE
8379
DONE
8380
DONE
8381
DONE
8382
DONE
8383
DONE
8384
DONE
8385
DONE
8386
DONE
8387
DONE
8388
DONE
8389
DONE
8390
DONE
8391
DONE
8392
DONE
8393
DONE
8394
DONE
8395
DONE
8396
DONE
8397
DONE
8398
DONE
8399
DONE
8400
DONE
8401
DONE
8402
DONE
8403
DONE
8404
DONE
8405
DONE
8406
DONE
8407
DONE
8408
DONE
8409
DONE
8410
DONE
8411
DONE
8412
DONE
8413
DONE
8414
DONE
8415
DONE
8416
DONE
8417


DONE
9141
DONE
9142
DONE
9143
DONE
9144
DONE
9145
DONE
9146
DONE
9147
DONE
9148
DONE
9149
DONE
9150
DONE
9151
DONE
9152
DONE
9153
DONE
9154
DONE
9155
DONE
9156
DONE
9157
DONE
9158
DONE
9159
DONE
9160
DONE
9161
DONE
9162
DONE
9163
DONE
9164
DONE
9165
DONE
9166
DONE
9167
DONE
9168
DONE
9169
DONE
9170
DONE
9171
DONE
9172
DONE
9173
DONE
9174
DONE
9175
DONE
9176
DONE
9177
DONE
9178
DONE
9179
DONE
9180
DONE
9181
DONE
9182
DONE
9183
DONE
9184
DONE
9185
DONE
9186
DONE
9187
DONE
9188
DONE
9189
DONE
9190
DONE
9191
DONE
9192
DONE
9193
DONE
9194
DONE
9195
DONE
9196
DONE
9197
DONE
9198
DONE
9199
DONE
9200
DONE
9201
DONE
9202
DONE
9203
DONE
9204
DONE
9205
DONE
9206
DONE
9207
DONE
9208
DONE
9209
DONE
9210
DONE
9211
DONE
9212
DONE
9213
DONE
9214
DONE
9215
DONE
9216
DONE
9217
DONE
9218
DONE
9219
DONE
9220
DONE
9221
DONE
9222
DONE
9223
DONE
9224
DONE
9225
DONE
9226
DONE
9227
DONE
9228
DONE
9229
DONE
9230
DONE
9231
DONE
9232
DONE
9233
DONE
9234
DONE
9235
DONE
9236
DONE
9237
DONE
9238
DONE
9239
DONE
9240


DONE
9964
DONE
9965
DONE
9966
DONE
9967
DONE
9968
DONE
9969
DONE
9970
DONE
9971
DONE
9972
DONE
9973
DONE
9974
DONE
9975
DONE
9976
DONE
9977
DONE
9978
DONE
9979
DONE
9980
DONE
9981
DONE
9982
DONE
9983
DONE
9984
DONE
9985
DONE
9986
DONE
9987
DONE
9988
DONE
9989
DONE
9990
DONE
9991
DONE
9992
DONE
9993
DONE
9994
DONE
9995
DONE
9996
DONE
9997
DONE
9998
DONE
9999
DONE
Wall time: 9min 53s


In [68]:
#Save assigments:
aux_vars_ids=["id_profiles", "id_entities"]
result=result[aux_vars_ids]
result.to_csv(os.path.join(get_output_path(), "results_proto_2.csv"), sep=";", index=False)

In [69]:
profile_ids=np.unique(profiles_df.id)
result_proto2_df=eval_fun(profile_ids, aux_vars_ids, result)
print("--Recall--")# From the true positives - how many positives
print("Mean average of entity match {} ".format(np.mean(result_proto2_df["per_match"])))
print("Median average of entity match {} ".format(np.median(result_proto2_df["per_match"])))

1. Not all profiles have one entity
--Recall--
Mean average of entity match 0.0126 
Median average of entity match 0.0 


In [70]:
result_proto2_df.describe()
#Probably we should take out the country and city variables (looks that they are biasing)

Unnamed: 0,id_profile,num_entities_matched,real_entitites_matched,per_match
count,10000.0,10000.0,10000.0,10000.0
mean,425235.2,9.8357,1.4799,0.0126
std,484765.0,1.075649,2.682781,0.737222
min,403.0,0.0,1.0,0.0
25%,66235.0,10.0,1.0,0.0
50%,191056.0,10.0,1.0,0.0
75%,804856.0,10.0,1.0,0.0
max,1868021.0,10.0,89.0,50.0


In [None]:
# ANOTHER VERSION w2V #
# # For the company_name is more tricky -there are thousands of categories. As an experiment, it could be interesting to 
# # load an already german word2vec trained and use it to convert the company names into embeddings.
# import pandas as pd
# import io
# import requests

# url="https://int-emb-word2vec-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt"

# s=requests.get(url).content
# c=pd.read_csv(io.StringIO(s.decode('utf-8')))