In [21]:
import recordlinkage as rl
import pandas as pd
import os
import random as ran
from sklearn.model_selection import GridSearchCV

In [22]:
schema_path = "../schema_matching/csv/schema_final.csv"

In [23]:
schema = pd.read_csv(schema_path, index_col=0)
schema = schema.reset_index(drop=True) # Per evitare che gli indici siano letti come decimanli (i.e. 1.0 invece di 1)

  schema = pd.read_csv(schema_path, index_col=0)


# Preprocessamento

In [24]:
from recordlinkage.preprocessing import *

## Cleaning dei dati ...

In [25]:
schema_cleaned = schema # creo una copia per non modificare lo schema iniziale
schema_cleaned_path = "./csv/schema_cleaned.csv"

In [26]:
# Cleaning dei dati iterando per riga
for i in range(len(schema.index)):
    row = schema.iloc[i]
    row_series = row.squeeze() # BOO
    nome = pd.Series(row["name"])
    try:
        nome_cleaned = clean(nome, lowercase=True, replace_by_none='', replace_by_whitespace='\ {2,}', strip_accents='unicode', remove_brackets=False, encoding='utf-8', decode_error='strict')
    except:
        print(nome)
    # Si evita il cleaning di caratteri relative alle valute presenti nei datasets
    row_series_cleaned = clean(row_series, lowercase=True, replace_by_none=r'[^ \\.\\-\\_A-Za-z0-9$€£¥₩₽₹฿₪¤¢₺₱]+', replace_by_whitespace=r'[\\-\\_]', strip_accents="unicode", remove_brackets=False, encoding='utf-8', decode_error='strict')
    schema_cleaned.iloc[i] = row_series_cleaned
    schema_cleaned.iloc[i]["name"] = nome_cleaned[0]

In [27]:
schema_cleaned.to_csv(schema_cleaned_path, index_label="id")

# Occorrenze token per riga

In [28]:
# value_occurence(schema)

# Blocking

In [29]:
indexer = rl.Index()
indexer.block(left_on='name', right_on='name')
candidate_links = indexer.index(schema_cleaned)
print(candidate_links)

MultiIndex([( 14290,      0),
            ( 22400,      0),
            ( 22400,  14290),
            (111211,      0),
            (111211,  14290),
            (111211,  22400),
            (113157,      0),
            (113157,  14290),
            (113157,  22400),
            (113157, 111211),
            ...
            (186178, 156678),
            (186180, 156680),
            (186181, 156681),
            (186182, 156682),
            (186183, 156683),
            (186184, 156684),
            (186185, 156685),
            (186186, 156686),
            (186187, 156687),
            (184572, 184571)],
           length=538711)


# Prepare Labeling

In [30]:
keys = ['name', 'country', 'market cap', 'founded year', 'employees', 'industry', 'sector',
     'ceo', 'revenue', 'stock', 'share price', 'city', 'address', 'website']

In [31]:
n_sampled_matches = 30000
choices = ran.choices(candidate_links, k=n_sampled_matches)

lKeys = {key : f"l_{key}" for key in keys}
rKeys = {key : f"r_{key}" for key in keys}

idxKeys = ["id_1", "id_2"]

choices_column = [*idxKeys, *lKeys.values(), *rKeys.values()] # id_1, id_2, l_attr, r_attr
choices_df = pd.DataFrame(columns=choices_column) # 
for choice in choices:
    lRow = schema_cleaned.iloc[[choice[0]]]
    rRow = schema_cleaned.iloc[[choice[1]]]

    lRow = lRow.rename(columns=lKeys).reset_index(drop=True)
    rRow = rRow.rename(columns=rKeys).reset_index(drop=True)

    idxs = pd.DataFrame({idxKeys[0]: [choice[0]], idxKeys[1]: [choice[1]]})
    row = pd.concat([idxs, lRow, rRow], axis=1)
    choices_df = pd.concat([choices_df, row], axis=0)

# choices_df.reset_index(inplace=True)
choices_df.set_index(idxKeys)

Unnamed: 0_level_0,Unnamed: 1_level_0,l_name,l_country,l_market cap,l_founded year,l_employees,l_industry,l_sector,l_ceo,l_revenue,l_stock,...,r_employees,r_industry,r_sector,r_ceo,r_revenue,r_stock,r_share price,r_city,r_address,r_website
id_1,id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
137773,63498,bayer,germany,,,,,,,$52.1 b,,...,,,,,,,,,,
183211,95597,rio tinto,,,1873.0,,construction oil gas operations mining and che...,,,$63.5b,,...,,,,,,,$77.81,,,
115176,100221,tencent,,,,112771,,,ma huateng,,,...,112771,,,ma huateng,,,,,,httpswww.tencent.com
60569,21874,cellnex telecom,spain,$32.98 b,,,,,,,,...,,,,,,,$34.11,,,
115534,95457,simon property group,,,,3300,,,david simon,,,...,,,,,,,$125.08,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123578,72504,avid bioservices,,,,257,,,,,,...,,healthcare healthcare products biotechnology b...,,not found,119 million usd,,,,,
19858,4628,wafd bank,usa,$2.17 b,,,,,,,,...,,,,,,,$33.83,,,
163354,55882,xebec adsorption,canada,,,,,,,,,...,,,,,,,$0.38,,,
118954,20199,games workshop group,,,,2436,,,,,,...,,,,,,,$87.91,,,


In [32]:
choices_path = "./csv/choices.csv"
choices_df.to_csv(choices_path, index=None)

In [33]:
choices_path_gallo = "./csv/gallo.csv"
choices_path_gatto = "./csv/gatto.csv"
choices_path_moli = "./csv/moli.csv"

sampled_matches_per_person = n_sampled_matches//3

choices_df_gallo = choices_df.iloc[:sampled_matches_per_person]
print(len(choices_df_gallo))

choices_df_gatto = choices_df.iloc[sampled_matches_per_person:2*sampled_matches_per_person]
print(len(choices_df_gatto))

choices_df_moli = choices_df.iloc[2*sampled_matches_per_person:]
print(len(choices_df_moli))

choices_df_gallo.to_csv(choices_path_gallo, index=None)
choices_df_gatto.to_csv(choices_path_gatto, index=None)
choices_df_moli.to_csv(choices_path_moli, index=None)

10000
10000
10000


# Comparing

In [34]:
compare_cl = rl.Compare()
compare_cl.string("name", "name", threshold=0.85, label="name")
compare_cl.string("country", "country", label="country")
compare_cl.string("founded year", "founded year", label="founded year")
compare_cl.string("industry", "industry", label="industry")
compare_cl.string("sector", "sector", label="sector")
compare_cl.string("address", "address", label="address")
compare_cl.string("city", "city", label="city")
compare_cl.string("ceo", "ceo", label="ceo")
features = compare_cl.compute(candidate_links, schema, schema)

In [35]:
features

Unnamed: 0,Unnamed: 1,name,country,founded year,industry,sector,address,city,ceo
14290,0,1.0,1.0,1.0,0.0,1.0,0.0000,0.0,0.0
22400,0,1.0,1.0,1.0,0.0,0.0,0.0000,0.0,0.0
22400,14290,1.0,1.0,1.0,0.0,0.0,0.0000,0.0,0.0
111211,0,1.0,1.0,1.0,0.0,1.0,0.0000,0.0,0.0
111211,14290,1.0,1.0,1.0,0.0,1.0,0.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...
186184,156684,1.0,0.0,1.0,1.0,1.0,0.0000,0.0,0.0
186185,156685,1.0,0.0,1.0,1.0,1.0,0.0000,0.0,0.0
186186,156686,1.0,0.0,1.0,1.0,1.0,0.0000,0.0,0.0
186187,156687,1.0,0.0,1.0,1.0,1.0,0.0000,0.0,0.0


In [36]:
features.describe()

Unnamed: 0,name,country,founded year,industry,sector,address,city,ceo
count,538711.0,538711.0,538711.0,538711.0,538711.0,538711.0,538711.0,538711.0
mean,1.0,0.437252,0.0437,0.035877,0.043175,0.003701,0.004944,0.050348
std,0.0,0.488796,0.195233,0.177248,0.202808,0.059819,0.070109,0.20542
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
features.sum(axis=1).value_counts().sort_index(ascending=False)

6.000000       461
5.000000       287
4.733333         1
4.685714         1
4.457143         1
             ...  
1.017857         1
1.016667         1
1.016393         1
1.011820         1
1.000000    243056
Length: 5291, dtype: int64

In [38]:
record_linked = features[features.sum(axis=1) > 1]

In [39]:
print(record_linked)

               name  country  founded year  industry  sector  address  city  \
14290  0        1.0      1.0           1.0       0.0     1.0   0.0000   0.0   
22400  0        1.0      1.0           1.0       0.0     0.0   0.0000   0.0   
       14290    1.0      1.0           1.0       0.0     0.0   0.0000   0.0   
111211 0        1.0      1.0           1.0       0.0     1.0   0.0000   0.0   
       14290    1.0      1.0           1.0       0.0     1.0   0.0000   0.0   
...             ...      ...           ...       ...     ...      ...   ...   
186184 156684   1.0      0.0           1.0       1.0     1.0   0.0000   0.0   
186185 156685   1.0      0.0           1.0       1.0     1.0   0.0000   0.0   
186186 156686   1.0      0.0           1.0       1.0     1.0   0.0000   0.0   
186187 156687   1.0      0.0           1.0       1.0     1.0   0.0000   0.0   
184572 184571   1.0      0.0           0.0       0.0     0.0   0.1875   0.0   

               ceo  
14290  0       0.0  
22400  0 

# Caricamento dei dataset per classificazione

In [40]:
golden_path_gallo = "./csv/golden_links_gallo.csv"
golden_path_gatto = "./csv/golden_links_gatto.csv"
golden_path_moli = "./csv/golden_links_moli.csv"

golden_gallo = pd.read_csv(golden_path_gallo, index_col=None)
golden_gatto = pd.read_csv(golden_path_gatto, index_col=None)
golden_moli = pd.read_csv(golden_path_moli, index_col=None)

golden = pd.concat([golden_gallo, golden_gatto, golden_moli], axis=0)

golden_path = "./csv/golden_links.csv"

golden = golden.reset_index(drop=True)
golden.to_csv(golden_path, index=None)

FileNotFoundError: [Errno 2] No such file or directory: './csv/golden_links_gallo.csv'

In [None]:
golden = pd.read_csv(golden_path, index_col=[0,1])
golden_links = golden[golden["label"]==1].index

# Classification (Unsupervised)

In [None]:
ECM = rl.ECMClassifier(binarize=0.6)
result = ECM.fit_predict(record_linked)
print("Possible record linked size:", len(record_linked))
print("Record linked size:", len(result))
print(f"Removed {len(record_linked) - len(result)} possible links")

# Evaluation (Unsupervised)

### Confusion matrix

In [None]:
rl.confusion_matrix(golden_links, result)

### Precision

In [None]:
rl.precision(golden_links, result)

### Recall

In [None]:
rl.recall(golden_links, result)

### Accuracy

In [None]:
rl.accuracy(golden_links, result)

### F-score

In [None]:
rl.fscore(golden_links, result)

### Specificity

In [None]:
rl.specificity(golden_links, result)

# Classification (Supervised)

In [None]:
rlSvm = rl.SVMClassifier()
params = {}
gridSearch = GridSearchCV(estimator=rlSvm, param_grid=params)

# Evaluation (Supervised)

### Confusion matrix

In [None]:
rl.confusion_matrix(golden_links, )

### Precision

In [None]:
rl.precision(golden_links,)

### Recall

In [None]:
rl.recall(golden_links,)

### Accuracy

In [None]:
rl.accuracy(golden_links,)

### F-score

In [None]:
rl.fscore(golden_links,)

### Specificity

In [None]:
rl.specificity(golden_links,)