In [None]:
import recordlinkage as rl
import pandas as pd
import os
import random as ran
from IPython.display import clear_output


In [None]:
# Path (relative to the notebook's working directory) of the CSV that is loaded as `schema`.
schema_path = "./schema_matching/csv/schema_final.csv"

In [None]:
# Read the schema using the first CSV column as index, then immediately swap that index for a
# fresh 0..n-1 range so the row labels are plain integers (e.g. 1 instead of 1.0).
schema = pd.read_csv(schema_path, index_col=0).reset_index(drop=True)

# Preprocessamento

In [None]:
from recordlinkage.preprocessing import *

## Cleaning dei dati ...

In [None]:
# Independent copy of the loaded schema: a plain assignment would only create a second name
# for the same DataFrame, so every write to `schema_cleaned` would also overwrite `schema`.
schema_cleaned = schema.copy()

In [None]:
# Clean the schema one row at a time, writing the result into `schema_cleaned`.
# Positional column of "name", needed to write a single cell with `.iloc[row, col]`.
name_pos = schema_cleaned.columns.get_loc("name")

for i in range(len(schema.index)):
    row = schema.iloc[i]

    # Generic cleaning applied to every attribute of the row: lowercase, delete every
    # character outside the allowed set, then turn '-' and '_' into spaces.
    # Currency symbols are kept on purpose because they occur in the datasets.
    # The patterns use single backslashes: inside a raw string a doubled backslash followed
    # by '-' forms a backslash-to-backslash range, which silently leaves '-' out of the set.
    row_cleaned = clean(
        row,
        lowercase=True,
        replace_by_none=r'[^ .\-_A-Za-z0-9$€£¥₩₽₹฿₪¤¢₺₱]+',
        replace_by_whitespace=r'[\-_]',
        strip_accents=None,
        remove_brackets=False,
        encoding='utf-8',
        decode_error='strict',
    )
    schema_cleaned.iloc[i] = row_cleaned

    # The name gets a lighter treatment (lowercase + accent stripping, no character removal)
    # and then overrides the generically cleaned value written just above.
    nome = pd.Series(row["name"])
    try:
        nome_cleaned = clean(
            nome,
            lowercase=True,
            replace_by_none='',
            replace_by_whitespace='',
            strip_accents='unicode',
            remove_brackets=False,
            encoding='utf-8',
            decode_error='strict',
        )
    except Exception:
        # Best effort: report the offending value and keep the generically cleaned name,
        # instead of falling through to an undefined / previous-iteration `nome_cleaned`.
        print(nome)
        continue

    # Single-step positional assignment: `schema_cleaned.iloc[i]["name"] = ...` would write
    # into a temporary row object and leave the DataFrame unchanged.
    schema_cleaned.iloc[i, name_pos] = nome_cleaned[0]

schema_cleaned

# Occorrenze token per riga

In [None]:
# value_occurence(schema)

# Blocking

In [None]:
# Blocking: only records whose cleaned "name" is identical become candidate pairs.
indexer = rl.Index()
indexer.block(left_on='name', right_on='name')

# A single DataFrame is indexed against itself (deduplication setting).
candidate_links = indexer.index(schema_cleaned)
print(candidate_links)

# Labeling

In [None]:
# Attributes of each record that are shown (left vs right) when a candidate pair is labelled.
keys = [
    'name', 'country', 'market cap', 'founded year', 'employees', 'industry', 'sector',
    'ceo', 'revenue', 'stock', 'share price', 'city', 'address', 'website',
]


def clear():
    """Run the shell's `clear` command and return its exit status."""
    return os.system('clear')

In [None]:
n_match = 500

# Draw the pairs to label WITHOUT replacement: a pair proposed twice wastes a manual label and
# produces a non-unique (id_1, id_2) index, which breaks the later removal of labelled pairs.
# The sample size is capped at the number of available pairs, so an empty or small
# `candidate_links` no longer raises. Positions are sampled first to avoid copying every pair.
# NOTE(review): no seed is set, so each run proposes a different sample.
sampled_positions = ran.sample(range(len(candidate_links)), k=min(n_match, len(candidate_links)))
choices = [candidate_links[pos] for pos in sampled_positions]

lKeys = {key: f"l_{key}" for key in keys}
rKeys = {key: f"r_{key}" for key in keys}

idxKeys = ["id_1", "id_2"]

choices_column = [*idxKeys, *lKeys.values(), *rKeys.values()]  # id_1, id_2, l_attr, r_attr

# Build the whole table in one pass instead of concatenating one row per iteration
# (which copies the growing frame every time).
idxs = pd.DataFrame(choices, columns=idxKeys)

# Restrict to the attributes in `keys`, in that order; a key missing from the schema becomes
# an all-NaN column and any extra schema column is left out, so the l_/r_ names never clash.
attributes = schema_cleaned.reindex(columns=keys)

# Positional lookup, as before: the schema index is a 0..n-1 range, so the ids stored in
# `candidate_links` are also row positions.
lRows = (
    attributes.iloc[idxs[idxKeys[0]].to_numpy(dtype=int)]
    .rename(columns=lKeys)
    .reset_index(drop=True)
)
rRows = (
    attributes.iloc[idxs[idxKeys[1]].to_numpy(dtype=int)]
    .rename(columns=rKeys)
    .reset_index(drop=True)
)

choices_df = pd.concat([idxs, lRows, rRows], axis=1)

# Display only: the result is not assigned, `choices_df` keeps id_1/id_2 as ordinary columns
# (the CSV round-trip that follows relies on them being the first two columns).
choices_df.set_index(idxKeys)

In [None]:
# File holding the sampled pairs that still have to be labelled.
choices_path = "./schema_matching/csv/choices.csv"

# The row index carries no information here, so it is not written out.
choices_df.to_csv(choices_path, index=False)


In [None]:
# Reload the pairs to label from disk; the first two CSV columns become a two-level row index.
choices_df = pd.read_csv(choices_path, index_col=[0, 1])
# Bare expression: shows the reloaded table as the cell output.
choices_df

In [None]:
# File accumulating the manually labelled pairs.
golden_path = "./schema_matching/csv/golden_links.csv"

try:
    golden = pd.read_csv(golden_path, index_col=None)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run (no file yet, or an empty one): start from an empty table with the expected
    # columns. Any other failure (permissions, malformed CSV, ...) is deliberately not
    # swallowed, so existing labels are never silently replaced by an empty table.
    # id_1/id_2 stay ordinary columns, matching the layout read from the CSV above.
    golden = pd.DataFrame(columns=[*choices_column, "label"])

In [None]:
# Bare expression: shows the labelled pairs loaded so far as the cell output.
golden

In [None]:
# Interactive labelling: show each pending pair and record the answer typed by the user.
left_cols = list(lKeys.values())
right_cols = list(rKeys.values())
labelled_rows = []   # one single-row DataFrame (ids + attributes + label) per answered pair
labelled_pairs = []  # (id_1, id_2) index labels of the answered pairs

try:
    # Iterate over a snapshot so that `choices_df` is never modified while being traversed.
    for idxs, choice in list(choices_df.iterrows()):
        print("I due record appartengono alla stessa entità?")
        print("y -> sì;  n -> no;  q -> esci")
        choice = choice.to_frame().T.reset_index(drop=True)

        # pretty-print for jupyter-notebook
        display(choice[left_cols])
        display(choice[right_cols])

        # Keep asking until one of the three accepted answers is typed.
        inp = input()
        while inp not in ("y", "n", "q"):
            inp = input()

        if inp == 'q':
            break

        # Row to append to the golden set: the two record ids, the displayed attributes and
        # the label (1 = same entity, 0 = different entities).
        idx = pd.DataFrame({idxKeys[0]: [idxs[0]], idxKeys[1]: [idxs[1]]})
        label = pd.DataFrame({"label": [int(inp == "y")]})
        labelled_rows.append(pd.concat([idx, choice, label], axis=1))
        labelled_pairs.append(idxs)

        clear_output()
finally:
    # Runs on normal completion, on 'q' and on interruption alike, so answers already given
    # are not lost. Everything is merged once instead of re-copying `golden` per answer.
    if labelled_rows:
        golden = pd.concat([golden, *labelled_rows], axis=0)
        # Remove the pairs that have been labelled. A membership mask (rather than
        # drop-by-label) also copes with a pair that appears more than once in the index.
        choices_df = choices_df[~choices_df.index.isin(labelled_pairs)]


In [None]:
# Bare expression: shows the golden set after the labelling session as the cell output.
golden

In [None]:
# Renumber the rows 0..n-1, discarding the old row labels.
golden = golden.reset_index(drop=True)
golden

In [None]:
# Bare expression: shows the pairs that are still unlabelled as the cell output.
choices_df

In [None]:
# Persist the labelling state. The pending pairs keep their index (it is read back as the
# first two CSV columns); the golden set is written without its row index.
choices_df.to_csv(choices_path)
golden.to_csv(golden_path, index=False)

In [None]:
# Fraction of the n x n record grid that survived blocking.
# NOTE(review): for deduplication the number of distinct pairs is n*(n-1)/2, not n**2 —
# confirm which denominator is intended.
n_candidates = len(candidate_links)
n_records = len(schema_cleaned)
print(n_candidates / n_records**2)

# Pre-Labeling

In [None]:
# Attribute-wise string similarity for every candidate pair.
compare_cl = rl.Compare()

# "name" is the only comparison given a threshold (0.85); the others are registered with the
# library's default settings.
compare_cl.string("name", "name", threshold=0.85, label="name")
for column in ("country", "founded year", "industry", "sector", "address", "city", "ceo"):
    compare_cl.string(column, column, label=column)

# Compare the cleaned records explicitly: the candidate pairs were generated from
# `schema_cleaned`, so the features must be computed on the same data rather than depending
# on `schema` happening to hold the cleaned values.
features = compare_cl.compute(candidate_links, schema_cleaned, schema_cleaned)

In [None]:
# Bare expression: shows the comparison features (one row per candidate pair) as the cell output.
features

In [None]:
# Summary statistics of each comparison column.
features.describe()

In [None]:
# Total score of each pair (sum over its comparison columns), then how many pairs share each
# total, listed from the highest total down.
pair_scores = features.sum(axis=1)
pair_scores.value_counts().sort_index(ascending=False)

In [None]:
# Keep only the pairs whose summed comparison score is strictly greater than 3.
is_above_cutoff = features.sum(axis=1) > 3
record_linked = features[is_above_cutoff]

In [None]:
# Plain-text dump of the pairs that passed the score cut-off.
print(record_linked)

# Classification

In [None]:
# Unsupervised classification of the pre-selected pairs with recordlinkage's ECM classifier.
# NOTE(review): `binarize=0.6` is presumably the cut-off used to turn the continuous
# similarities into 0/1 features — confirm against the recordlinkage documentation.
ECM = rl.ECMClassifier(binarize=0.6)

# Name the two index levels before fitting.
record_linked.index.names = ['id1', 'id2']
print(len(record_linked))

# Fit on the pre-selected pairs and predict on the same pairs in one call.
result = ECM.fit_predict(record_linked)
print(len(result))