## Record Linkage del dataset

In [10]:
import pandas as pd
import numpy as np

import recordlinkage
import warnings
from recordlinkage.index import Full

# Silence every warning raised for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings from
# pandas/recordlinkage — consider narrowing it to the specific warning category.
warnings.filterwarnings('ignore')

In [11]:
# Read the merged company table from the schemaAlignment folder and preview it.
companies_csv_path = 'schemaAlignment/aziende_merged_finale.csv'
companies = pd.read_csv(companies_csv_path)
companies

Unnamed: 0.1,Unnamed: 0,name,company_website,country,number_of_employees,industry,founded_year,market_cap,revenue,rank,sector,city,valuation,stock,ceo,market_value
0,0,Proteus Digital Health Careers,www.proteus.com,"Redwood City, CA",201 to 500 employees,Pharmaceuticals,2004,,,,,,,,,
1,1,Zelis Careers,www.zelis.com,"Bedminster, NJ","1,001 to 5,000 employees",Information Technology & Services,2016,,,,,,,,,
2,2,UpNest Careers,www.upnest.com,"Burlingame, CA",1 to 50 employees,Real Estate,2013,,,,,,,,,
3,3,conferacity Careers,conferacity.com,"Menlo Park, CA",1 to 50 employees,Broadcast & Online Media,2013,,,,,,,,,
4,4,Zenoti Careers,www.zenoti.com,"Bellevue, WA","501 to 1,000 employees",Computer Software,2010,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120254,120254,Wayne's Coffee,,,,Consumer services,,,,,Restaurants & bars,,,,,
120255,120255,WESC,,,,Consumer goods,,,,,Clothing & accessories,,,,,
120256,120256,WG Film,,,,Consumer services,,,,,Broadcasting & entertainment,,,,,
120257,120257,WM-data,,,,Technology,,,,,Software,,,,,


In [12]:
# Remove the stray index columns ('Unnamed: 0', 'Unnamed: 0.1', ...) that were written
# into the CSV by earlier saves. Selecting them by prefix (instead of hard-coding one
# name) drops all of them and keeps the cell safe to re-run: once they are gone the
# list is empty and drop() is a no-op rather than a KeyError.
saved_index_columns = [col for col in companies.columns if str(col).startswith('Unnamed')]
companies = companies.drop(columns=saved_index_columns)
companies.head()

Unnamed: 0,name,company_website,country,number_of_employees,industry,founded_year,market_cap,revenue,rank,sector,city,valuation,stock,ceo,market_value
0,Proteus Digital Health Careers,www.proteus.com,"Redwood City, CA",201 to 500 employees,Pharmaceuticals,2004,,,,,,,,,
1,Zelis Careers,www.zelis.com,"Bedminster, NJ","1,001 to 5,000 employees",Information Technology & Services,2016,,,,,,,,,
2,UpNest Careers,www.upnest.com,"Burlingame, CA",1 to 50 employees,Real Estate,2013,,,,,,,,,
3,conferacity Careers,conferacity.com,"Menlo Park, CA",1 to 50 employees,Broadcast & Online Media,2013,,,,,,,,,
4,Zenoti Careers,www.zenoti.com,"Bellevue, WA","501 to 1,000 employees",Computer Software,2010,,,,,,,,,


In [13]:
# Index the data: build the candidate record pairs that will be compared.
indexer = recordlinkage.Index()
indexer.block('name') # Block on the "name" column (the blocking key used for indexing)
pairs = indexer.index(companies, companies)

# NOTE(review): the frame is linked against itself, so the printed result contains
# self-pairs such as (0, 0); presumably each duplicate pair also shows up in both
# orders. Confirm whether deduplication mode, indexer.index(companies), was intended.
print(pairs)

MultiIndex([(     0,      0),
            (     1,      1),
            (     2,      2),
            (     3,      3),
            (     4,      4),
            (     5,      5),
            (     6,      6),
            (     7,      7),
            (     8,      8),
            (     9,      9),
            ...
            (120248, 120248),
            (120249, 120249),
            (120251, 120251),
            (120252, 120252),
            (120253, 120253),
            (120254, 120254),
            (120255, 120255),
            (120256, 120256),
            (120257, 120257),
            (120258, 120258)],
           length=432050)


In [14]:
# Total number of records in the "name" column (missing names included).
num_companies = companies['name']
len(num_companies)

120259

In [15]:
# Number of distinct values found in the "name" column.
unique_companies = companies['name'].unique()
len(unique_companies)

71243

### Training

In [16]:
# Positional split of the candidate pairs: the first 216000 go to training,
# everything after that to testing.
train_pair_count = 216000
train_pairs, test_pairs = pairs[:train_pair_count], pairs[train_pair_count:]

In [17]:
# List the column names available for comparison.
companies.columns

Index(['name', 'company_website', 'country', 'number_of_employees', 'industry',
       'founded_year', 'market_cap', 'revenue', 'rank', 'sector', 'city',
       'valuation', 'stock', 'ceo', 'market_value'],
      dtype='object')

In [20]:
# Columns to compare for every candidate pair
compare = recordlinkage.Compare()

# All string fields use the same Jaro-Winkler similarity and cut-off.
fuzzy_match = {"method": "jarowinkler", "threshold": 0.85}

compare.string("name", "name", **fuzzy_match)
compare.string("country", "country", **fuzzy_match)
compare.numeric("founded_year", "founded_year", method="linear")
compare.numeric("revenue", "revenue", method="linear")
compare.string("sector", "sector", **fuzzy_match)

<Compare>

In [21]:
# Compute the comparison vectors for the training pairs
training_features = compare.compute(train_pairs, companies, companies)

# The score column sums the comparison values of each record pair.
# The comparison rules were registered without `label=`, so the feature columns carry
# default labels rather than the source column names, and "city" is never compared:
# a 'name':'city' label slice cannot select them. Every column of the freshly computed
# frame is a comparison value, so sum them all.
training_features['score'] = training_features.sum(axis=1)

: 

: 

In [None]:
# Keep only the record pairs whose comparison score is greater than 2.
# The score is recomputed from the comparison columns alone: summing the whole frame
# would count the existing 'score' column a second time and halve the real threshold.
training_pair_scores = training_features.drop(columns='score', errors='ignore').sum(axis=1)

# Only the pair identifiers are needed downstream, so select no feature columns and
# turn the two index levels into ordinary columns.
training_matches = training_features.loc[training_pair_scores > 2, []].reset_index()
training_matches.head(9)

In [None]:
# Convert the match table into a MultiIndex of record pairs. The guard makes the cell
# safe to re-run: after the first run training_matches is already a MultiIndex, and
# MultiIndex.from_frame only accepts a DataFrame.
if isinstance(training_matches, pd.DataFrame):
    training_matches = pd.MultiIndex.from_frame(training_matches)
training_features.head()

In [None]:
# Number of candidate pairs in the training feature table.
len(training_features)

In [None]:
# Number of training pairs labelled as matches.
len(training_matches)

In [None]:
# Fit a Naive Bayes classifier on the training comparison vectors, with the pairs in
# training_matches as the positive (match) examples.
# NOTE(review): training_features still contains the 'score' column that the match
# labels were thresholded from — confirm it is meant to be used as a feature.
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

### Test 

In [None]:
# Columns to compare for every candidate pair (same rules as in the training section)
compare = recordlinkage.Compare()

# All string fields use the same Jaro-Winkler similarity and cut-off.
fuzzy_match = {"method": "jarowinkler", "threshold": 0.85}

compare.string("name", "name", **fuzzy_match)
compare.string("country", "country", **fuzzy_match)
compare.numeric("founded_year", "founded_year", method="linear")
compare.numeric("revenue", "revenue", method="linear")
compare.string("sector", "sector", **fuzzy_match)

In [None]:
# Compute the comparison vectors for the test pairs
test_features = compare.compute(test_pairs, companies, companies)

# The score column sums the comparison values of each record pair.
# The comparison rules were registered without `label=`, so the feature columns carry
# default labels rather than the source column names, and "city" is never compared:
# a 'name':'city' label slice cannot select them. Every column of the freshly computed
# frame is a comparison value, so sum them all.
test_features['score'] = test_features.sum(axis=1)

In [None]:
# Keep only the record pairs whose comparison score is greater than 2.
# The score is recomputed from the comparison columns alone: summing the whole frame
# would count the existing 'score' column a second time and halve the real threshold.
test_pair_scores = test_features.drop(columns='score', errors='ignore').sum(axis=1)

# Only the pair identifiers are needed downstream, so select no feature columns and
# turn the two index levels into ordinary columns.
test_matches = test_features.loc[test_pair_scores > 2, []].reset_index()
test_matches.head(9)

In [None]:
# Number of candidate pairs in the test feature table.
len(test_features)

In [None]:
# Number of test pairs labelled as matches.
len(test_matches)

In [None]:
# Display the table of test pairs labelled as matches.
test_matches

In [None]:
# Convert the match table into a MultiIndex of record pairs. The guard makes the cell
# safe to re-run: after the first run test_matches is already a MultiIndex, and
# MultiIndex.from_frame only accepts a DataFrame.
if isinstance(test_matches, pd.DataFrame):
    test_matches = pd.MultiIndex.from_frame(test_matches)

In [None]:
# Classify the test comparison vectors with the fitted model; the result is compared
# against test_matches in the evaluation cells below.
predictions = classifier.predict(test_features)

### Evaluation

In [None]:
# Total number of candidate pairs, needed for the true-negative count.
total_test_pairs = len(test_features)

# Confusion matrix of the labelled matches versus the predicted links
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, total_test_pairs)
print('confusion matrix', confusion_matrix, sep='\n')

# F-score derived from the confusion matrix, followed by recall, precision and accuracy
fscore = recordlinkage.fscore(confusion_matrix)
print('\n\nfscore', fscore)

recall = recordlinkage.recall(test_matches, predictions)
print('recall', recall)

precision = recordlinkage.precision(test_matches, predictions)
print('precision', precision)

accuracy = recordlinkage.accuracy(test_matches, predictions, total_test_pairs)
print('accuracy', accuracy)

In [None]:
# Number of pairs the classifier predicted as matches.
print(len(predictions))

In [None]:
# Pairs labelled as matches in test_matches that are absent from the predictions.
false_negatives = test_matches.difference(predictions)
false_negatives

In [None]:
# Show the two company records behind the first false negative, if there is one.
# An explicit emptiness check replaces the former bare `except:`, which reported
# "No False Negatives Present" for any failure at all (undefined names, interrupts, ...).
if len(false_negatives) > 0:
    # Each entry is a pair of row labels: (left record, right record).
    fn_from_dfA, fn_from_dfB = false_negatives[0]

    display(companies[companies.index == fn_from_dfA])
    display(companies[companies.index == fn_from_dfB])
else:
    print("No False Negatives Present")