## Record Linkage del dataset

In [5]:
import pandas as pd
import numpy as np

import recordlinkage
import warnings
from recordlinkage.index import Full

warnings.filterwarnings('ignore')

In [6]:
# Read the CB Insights JSON file from the schemaAlignment directory
# and preview its first ten rows.
cbi = pd.read_json(path_or_buf='schemaAlignment/cbinsight.json')
cbi.head(n=10)

Unnamed: 0,name,valuation,dateJoined,country,city,industry,investors,founded,stage,totalRaised
0,Lacework,$8.3B,1/7/2021,United States,San Jose,Cybersecurity,"Sutter Hill Ventures, Liberty Global Ventures,...",2015,Series E,$1.907B
1,Tipalti,$8.3B,10/6/2020,United States,San Mateo,Fintech,"01 Advisors, Zeev Ventures, Group 11",2010,Series F,$556M
2,Tempus,$8.1B,3/21/2018,United States,Chicago,Health,"New Enterprise Associates, T. Rowe Associates,...",2015,Unattributed - II,$1.345B
3,Anduril,$8.48B,9/11/2019,United States,Irvine,Artificial intelligence,"Andreessen Horowitz, Founders Fund, Revolution...",2017,Series E,$2.309B
4,Bolt,$8.4B,5/29/2018,Estonia,Tallinn,Auto & transportation,"Didi Chuxing, Diamler, TMT Investments",2013,Series F,$1.971B
5,ByteDance,$140B,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S...",2012,Private Equity,$7.44B
6,Niantic,$9B,11/24/2017,United States,San Francisco,Mobile & telecommunications,"Nintendo, Google, Pokemon Company Internationa...",2015,Series D,$770M
7,OYO Rooms,$9B,9/25/2018,India,Gurugram,Travel,"SoftBank Group, Sequoia Capital India,Lightspe...",2012,Secondary Market,$3.114B
8,Kavak,$8.7B,10/1/2020,Mexico,Lerma de Villada,E-commerce & direct-to-consumer,"DST Global, SoftBank Group, Mountain Nazca",2014,Line of Credit - II,$2.702B
9,Personio,$8.5B,1/19/2021,Germany,Munich,Internet software & services,"Global Founders Capital, Nortzone Ventures, Pi...",2015,Series E - II,$724.83M


In [8]:
# Index the data: build the candidate record pairs.
indexer = recordlinkage.Index()
indexer.block('name') # Block on the "name" column (blocking key used for indexing)
# NOTE(review): cbi is linked against itself, so the printed pairs include
# self-pairs such as (0, 0) and both orderings of a duplicate, e.g. (4, 1145)
# and (1145, 4) — confirm this is intended rather than a single-frame
# deduplication index.
pairs = indexer.index(cbi, cbi)

In [9]:
# Show the candidate pairs produced by blocking on "name".
print(pairs)

MultiIndex([(   0,    0),
            (   1,    1),
            (   2,    2),
            (   3,    3),
            (   4,    4),
            (   4, 1145),
            (1145,    4),
            (1145, 1145),
            (   5,    5),
            (   6,    6),
            ...
            (1175, 1175),
            (1176, 1176),
            (1177, 1177),
            (1178, 1178),
            (1179, 1179),
            (1180, 1180),
            (1181, 1181),
            (1182, 1182),
            (1183, 1183),
            (1184, 1184)],
           length=1191)


In [12]:
# Inspect the record with index label 0 to see the available fields.
cbi.loc[0]

name                                                    Lacework
valuation                                                  $8.3B
dateJoined                                              1/7/2021
country                                            United States
city                                                    San Jose
industry                                           Cybersecurity
investors      Sutter Hill Ventures, Liberty Global Ventures,...
founded                                                     2015
stage                                                   Series E
totalRaised                                              $1.907B
Name: 0, dtype: object

In [13]:
# Total number of company names, duplicates included.
num_cbi = cbi.loc[:, 'name']
num_cbi.size

1185

In [10]:
# Number of distinct company names; a value below the total row count
# means some names occur more than once.
unique_cbi = pd.unique(cbi['name'])
unique_cbi.size

1182

### Training

In [14]:
# Split the candidate pairs by position: the first 590 go to training,
# the remainder to testing.
TRAIN_PAIR_COUNT = 590
train_pairs, test_pairs = pairs[:TRAIN_PAIR_COUNT], pairs[TRAIN_PAIR_COUNT:]

In [15]:
# Columns to compare, listed in the order the feature columns must appear.
# Each entry is (column, comparison kind, extra keyword arguments).
field_rules = [
    ('name', 'string', {'threshold': 0.80}),
    ('valuation', 'exact', {}),
    ('dateJoined', 'exact', {}),
    ('country', 'string', {'method': 'jarowinkler', 'threshold': 0.85}),
    ('city', 'string', {'method': 'jarowinkler', 'threshold': 0.95}),
    ('industry', 'string', {'method': 'jarowinkler', 'threshold': 0.90}),
    ('investors', 'string', {'method': 'jarowinkler', 'threshold': 0.90}),
    ('founded', 'exact', {}),
    ('stage', 'string', {'method': 'jarowinkler', 'threshold': 0.95}),
    ('totalRaised', 'exact', {}),
]

compare = recordlinkage.Compare()
for column, kind, options in field_rules:
    # "name" passes no method, so it keeps the library's default string metric.
    register = compare.string if kind == 'string' else compare.exact
    register(column, column, label=column, **options)

# Display the configured comparer as the cell output.
compare

<Compare>

In [17]:
# Compute the comparison vector for every training pair.
training_features = compare.compute(train_pairs, cbi, cbi)

# "score" is the row-wise total of the comparison columns for each pair.
comparison_columns = training_features.loc[:, 'name':'totalRaised']
training_features['score'] = comparison_columns.sum(axis=1)

In [27]:
# Keep only the record pairs whose score is greater than 2 and turn the pair
# index into ordinary columns. The filter reads the existing "score" column:
# summing the whole row would add "score" on top of the comparison columns it
# already totals, so every pair would be counted twice against the threshold.
training_matches = training_features[training_features['score'] > 2].reset_index()

# Only the two index columns identify a pair; drop every comparison column.
toDrop = ['name', 'dateJoined', 'valuation', 'country', 'city', 'industry', 'investors', 'founded', 'stage', 'totalRaised', 'score']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head(9)

Unnamed: 0,level_0,level_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,4,1145
6,1145,4
7,1145,1145
8,5,5


In [28]:
# Convert the match table into a MultiIndex of record pairs.
# The isinstance guard keeps the cell re-runnable: MultiIndex.from_frame only
# accepts a DataFrame, and after the first run training_matches is already a
# MultiIndex, so an unguarded second run would raise.
if isinstance(training_matches, pd.DataFrame):
    training_matches = pd.MultiIndex.from_frame(training_matches)
training_features.head()

Unnamed: 0,Unnamed: 1,name,valuation,dateJoined,country,city,industry,investors,founded,stage,totalRaised,score
0,0,1.0,1,1,1.0,1.0,1.0,1.0,1,1.0,1,10.0
1,1,1.0,1,1,1.0,1.0,1.0,1.0,1,1.0,1,10.0
2,2,1.0,1,1,1.0,1.0,1.0,1.0,1,1.0,1,10.0
3,3,1.0,1,1,1.0,1.0,1.0,1.0,1,1.0,1,10.0
4,4,1.0,1,1,1.0,1.0,1.0,1.0,1,1.0,1,10.0


In [29]:
# Number of training pairs that have a comparison vector.
training_features.shape[0]

590

In [30]:
# Number of training pairs labelled as matches.
training_matches.shape[0]

590

In [31]:
# Fit a Naive Bayes classifier on the training comparison vectors, passing
# training_matches as the pairs known to be matches.
# NOTE(review): training_features still carries the "score" column, and the
# match labels were obtained by thresholding these same features, so the
# classifier is fitted on its own labelling rule — confirm this is intended.
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

### Test 

In [32]:
# Columns to compare: reuse the Compare object configured in the Training
# section instead of rebuilding an identical copy here. Train and test
# features must come from the same comparison rules, and a second
# hand-maintained copy of the ten rules could silently drift from the first.
compare

<Compare>

In [33]:
# Compute the comparison vector for every test pair.
test_features = compare.compute(test_pairs, cbi, cbi)

# "score" is the row-wise total of the comparison columns for each pair.
comparison_columns = test_features.loc[:, 'name':'totalRaised']
test_features['score'] = comparison_columns.sum(axis=1)

In [34]:
# Keep only the record pairs whose score is greater than 2 and turn the pair
# index into ordinary columns. The filter reads the existing "score" column:
# summing the whole row would add "score" on top of the comparison columns it
# already totals, so every pair would be counted twice against the threshold.
test_matches = test_features[test_features['score'] > 2].reset_index()

# Only the two index columns identify a pair; drop every comparison column.
toDrop = ['name', 'dateJoined', 'valuation', 'country', 'city', 'industry', 'investors', 'founded', 'stage', 'totalRaised', 'score']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head(9)

Unnamed: 0,level_0,level_1
0,582,582
1,583,583
2,584,584
3,585,585
4,586,586
5,587,587
6,588,588
7,589,589
8,590,590


In [35]:
# Number of test pairs that have a comparison vector.
test_features.shape[0]

601

In [36]:
# Number of test pairs labelled as matches.
test_matches.shape[0]

601

In [37]:
# Display the table of test pairs labelled as matches (one row per pair).
test_matches

Unnamed: 0,level_0,level_1
0,582,582
1,583,583
2,584,584
3,585,585
4,586,586
...,...,...
596,1180,1180
597,1181,1181
598,1182,1182
599,1183,1183


In [38]:
# Convert the match table into a MultiIndex of record pairs.
# The isinstance guard keeps the cell re-runnable: MultiIndex.from_frame only
# accepts a DataFrame, and after the first run test_matches is already a
# MultiIndex, so an unguarded second run would raise.
if isinstance(test_matches, pd.DataFrame):
    test_matches = pd.MultiIndex.from_frame(test_matches)

In [39]:
# Classify the test comparison vectors; the result is used below as the set
# of pairs predicted to be matches.
# NOTE(review): test_features includes the "score" column, mirroring the
# frame the classifier was fitted on.
predictions = classifier.predict(test_features)

### Evaluation

In [40]:
# Size of the evaluated pair set, shared by the confusion matrix and accuracy.
total_pairs = len(test_features)

# Confusion matrix plus the metrics derived from expected vs. predicted matches.
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, total_pairs)
fscore = recordlinkage.fscore(confusion_matrix)
recall = recordlinkage.recall(test_matches, predictions)
precision = recordlinkage.precision(test_matches, predictions)
accuracy = recordlinkage.accuracy(test_matches, predictions, total_pairs)

print('confusion matrix')
print(confusion_matrix)

# One line per metric; the F-score label carries the two leading blank lines.
for metric_label, metric_value in (
    ('\n\nfscore', fscore),
    ('recall', recall),
    ('precision', precision),
    ('accuracy', accuracy),
):
    print(metric_label, metric_value)

confusion matrix
[[601   0]
 [  0   0]]


fscore 1.0
recall 1.0
precision 1.0
accuracy 1.0


In [41]:
# Number of pairs the classifier predicted as matches.
predicted_count = len(predictions)
print(predicted_count)

601


In [42]:
# Pairs present in test_matches but absent from predictions (false negatives).
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([], )

In [43]:
# Show the two records behind the first false negative, if there is one.
# An explicit emptiness check replaces the bare try/except so that genuine
# errors (for example a missing variable) are no longer reported as
# "No False Negatives Present".
if len(false_negatives) == 0:
    print("No False Negatives Present")
else:
    # Each entry of the MultiIndex is a (left record, right record) pair.
    fn_from_dfA, fn_from_dfB = false_negatives[0]

    display(cbi[cbi.index == fn_from_dfA])
    display(cbi[cbi.index == fn_from_dfB])

No False Negatives Present
