In [68]:
import re
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer

import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool

import time

In [2]:
registry = mwdsbe.load_registry() # geopandas df
license = licenses.CommercialActivityLicenses().get()

In [65]:
# clean data
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name']).drop_duplicates()
cleaned_license = cleaned_license.dropna(subset=['company_name'])

In [75]:
len(cleaned_registry)

3119

In [76]:
len(cleaned_license)

203541

In [92]:
def tf_idf_merge(
    left: pd.DataFrame,
    right: pd.DataFrame,
    on: str = None,
    left_on: str = None,
    right_on: str = None,
    score_cutoff: int = 90,
    max_matches=1,
    suffixes=("_x", "_y"),
):
    if on is not None:
        left_on = right_on = on

    # Verify input parameters
    if left_on is None or right_on is None:
        raise ValueError("Please specify `on` or `left_on/right_on`")
    if left_on not in left.columns:
        raise ValueError(f"'{left_on}' is not a column in `left`")
    if right_on not in right.columns:
        raise ValueError(f"'{right_on}' is not a column in `right`")

    # get the left and right strings
    left_data = left[left_on].dropna().astype(str)
    right_data = right[right_on].dropna().astype(str).rename_axis("right_index")

    # Merge together into single Series
    all_data = pd.concat([left_data, right_data], axis=0)
    all_data_unique = pd.concat(
        [left_data.drop_duplicates(), right_data], axis=0
    )

    # save the index and then reset it
    unique_index = all_data_unique.index
    all_data_unique = all_data_unique.reset_index(drop=True)

    # Do the TF-IDF vectorization
    vectorizer = TfidfVectorizer(min_df=1, analyzer=_ngrams)
    tf_idf_matrix = vectorizer.fit_transform(all_data_unique.values)

    # Get the matches as a sparse matrix
    matches = _fast_cossim_top(
        tf_idf_matrix,
        tf_idf_matrix.transpose(),
        ntop=max_matches + 1,
        lower_bound=score_cutoff / 100,
    )

    # Format the matches into a DataFrame
    left_size = len(left_data)
    matches_df = _format_matches(matches, all_data_unique, unique_index, left_size)

    # Merge in the right
    matches_df = (
        pd.merge(
            left,
            pd.merge(
                matches_df,
                right.rename_axis("right_index").reset_index(),
                on="right_index",
            ).set_index("left_index"),
            how="left",
            left_index=True,
            right_index=True,
            suffixes=suffixes,
        )
        .drop(labels=["left_side", "right_side"], axis=1)
        .rename(columns={"similarity": "match_probability"})
    )

    return matches_df

In [5]:
def _ngrams(string, n=3):
    """
    Calculate n-grams for the input string.
    """
    string = re.sub(r"[,-./]|\sBD", r"", string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return ["".join(ngram) for ngram in ngrams]

In [6]:
def _fast_cossim_top(A, B, ntop, lower_bound=0):
    """
    Calculate the cosine similarity for the top matches.
    """
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape

    idx_dtype = np.int32

    nnz_max = M * ntop

    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M,
        N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr,
        indices,
        data,
    )

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [7]:
def _format_matches(sparse_matrix, name_vector_unique, unique_index, left_size):
    """
    Internal function to format the sparse matrix of matches 
    into a pandas DataFrame.
    """
    non_zeros = sparse_matrix.nonzero()

    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    nr_matches = sparsecols.size

    out = []

    for index in range(0, nr_matches):

        # the left/right string match
        left_side = name_vector_unique.iloc[sparserows[index]]
        right_side = name_vector_unique.iloc[sparsecols[index]]

        # the index in name vector
        lidx = name_vector_unique.index[sparserows[index]]
        ridx = name_vector_unique.index[sparsecols[index]]

        # the original index
        left_index = unique_index[lidx]
        right_index = unique_index[ridx]

        # similarity
        similarity = sparse_matrix.data[index]

        # FIXME
        if lidx != ridx and lidx < left_size and ridx > left_size:
            out.append([left_index, right_index, left_side, right_side, similarity])

    return pd.DataFrame(
        out,
        columns=["left_index", "right_index", "left_side", "right_side", "similarity"],
    )

In [10]:
test_registry = cleaned_registry.iloc[51].to_frame().T

In [11]:
test_license = cleaned_license[cleaned_license.company_name == 'abc construction']

In [12]:
test_license

Unnamed: 0,license_num,issue_date,license_status,company_name
1635,10250.0,1994-08-25 00:00:00+00:00,Active,abc construction
22670,107179.0,2001-02-28 00:00:00+00:00,Active,abc construction
26275,144101.0,2002-04-05 00:00:00+00:00,Active,abc construction
202694,824402.0,2019-09-20 00:00:00+00:00,Active,abc construction


In [128]:
merged = tf_idf_merge(test_registry, test_license, on="company_name", score_cutoff=85)

In [129]:
merged

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,right_index,match_probability,license_num,issue_date,license_status,company_name_y
51,abc construction,,KIMBERLY,NUGENT,714 Dunksferry Rd.,Bensalem,PA,19020,714 Dunksferry Rd.,Bensalem,...,,,,,202694,1.0,824402.0,2019-09-20 00:00:00+00:00,Active,abc construction
51,abc construction,,KIMBERLY,NUGENT,714 Dunksferry Rd.,Bensalem,PA,19020,714 Dunksferry Rd.,Bensalem,...,,,,,26275,1.0,144101.0,2002-04-05 00:00:00+00:00,Active,abc construction


In [130]:
merged = drop_duplicates_by_date(merged, "issue_date")

In [131]:
merged

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,right_index,match_probability,license_num,issue_date,license_status,company_name_y
51,abc construction,,KIMBERLY,NUGENT,714 Dunksferry Rd.,Bensalem,PA,19020.0,714 Dunksferry Rd.,Bensalem,...,,,,,202694,1.0,824402.0,2019-09-20 00:00:00+00:00,Active,abc construction


In [62]:
## full data

In [86]:
t1 = time.time()
full_merged = tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=85)
t = time.time() - t1

In [87]:
print('Execution time:', t/60, 'min')

Execution time: 1.7626996596654256 min


In [88]:
matched = full_merged.dropna(subset=['company_name_y'])

In [89]:
print('Match:', len(matched), 'out of', len(cleaned_registry))

Match: 1612 out of 3119


In [90]:
matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,right_index,match_probability,license_num,issue_date,license_status,company_name_y
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304 39.96427),131105.0,1.000000,480115.0,2009-07-31 00:00:00+00:00,Active,119 degrees architects
7,24 hour cleaning,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.01428 40.13135),17320.0,1.000000,120711.0,1999-08-30 00:00:00+00:00,Active,24 hour cleaning
8,259 strategies,,Chaka,"Fattah, Jr.","Two Logan Square, Suite 1900",Philadelphia,PA,19103.0,"Two Logan Square, Suite 1900",Philadelphia,...,100-20 N 18TH ST,39.955726,-75.169784,POINT (-75.16978 39.95573),102422.0,1.000000,346817.0,2005-11-28 00:00:00+00:00,Active,259 strategies
15,521 management,,Kris,Bowman,"1000 1st Avenue, Suite 104",King Of Prussia,PA,19406.0,"1000 1st Avenue, Suite 104",King Of Prussia,...,,,,,26009.0,1.000000,136428.0,2002-03-08 00:00:00+00:00,Active,521 management
16,6 degrees consulting,,Robert,Lawson,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,PA,15206.0,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,...,,,,,171401.0,1.000000,681851.0,2015-10-26 00:00:00+00:00,Active,6 degrees consulting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3110,zenga engineering,,Gwendolyn,Lodise,313 East Broad Street,Palmyra,NJ,8065.0,313 East Broad Street,Palmyra,...,,,,,136814.0,1.000000,509641.0,2010-07-26 00:00:00+00:00,Active,zenga engineering
3113,zizza highway,,Arlene,Zizza,382 CONCHESTER HIGHWAY,Glen Mills,PA,19342.0,382 CONCHESTER HIGHWAY,Glen Mills,...,,,,,189864.0,1.000000,770177.0,2018-03-23 00:00:00+00:00,Active,zizza highway
3113,zizza highway,,Arlene,Zizza,382 CONCHESTER HIGHWAY,Glen Mills,PA,19342.0,382 CONCHESTER HIGHWAY,Glen Mills,...,,,,,30230.0,1.000000,161685.0,2003-05-09 00:00:00+00:00,Active,zizza highway
3115,zones,,Mr.,Lalji,1102 15TH ST SW,Auburn,WA,98001.0,1102 15TH ST SW,Auburn,...,,,,,26525.0,0.860842,145187.0,2002-05-06 00:00:00+00:00,Active,zone


In [132]:
matched = drop_duplicates_by_date(matched, "issue_date")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [133]:
matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,right_index,match_probability,license_num,issue_date,license_status,company_name_y
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304 39.96427),131105.0,1.000000,480115.0,2009-07-31 00:00:00+00:00,Active,119 degrees architects
7,24 hour cleaning,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.01428 40.13135),17320.0,1.000000,120711.0,1999-08-30 00:00:00+00:00,Active,24 hour cleaning
8,259 strategies,,Chaka,"Fattah, Jr.","Two Logan Square, Suite 1900",Philadelphia,PA,19103.0,"Two Logan Square, Suite 1900",Philadelphia,...,100-20 N 18TH ST,39.955726,-75.169784,POINT (-75.16978 39.95573),102422.0,1.000000,346817.0,2005-11-28 00:00:00+00:00,Active,259 strategies
15,521 management,,Kris,Bowman,"1000 1st Avenue, Suite 104",King Of Prussia,PA,19406.0,"1000 1st Avenue, Suite 104",King Of Prussia,...,,,,,26009.0,1.000000,136428.0,2002-03-08 00:00:00+00:00,Active,521 management
16,6 degrees consulting,,Robert,Lawson,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,PA,15206.0,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,...,,,,,171401.0,1.000000,681851.0,2015-10-26 00:00:00+00:00,Active,6 degrees consulting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3107,zavorski masonry restoration,,Lisa,Zavorski,717 Ford Avenue,Langhorne,PA,19047.0,717 Ford Avenue,Langhorne,...,,,,,101652.0,1.000000,343160.0,2005-10-31 00:00:00+00:00,Active,zavorski masonry restoration
3110,zenga engineering,,Gwendolyn,Lodise,313 East Broad Street,Palmyra,NJ,8065.0,313 East Broad Street,Palmyra,...,,,,,136814.0,1.000000,509641.0,2010-07-26 00:00:00+00:00,Active,zenga engineering
3113,zizza highway,,Arlene,Zizza,382 CONCHESTER HIGHWAY,Glen Mills,PA,19342.0,382 CONCHESTER HIGHWAY,Glen Mills,...,,,,,189864.0,1.000000,770177.0,2018-03-23 00:00:00+00:00,Active,zizza highway
3115,zones,,Mr.,Lalji,1102 15TH ST SW,Auburn,WA,98001.0,1102 15TH ST SW,Auburn,...,,,,,26525.0,0.860842,145187.0,2002-05-06 00:00:00+00:00,Active,zone


In [127]:
def drop_duplicates_by_date(df, column):
    df.sort_values(by=column, ascending=False, inplace=True)
    df = df.groupby(df.index).first()
    return df