In [1]:
import mwdsbe
import schuylkill as skool
import pandas as pd
import time

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
## Functions
def drop_duplicates_by_date(df, date_column):
    df.sort_values(by=date_column, ascending=False, inplace=True)
    df = df.loc[~df.index.duplicated(keep="first")]
    df.sort_index(inplace=True)
    return df

## Data

In [4]:
registry = mwdsbe.load_registry() # geopandas df

In [5]:
# contract payments in fy 2017
payments = pd.read_csv(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe\mwdsbe\data\Payments\city_payments_detailed_2017.csv')

In [6]:
len(payments)

238894

Clean Data

In [7]:
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_payments = skool.clean_strings(payments, ['vendor_name'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_payments = cleaned_payments.dropna(subset=['vendor_name'])

In [8]:
len(cleaned_registry)

3119

In [12]:
len(cleaned_payments)

238894

In [33]:
len(cleaned_payments.vendor_name.unique()) # number of unique vendors

5493

MWDSBE Payments Data

In [23]:
mwdsbe_payments = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\payments\tf-idf-85.xlsx')

In [24]:
mwdsbe_payments.rename(columns={'Unnamed: 0': 'left_index'}, inplace=True)
mwdsbe_payments.set_index('left_index', inplace=True)

In [30]:
len(mwdsbe_payments.vendor_name.unique())

207

In [32]:
207 / 5506 * 100 # percentage of minority vendor number in payments data

3.759535052669815

## Analysis

### Percentage of payments for MWDSBE

In [175]:
mwdsbe_payments = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\payments\tf-idf-85.xlsx')

In [176]:
mwdsbe_payments.rename(columns={'Unnamed: 0': 'left_index'}, inplace=True)
mwdsbe_payments.set_index('left_index', inplace=True)

In [177]:
len(mwdsbe_payments)

369

Filter payments to only have 1-2 classes

In [178]:
mwdsbe_payments = mwdsbe_payments.loc[mwdsbe_payments['char_'].isin([1,2])]

Get all payments of mwdsbe
* current mwdsbe_payments data only contains 1-2 payments

In [179]:
mwdsbe_vendors = mwdsbe_payments.vendor_name.unique()

In [180]:
len(mwdsbe_vendors)

130

In [181]:
all_mwdsbe_payments = cleaned_payments.loc[cleaned_payments['vendor_name'].isin(mwdsbe_vendors)]

In [182]:
len(all_mwdsbe_payments)

5670

Total Transaction Amount of MWDSBE

In [183]:
tot_amt_MWDSBE = all_mwdsbe_payments['transaction_amount'].sum()

Total Transaction Amount of all business payments

In [184]:
tot_amt_all = all_payments['transaction_amount'].sum()

In [185]:
# percentage
tot_amt_MWDSBE / tot_amt_all * 100

2.3796189968448065

### Comparing top 10 Locations of OEO report and registry and matched

#### Registry

In Philadelphia

In [103]:
philadelphia = cleaned_registry.loc[cleaned_registry.local]

In [104]:
len(philadelphia)/len(cleaned_registry) * 100

29.272202629047772

Other PA cities

In [154]:
cleaned_registry_df = pd.DataFrame(cleaned_registry.drop(labels=['geometry'], axis=1))

In [155]:
other_PA_cities = cleaned_registry_df.loc[((~cleaned_registry_df.out_of_state) & (~cleaned_registry_df.local))]

In [156]:
len(other_PA_cities)/len(cleaned_registry) * 100

27.540878486694453

#### Matched

In [191]:
philadelphia = mwdsbe_payments.loc[mwdsbe_payments.local]

In [192]:
len(philadelphia)/len(mwdsbe_payments) * 100

46.05263157894737