To run the following script, install the following:

- pip install python-Levenshtein
- pip install fuzzywuzzy

Levenshtein may require a Microsoft C++ build package through Visual Studio. Follow prompts as required  

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [4]:
# Standardized shipping methods based primarily upon what is selectable through the FedEx API here:
# https://www.fedex.com/ratefinder/home. 'Home Delivery' and 'Smartpost' are not selectable
# through the API. This is because these services are available to businesses only.
fedex_methods = ['Same Day', 'First Overnight', 'Priority Overnight', 'First Overnight',
                 'Priority Overnight', 'Standard Overnight', '2Day AM', '2Day', 'Express Saver',
                 'Ground','Home Delivery','Smartpost']

# Standardized shipping methods based primarily on what is selectable through the API here:
# https://wwwapps.ups.com/ctc/request?loc=en_US. 'Surepost' and 'Standard' are not selectable
# through the API. Standard is Ground to the 48 continguous states, whereas Ground includes
# Alaska and Hawaii.
ups_methods = ['Next Day Air Early', 'Next Day Air', 'Next Day Air Saver',
               '2nd Day Air A.M.', '2nd Day Air', '3 Day Select', 'Ground', 'Surepost', 'Standard']

In [5]:
# Reads in and labels the data.
data = pd.read_pickle('C:/Users/bjtur/Documents/Capstone/test_merged.pickle')
data.columns = ['year_week', 'business_sid', 'industry', 'sub_industry', 'shipper',
       'service_type', 'package_count', 'weight', 'shipment_date',
       'delivery_date', 'delivery_time', 'freight_charges',
       'freight_discount_amount', 'misc_charges', 'misc_discount_amount',
       'net_charge_amount', 'zone', 'sender_city', 'sender_zip',
       'recipient_city', 'recipient_zip']

In [6]:
data_sample = data.sample(frac=0.1, replace=False)

In [7]:
data_sample = data_sample[data_sample.zone.apply(lambda x: x.isnumeric())]
data_sample = data_sample.astype({'freight_charges':'float64',
                                  'freight_discount_amount':'float64',
                                  'misc_charges':'float64',
                                  'misc_discount_amount':'float64',
                                  'net_charge_amount':'float64',
                                  'zone':'int64'})
data_sample.zone %= 10

In [8]:
# Applies the 'fuzz.partial_ratio' fuzzy macthing algorithm to each record based upon the record's service_type.
# The partial_ratio function is designed to return the shipping method with the highest score as a two pair tuple
# i.e., (standardized_shipping, score)

data_fuzzy_match = []
for row in data_sample[['shipper','service_type']].itertuples():
    if row.shipper == 'fedex':
        data_fuzzy_match.append(process.extractOne(
            row.service_type,
            fedex_methods,
            scorer = fuzz.partial_ratio))
    else:
        data_fuzzy_match.append(process.extractOne(
            row.service_type,
            ups_methods,
            scorer = fuzz.partial_ratio))

In [9]:
print(len(data_sample))
data_sample.insert(6, 'std_service_type', [method for method, score in data_fuzzy_match])
data_sample = data_sample.assign(std_service_type_score = [score for method, score in data_fuzzy_match])
data_sample = data_sample[data_sample.std_service_type_score >= 70]
data_sample = data_sample.drop('std_service_type_score', axis=1)
print(len(data_sample))

372292
365777


In [15]:
a = 1
a.astype(str)

AttributeError: 'int' object has no attribute 'astype'

In [2]:
data = pd.read_pickle('test_merged.pickle')

In [55]:
data.dtypes

year_week                            int64
business_sid                        object
industry                            object
sub_industry                        object
shipper                             object
service_type                        object
std_service_type                    object
package_count                        int64
weight                             float64
shipment_date               datetime64[ns]
delivery_date               datetime64[ns]
delivery_time              timedelta64[ns]
freight_charges                    float64
freight_discount_amount            float64
misc_charges                       float64
misc_discount_amount               float64
net_charge_amount                  float64
zone                                object
sender_city                         object
sender_state                        object
sender_zip                          object
recipient_city                      object
recipient_state                     object
recipient_z

In [53]:
data[(~data.recipient_state.isin(state_abbs)) & (data.recipient_state != '')]
data[(~data.sender_state.isin(state_abbs)) & (data.sender_state != '')]

Unnamed: 0,year_week,business_sid,industry,sub_industry,shipper,service_type,std_service_type,package_count,weight,shipment_date,...,misc_charges,misc_discount_amount,net_charge_amount,zone,sender_city,sender_state,sender_zip,recipient_city,recipient_state,recipient_zip
580322,201828,0C88F8BDE6,,,fedex,ground,Ground,1,35.6,2018-06-20,...,0.0,0.0,29.97,6,STRATFORD,ON,N4Z0A,FORT COLLINS,CO,80524
586425,201836,D86C057DE2,RETAIL,ECOMMERCE,fedex,ground,Ground,1,36.0,2018-08-16,...,1.72,0.0,29.24,6,CORNWALLIS,NS,B0S1H,SPRINGFIELD,MO,65802
586406,201836,D86C057DE2,RETAIL,ECOMMERCE,fedex,ground,Ground,1,36.0,2018-08-16,...,1.72,0.0,29.24,6,CORNWALLIS,NS,B0S1H,SPRINGFIELD,MO,65802
1611824,201838,73C6D0DBE2,RETAIL,MISC RETAIL,fedex,ground,Ground,1,20.1,2018-08-29,...,0.0,0.0,17.36,2,CALGARY,AB,T2J5Y,MEDLEY,FL,33178
586423,201836,D86C057DE2,RETAIL,ECOMMERCE,fedex,ground,Ground,1,40.0,2018-08-16,...,1.88,0.0,31.96,6,CORNWALLIS,NS,B0S1H,SPRINGFIELD,MO,65802
586418,201836,D86C057DE2,RETAIL,ECOMMERCE,fedex,ground,Ground,1,36.0,2018-08-16,...,1.72,0.0,29.24,6,CORNWALLIS,NS,B0S1H,SPRINGFIELD,MO,65802
683312,201840,0309614A6E,OTHER,OTHER,fedex,home delivery,Home Delivery,1,53.0,2018-09-13,...,28.44,0.0,63.94,6,PARKSVILLE,BC,V9P2W,SEDONA,AZ,86336
1001770,201849,66FD113B5A,OTHER,OTHER,fedex,ground,Ground,1,42.0,2018-11-19,...,10.29,0.0,24.88,4,OAKVILLE,ON,L6J7Z,CONCORD,NC,28027
1308835,201849,4A69F06503,,,ups,ups surepost - less than 1 lb\t2\t3,Surepost,1,10.9,2018-11-26,...,3.14,0.26,8.23,4,EASTON,US,61265,STERLING,OH,44276
1380,201901,GVA9UDFMG6,TRANSPORTATION,MOTOR FREIGHT TRANSPORTATION/WAREHOUSE,fedex,ground,Ground,1,15.0,2018-12-14,...,2.25,0.0,24.83,8,MARKHAM,ON,L6G1A,BOTHELL,WA,98021


In [22]:
# Standardized state names and codes of the 48 contiguous states based upon USPS standards found here:
# https://www.ups.com/worldshiphelp/WS14/ENU/AppHelp/Codes/State_Province_Codes.htm
state_names_to_codes = {'Alabama':'AL', 'Arizona':'AZ', 'Arkansas':'AR', 'Armed Forces America':'AA',
 'Armed Forces Europe':'AE', 'Armed Forces Pacific':'AP', 'California':'CA', 'Colorado':'CO',
 'Connecticut':'CT', 'Delaware':'DE', 'District of Columbia':'DC', 'Florida':'FL', 'Georgia':'GA',
 'Hawaii':'HI', 'Idaho':'ID', 'Illinois':'IL', 'Indiana':'IN', 'Iowa':'IA', 'Kansas':'KS', 'Kentucky':'KY',
 'Louisiana':'LA', 'Maine':'ME', 'Maryland':'MD', 'Massachusetts':'MA', 'Michigan':'MI', 'Minnesota':'MN',
 'Mississippi':'MS', 'Missouri':'MO', 'Montana':'MT', 'Nebraska':'NE', 'Nevada':'NV', 'New Hampshire':'NH',
 'New Jersey':'NJ', 'New Mexico':'NM', 'New York':'NY', 'North Carolina':'NC', 'North Dakota':'ND',
 'Ohio':'OH', 'Oklahoma':'OK', 'Oregon':'OR', 'Pennsylvania':'PA', 'Rhode Island':'RI', 'South Carolina':'SC',
 'South Dakota':'SD', 'Tennessee':'TN', 'Texas':'TX', 'Utah':'UT', 'Vermont':'VT', 'Virginia':'VA',
 'Washington':'WA', 'West Virginia':'WV', 'Wisconsin':'WI', 'Wyoming':'WY'}
state_names = list(state_names_to_codes.keys())
state_codes = list(state_names_to_codes.values())

In [23]:
state_names = list(state_name_to_abb.keys())

In [24]:
state_names

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'Armed Forces America',
 'Armed Forces Europe',
 'Armed Forces Pacific',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [25]:
state_abbs = list(state_names_to_abbs.values())

In [26]:
state_abbs

['AL',
 'AK',
 'AZ',
 'AR',
 'AA',
 'AE',
 'AP',
 'CA',
 'CO',
 'CT',
 'DE',
 'DC',
 'FL',
 'GA',
 'HI',
 'ID',
 'IL',
 'IN',
 'IA',
 'KS',
 'KY',
 'LA',
 'ME',
 'MD',
 'MA',
 'MI',
 'MN',
 'MS',
 'MO',
 'MT',
 'NE',
 'NV',
 'NH',
 'NJ',
 'NM',
 'NY',
 'NC',
 'ND',
 'OH',
 'OK',
 'OR',
 'PA',
 'RI',
 'SC',
 'SD',
 'TN',
 'TX',
 'UT',
 'VT',
 'VA',
 'WA',
 'WV',
 'WI',
 'WY']