In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [135]:
# define custom functions for machine learning

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def crossvalidate_classifier(model, X, y, cm=False):
    
    scores = cross_val_score(model, X, y, scoring='f1_macro', cv=5)
    #scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    print("%0.2f f-1 score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    y_pred = cross_val_predict(model, X, y, cv=5)

    if(cm):
        conf_mat = confusion_matrix(y, y_pred, labels=['fulfillment','other'])
        plot_confusion_matrix(conf_mat, classes = ['fulfillment','other'])
    
def evaluate_classifier(model, X_train, X_test, y_train, y_test):
      
    # ... fit your model here ...
    model.fit(X_train,y_train)

    # Run predict on your tfidf test data to get your predictions
    pred = model.predict(X_test)

    # Calculate your accuracy using the metrics module
    acc_score = metrics.accuracy_score(pred, y_test)
    print("Accuracy Score:   %0.3f" % acc_score)
    
    f1score = metrics.f1_score(pred, y_test, average='macro')
    print("F-1 Score:   %0.3f" % f1score)

    # Calculate the confusion matrices for the tfidf_svc model
    #svc_cm = metrics.confusion_matrix(y_test, pred, labels=['fulfillment','other'])

    # Plot the confusion matrix using the plot_confusion_matrix function
    #plot_confusion_matrix(svc_cm, classes = ['fulfillment','other'], title="Confusion Matrix")
    
    return model

def best_hyperparam(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=5, scoring_fit='f1_score',
                       do_probabilities = False):
    gs = RandomizedSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data, epochs=100)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [2]:
df= pd.read_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI-customers_scraped.csv')

In [335]:
df.head()

Unnamed: 0.1,Unnamed: 0,cntry_enti_nbr,cntry_bus_nm,web_links_cntry,description_cntry,feature,fulfillment
0,0,487211337,"Universal Traffic Service, Inc.","['https://www.utsnet.com/', 'https://www.utsne...",['Universal Traffic Service is the partner of ...,"universal traffic service, inc.['universal tra...",0
1,1,488212349,PLATINUM CARGO LOGISTICS,"['https://www.platinumcargologistics.com/', 'h...",['Intelligent Solutions...Powerful Results Pl...,platinum cargo logistics['intelligent solution...,0
2,2,488487617,"Tazmanian Freight Forwarding, Inc.","['https://www.tazmanian.com/', 'https://www.ta...","[""When your shipping needs have a time-definit...","tazmanian freight forwarding, inc.[""when your ...",1
3,3,485749529,Mesca (Ascent Global Logistics),['https://ascentgl.com/blog/roadrunner-transpo...,['Roadrunner Transportation Systems (“Roadrunn...,mesca (ascent global logistics)['roadrunner tr...,0
4,4,1213806603,GlobalTranz,['https://www.globaltranz.com/shippers/warehou...,"['Seamless Warehouse, Fulfillment and Distribu...","globaltranz['seamless warehouse, fulfillment a...",1


In [353]:
df['feature'] = df['cntry_bus_nm']+ df['description_cntry']
df['feature'] = df['feature'] .str.lower()

df['fulfillment'] = np.where(df['feature'].str.contains('shipping|warehouse|warehousing|fulfillment|3pl|logistic'), 1, 0)

In [354]:
df['fulfillment'].value_counts()

1    991
0    284
Name: fulfillment, dtype: int64

In [355]:
df_fulfillment = df[df['fulfillment']==1]

In [356]:
df_fulfillment.head()

Unnamed: 0.1,Unnamed: 0,cntry_enti_nbr,cntry_bus_nm,web_links_cntry,description_cntry,feature,fulfillment
1,1,488212349,PLATINUM CARGO LOGISTICS,"['https://www.platinumcargologistics.com/', 'h...",['Intelligent Solutions...Powerful Results Pl...,platinum cargo logistics['intelligent solution...,1
2,2,488487617,"Tazmanian Freight Forwarding, Inc.","['https://www.tazmanian.com/', 'https://www.ta...","[""When your shipping needs have a time-definit...","tazmanian freight forwarding, inc.[""when your ...",1
3,3,485749529,Mesca (Ascent Global Logistics),['https://ascentgl.com/blog/roadrunner-transpo...,['Roadrunner Transportation Systems (“Roadrunn...,mesca (ascent global logistics)['roadrunner tr...,1
4,4,1213806603,GlobalTranz,['https://www.globaltranz.com/shippers/warehou...,"['Seamless Warehouse, Fulfillment and Distribu...","globaltranz['seamless warehouse, fulfillment a...",1
5,5,487220833,A.P. MÃ¸ller - MÃ¦rsk A/S,['https://help.shopify.com/en/manual/shipping/...,"[""With Shopify, you can use a fulfillment serv...","a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",1


In [357]:
df1= df_fulfillment.copy()
df1= df1[['cntry_bus_nm','feature']]
df1['label']='fulfillment'
df1.columns= ['name', 'description', 'class']

In [296]:
df1.shape

(747, 3)

In [333]:
df.to_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI_scraped1.csv')

In [298]:
df_other = pd.read_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Sales Vertical Classification/FY2021/archived_03_26_2021/train/t_scraped_clean_1300.csv')

In [299]:
df_other.head()

Unnamed: 0,globl_bus_nm,web_links,description_list
0,Synnex Corporation,"['https://www.synnexcorp.com/', 'http://www.sy...","['Learn More', 'Learn More', ' Synnex Corporat..."
1,"Varian Medical Systems, Inc.","['https://www.varian.com/', 'https://www.varia...","['Were dedicated to forging a new, more unifyi..."
2,Otis Elevator Company,"['https://www.otis.com/', 'https://www.otis.co...",['To give you the best possible browsing exper...
3,General Electric Company,"['https://www.ge.com/', 'https://www.ge.com/ab...","[""Access the latest press releases, media cont..."
4,Duke University,"['https://duke.edu/', 'https://pratt.duke.edu/...",[' Working@Duke News More News from Duke Today...


In [300]:
df_other['feature'] = df_other['globl_bus_nm'] + df_other['description_list']
df_other['feature'] = df_other['feature'] .str.lower()

In [301]:
df_other['discard'] = np.where(df_other['feature'].str.contains('logistic|3pl|fulfillment|transportation|supply chain'),1,0)

In [302]:
df_other['discard'].value_counts()

0    821
1    340
Name: discard, dtype: int64

In [303]:
df2= df_other[df_other['discard']==0].sample(700)

In [304]:
df2 =df2[['globl_bus_nm','feature']]
df2['label'] = 'other'
df2.columns= ['name', 'description', 'class']

In [305]:
print(df1.head(), df2.head())
print(df1.shape, df2.shape)

                                 name  \
2  Tazmanian Freight Forwarding, Inc.   
4                         GlobalTranz   
5           A.P. MÃ¸ller - MÃ¦rsk A/S   
6      CERASIS FOR UNITED CENTRAL SUP   
9                          OIA GLOBAL   

                                         description        class  
2  tazmanian freight forwarding, inc.["when your ...  fulfillment  
4  globaltranz['seamless warehouse, fulfillment a...  fulfillment  
5  a.p. mã¸ller - mã¦rsk a/s["with shopify, you c...  fulfillment  
6  cerasis for united central sup['the future of ...  fulfillment  
9  oia global['whether you require distribution o...  fulfillment                                 name  \
966   H&E Equipment Services, Inc.   
138                Medtronic, Inc.   
1033           Carrier Corporation   
1056               Plaskolite, LLC   
68                The Misumi Group   

                                            description  class  
966   h&e equipment services, inc.["search use my l

In [306]:
df_nlp = pd.concat([df1, df2], axis=0)
df_nlp.head()

Unnamed: 0,name,description,class
2,"Tazmanian Freight Forwarding, Inc.","tazmanian freight forwarding, inc.[""when your ...",fulfillment
4,GlobalTranz,"globaltranz['seamless warehouse, fulfillment a...",fulfillment
5,A.P. MÃ¸ller - MÃ¦rsk A/S,"a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",fulfillment
6,CERASIS FOR UNITED CENTRAL SUP,cerasis for united central sup['the future of ...,fulfillment
9,OIA GLOBAL,oia global['whether you require distribution o...,fulfillment


In [334]:
df_nlp.to_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI_nlp.csv')

In [307]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
#nltk.download('wordnet')

In [308]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(lemmatize_stemming(token))
    return str(result)

In [309]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

In [310]:
df_nlp['tokens'] = df_nlp['description'].map(preprocess)

In [311]:
df_nlp.head()
df_nlp['class'].value_counts()

fulfillment    747
other          700
Name: class, dtype: int64

In [312]:
X = df_nlp['tokens']
y= df_nlp['class']

In [313]:
# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53, test_size=0.25, shuffle=True, stratify=y )

In [314]:
y_test.head()

206    fulfillment
447          other
142    fulfillment
734    fulfillment
232          other
Name: class, dtype: object

In [315]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [316]:
print(count_train.shape, tfidf_train.shape)

(1085, 1084) (1085, 1084)


In [317]:
print(count_vectorizer.vocabulary_)

{'pacif': 685, 'multi': 635, 'brand': 105, 'compani': 179, 'design': 257, 'develop': 260, 'produc': 757, 'market': 598, 'product': 758, 'write': 1078, 'instrument': 503, 'world': 1075, 'sector': 874, 'licens': 558, 'agreement': 34, 'strateg': 951, 'acquisit': 12, 'believ': 91, 'growth': 438, 'strategi': 952, 'uniqu': 1027, 'build': 113, 'effort': 304, 'earn': 292, 'quarter': 782, 'expand': 351, 'season': 871, 'line': 563, 'add': 18, 'new': 647, 'retail': 834, 'leverag': 557, 'expertis': 358, 'stay': 944, 'power': 731, 'lead': 549, 'manufactur': 595, 'consum': 203, 'sell': 880, 'headquart': 448, 'california': 117, 'popular': 724, 'proprietari': 769, 'includ': 482, 'perfect': 704, 'wide': 1069, 'rang': 788, 'inspir': 499, 'featur': 375, 'premier': 736, 'properti': 768, 'generat': 425, 'care': 125, 'commit': 175, 'help': 455, 'posit': 727, 'impact': 477, 'live': 567, 'visit': 1052, 'com': 169, 'follow': 399, 'right': 841, 'reserv': 827, 'leav': 553, 'websit': 1062, 'enter': 323, 'differ':

In [318]:
crossvalidate_classifier(RandomForestClassifier(), tfidf_train, y_train)

0.98 f-1 score with a standard deviation of 0.01


In [319]:
crossvalidate_classifier(GradientBoostingClassifier(), tfidf_train, y_train)

0.98 f-1 score with a standard deviation of 0.01


In [320]:
y_train.head()

644           other
628           other
699           other
849           other
1242    fulfillment
Name: class, dtype: object

In [321]:
print(tfidf_train.shape, tfidf_test.shape, y_train.shape, y_test.shape)

(1085, 1084) (362, 1084) (1085,) (362,)


In [322]:
model = evaluate_classifier(RandomForestClassifier(), tfidf_train, tfidf_test, y_train, y_test)

Accuracy Score:   0.997
F-1 Score:   0.997


In [323]:
df_pred1= df[df['fulfillment']==0]

In [324]:
df_pred1['description'] = df_pred1['cntry_bus_nm']+ df_pred1['description_cntry']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [325]:
df_pred1['tokens'] = df_pred1['description'].map(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [326]:
tfidf_pred1 = tfidf_vectorizer.transform(df_pred1['tokens'])

In [327]:
prediction= model.predict(tfidf_pred1)

In [328]:
print(prediction)

['fulfillment' 'fulfillment' 'fulfillment' 'other' 'other' 'other'
 'fulfillment' 'fulfillment' 'other' 'fulfillment' 'fulfillment' 'other'
 'other' 'other' 'fulfillment' 'other' 'fulfillment' 'other' 'fulfillment'
 'other' 'other' 'fulfillment' 'fulfillment' 'fulfillment' 'other' 'other'
 'fulfillment' 'fulfillment' 'fulfillment' 'fulfillment' 'fulfillment'
 'fulfillment' 'other' 'fulfillment' 'other' 'other' 'other' 'fulfillment'
 'other' 'fulfillment' 'fulfillment' 'fulfillment' 'fulfillment' 'other'
 'fulfillment' 'other' 'fulfillment' 'fulfillment' 'fulfillment' 'other'
 'fulfillment' 'other' 'other' 'other' 'other' 'other' 'other' 'other'
 'other' 'other' 'other' 'fulfillment' 'fulfillment' 'other' 'other'
 'fulfillment' 'fulfillment' 'fulfillment' 'other' 'fulfillment' 'other'
 'fulfillment' 'fulfillment' 'fulfillment' 'other' 'fulfillment' 'other'
 'other' 'other' 'other' 'other' 'other' 'other' 'other' 'fulfillment'
 'other' 'fulfillment' 'fulfillment' 'other' 'other' 'fulfill

In [329]:
df_pred1['class'] = np.asarray(prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [330]:
df_pred1['class'].value_counts()

other          303
fulfillment    225
Name: class, dtype: int64

In [331]:
df_pred1.head()

Unnamed: 0.1,Unnamed: 0,cntry_enti_nbr,cntry_bus_nm,web_links_cntry,description_cntry,feature,fulfillment,description,tokens,class
0,0,487211337,"Universal Traffic Service, Inc.","['https://www.utsnet.com/', 'https://www.utsne...",['Universal Traffic Service is the partner of ...,"universal traffic service, inc.['universal tra...",0,"Universal Traffic Service, Inc.['Universal Tra...","['univers', 'traffic', 'servic', 'univers', 't...",fulfillment
1,1,488212349,PLATINUM CARGO LOGISTICS,"['https://www.platinumcargologistics.com/', 'h...",['Intelligent Solutions...Powerful Results Pl...,platinum cargo logistics['intelligent solution...,0,PLATINUM CARGO LOGISTICS['Intelligent Solution...,"['platinum', 'cargo', 'logist', 'intellig', 's...",fulfillment
3,3,485749529,Mesca (Ascent Global Logistics),['https://ascentgl.com/blog/roadrunner-transpo...,['Roadrunner Transportation Systems (“Roadrunn...,mesca (ascent global logistics)['roadrunner tr...,0,Mesca (Ascent Global Logistics)['Roadrunner Tr...,"['mesca', 'ascent', 'global', 'logist', 'roadr...",fulfillment
7,7,487889127,"NANKAI ELECTRIC RAILWAY CO., LTD.",['https://www.bloomberg.com/profile/company/90...,"[""To continue, please click the box below to l...","nankai electric railway co., ltd.[""to continue...",0,"NANKAI ELECTRIC RAILWAY CO., LTD.[""To continue...","['nankai', 'electr', 'railway', 'continu', 'cl...",other
8,8,488547798,"Mid-America Overseas, Inc.","['https://www.maoinc.com/', 'https://www.maoin...","[' ', ' ']","mid-america overseas, inc.[' ', ' ']",0,"Mid-America Overseas, Inc.[' ', ' ']","['mid', 'america', 'oversea']",other


In [332]:
df_pred1[['cntry_enti_nbr','cntry_bus_nm','class']].to_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI_fulfillment_predicted.csv')

In [336]:
print(df1.columns, df_pred1.columns)

Index(['name', 'description', 'class'], dtype='object') Index(['Unnamed: 0', 'cntry_enti_nbr', 'cntry_bus_nm', 'web_links_cntry',
       'description_cntry', 'feature', 'fulfillment', 'description', 'tokens',
       'class'],
      dtype='object')


In [358]:
df_ff1= df_fulfillment[['cntry_enti_nbr', 'cntry_bus_nm', 'fulfillment']]
df_ff1.columns= ['cntry_enti_nbr', 'cntry_bus_nm', 'class']

df_ff2= df_pred1[['cntry_enti_nbr','cntry_bus_nm','class']]

In [359]:
TI_all = pd.concat([df_ff1, df_ff2], axis=0)
print(TI_all.shape)
print(TI_all.head())

(1519, 3)
   cntry_enti_nbr                        cntry_bus_nm class
1       488212349            PLATINUM CARGO LOGISTICS     1
2       488487617  Tazmanian Freight Forwarding, Inc.     1
3       485749529     Mesca (Ascent Global Logistics)     1
4      1213806603                         GlobalTranz     1
5       487220833           A.P. MÃ¸ller - MÃ¦rsk A/S     1


In [343]:
TI_all.to_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI_fulfillment_all.csv')

In [360]:
TI_all_nodup= TI_all.drop_duplicates('cntry_enti_nbr')
print(TI_all_nodup.shape)

(640, 3)


In [362]:
TI_all_nodup['class'].value_counts()

1              488
other          121
fulfillment     31
Name: class, dtype: int64

In [363]:
TI_nodup=TI_all_nodup.copy()

In [364]:
TI_all_nodup['class'].replace('fulfillment',1, inplace=True)
TI_all_nodup['class'].replace('other',0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [365]:
TI_all_nodup.to_csv('C:/Users/4022313/OneDrive - MyFedEx/Desktop/Projects/Fullfilment Houses/TI_fulfillment_nodup.csv')