In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Step tokenize


In [3]:
# stop word removal using nltk
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 


# sample sentence
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

# set of stop words
stop_words = set(stopwords.words('english')) 

# tokens of words  
word_tokens = word_tokenize(text) 
print(word_tokens)

['He', 'determined', 'to', 'drop', 'his', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', 'his', 'claims', 'to', 'the', 'wood-cuting', 'and', 'fishery', 'rihgts', 'at', 'once', '.', 'He', 'was', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'rights', 'had', 'become', 'much', 'less', 'valuable', ',', 'and', 'he', 'had', 'indeed', 'the', 'vaguest', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'were', '.']


### Step : Stop Word Removal </br>
What are Stopwords?- “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc.

In [4]:
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
      filtered_sentence.append(w) 

print("\n\nOriginal Sentence \n\n")
print(" ".join(word_tokens)) 

print("\n\nFiltered Sentence \n\n")
print(" ".join(filtered_sentence)) 



Original Sentence 


He determined to drop his litigation with the monastry , and relinguish his claims to the wood-cuting and fishery rihgts at once . He was the more ready to do this becuase the rights had become much less valuable , and he had indeed the vaguest idea where the wood and river in question were .


Filtered Sentence 


He determined drop litigation monastry , relinguish claims wood-cuting fishery rihgts . He ready becuase rights become much less valuable , indeed vaguest idea wood river question .


In [5]:
# stopword removal using gensim
from gensim.parsing.preprocessing import remove_stopwords

# pass the sentence in the remove_stopwords function
result = remove_stopwords("""He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, 
and he had indeed the vaguest idea where the wood and river in question were.""")

print('\n\n Filtered Sentence \n\n')
print(result)



 Filtered Sentence 


He determined drop litigation monastry, relinguish claims wood-cuting fishery rihgts once. He ready becuase rights valuable, vaguest idea wood river question were.


Details about the different libraries to remove stopwords [here](https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a).

### Stemming & Lemmetization </br>
Stemming- is a text normalization technique that cuts off the end or beginning of a word / strip suffixes </br>
Lemmetization - procedure of obtaining the root form of the word </br>


In [7]:
from nltk.stem import PorterStemmer

Stem_words = []
ps =PorterStemmer()
for w in filtered_sentence:
    rootWord=ps.stem(w)
    Stem_words.append(rootWord)
print(filtered_sentence)
print(Stem_words)

['He', 'determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood-cuting', 'fishery', 'rihgts', '.', 'He', 'ready', 'becuase', 'rights', 'become', 'much', 'less', 'valuable', ',', 'indeed', 'vaguest', 'idea', 'wood', 'river', 'question', '.']
['he', 'determin', 'drop', 'litig', 'monastri', ',', 'relinguish', 'claim', 'wood-cut', 'fisheri', 'rihgt', '.', 'he', 'readi', 'becuas', 'right', 'becom', 'much', 'less', 'valuabl', ',', 'inde', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


In [8]:
import nltk
from nltk.stem import WordNetLemmatizer

lemma_word = []
wordnet_lemmatizer = WordNetLemmatizer()
# v stands for verb, a stands for adjective and n stands for noun
for w in filtered_sentence:
    word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
    word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
    word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
    lemma_word.append(word3)
print(lemma_word)

['He', 'determine', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claim', 'wood-cuting', 'fishery', 'rihgts', '.', 'He', 'ready', 'becuase', 'right', 'become', 'much', 'le', 'valuable', ',', 'indeed', 'vague', 'idea', 'wood', 'river', 'question', '.']


### data preperation for ML

In [11]:
# read target class data
df= pd.read_csv('TI-customers_scraped.csv', low_memory=False)

In [10]:
df.head()

Unnamed: 0,bus_nm,web_links_cntry,description_cntry,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13737,Unnamed: 13738,Unnamed: 13739,Unnamed: 13740,Unnamed: 13741,Unnamed: 13742,Unnamed: 13743,Unnamed: 13744,Unnamed: 13745,Unnamed: 13746
0,"Universal Traffic Service, Inc.","['https://www.utsnet.com/', 'https://www.utsne...",['Universal Traffic Service is the partner of ...,,,,,,,,...,,,,,,,,,,
1,PLATINUM CARGO LOGISTICS,"['https://www.platinumcargologistics.com/', 'h...",['Intelligent Solutions...Powerful Results Pl...,,,,,,,,...,,,,,,,,,,
2,"Tazmanian Freight Forwarding, Inc.","['https://www.tazmanian.com/', 'https://www.ta...","[""When your shipping needs have a time-definit...",,,,,,,,...,,,,,,,,,,
3,Mesca (Ascent Global Logistics),['https://ascentgl.com/blog/roadrunner-transpo...,['Roadrunner Transportation Systems (“Roadrunn...,,,,,,,,...,,,,,,,,,,
4,GlobalTranz,['https://www.globaltranz.com/shippers/warehou...,"['Seamless Warehouse, Fulfillment and Distribu...",,,,,,,,...,,,,,,,,,,


In [13]:
from nltk.stem import SnowballStemmer

# nlp pre-processing
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(lemmatize_stemming(token))
    return str(result)

In [14]:
df['feature'] = df['bus_nm']+ df['description_cntry']
df['feature'] = df['feature'] .str.lower()

df['fulfillment'] = np.where(df['feature'].str.contains('shipping|warehouse|warehousing|fulfillment|3pl|logistic'), 1, 0)

In [16]:
df['fulfillment'].value_counts()

1    3242
0     305
Name: fulfillment, dtype: int64

In [17]:
df_fulfillment = df[df['fulfillment']==1]

In [18]:
df_fulfillment.head()

Unnamed: 0,bus_nm,web_links_cntry,description_cntry,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13739,Unnamed: 13740,Unnamed: 13741,Unnamed: 13742,Unnamed: 13743,Unnamed: 13744,Unnamed: 13745,Unnamed: 13746,feature,fulfillment
1,PLATINUM CARGO LOGISTICS,"['https://www.platinumcargologistics.com/', 'h...",['Intelligent Solutions...Powerful Results Pl...,,,,,,,,...,,,,,,,,,platinum cargo logistics['intelligent solution...,1
2,"Tazmanian Freight Forwarding, Inc.","['https://www.tazmanian.com/', 'https://www.ta...","[""When your shipping needs have a time-definit...",,,,,,,,...,,,,,,,,,"tazmanian freight forwarding, inc.[""when your ...",1
3,Mesca (Ascent Global Logistics),['https://ascentgl.com/blog/roadrunner-transpo...,['Roadrunner Transportation Systems (“Roadrunn...,,,,,,,,...,,,,,,,,,mesca (ascent global logistics)['roadrunner tr...,1
4,GlobalTranz,['https://www.globaltranz.com/shippers/warehou...,"['Seamless Warehouse, Fulfillment and Distribu...",,,,,,,,...,,,,,,,,,"globaltranz['seamless warehouse, fulfillment a...",1
5,A.P. MÃ¸ller - MÃ¦rsk A/S,['https://help.shopify.com/en/manual/shipping/...,"[""With Shopify, you can use a fulfillment serv...",,,,,,,,...,,,,,,,,,"a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",1


In [19]:
df1= df_fulfillment.copy()
df1= df1[['bus_nm','feature']]
df1['label']='fulfillment'
df1.columns= ['name', 'description', 'class']

In [20]:
df_other = pd.read_csv('t_scraped_clean_1300.csv')

In [21]:
df_other.head()

Unnamed: 0,globl_bus_nm,web_links,description_list
0,Synnex Corporation,"['https://www.synnexcorp.com/', 'http://www.sy...","['Learn More', 'Learn More', ' Synnex Corporat..."
1,"Varian Medical Systems, Inc.","['https://www.varian.com/', 'https://www.varia...","['Were dedicated to forging a new, more unifyi..."
2,Otis Elevator Company,"['https://www.otis.com/', 'https://www.otis.co...",['To give you the best possible browsing exper...
3,General Electric Company,"['https://www.ge.com/', 'https://www.ge.com/ab...","[""Access the latest press releases, media cont..."
4,Duke University,"['https://duke.edu/', 'https://pratt.duke.edu/...",[' Working@Duke News More News from Duke Today...


In [22]:
df_other['feature'] = df_other['globl_bus_nm'] + df_other['description_list']
df_other['feature'] = df_other['feature'] .str.lower()

In [23]:
df_other['discard'] = np.where(df_other['feature'].str.contains('logistic|3pl|fulfillment|transportation|supply chain'),1,0)

In [24]:
df_other['discard'].value_counts()

0    821
1    407
Name: discard, dtype: int64

In [25]:
df2= df_other[df_other['discard']==0].sample(700)

In [26]:
df2 =df2[['globl_bus_nm','feature']]
df2['label'] = 'other'
df2.columns= ['name', 'description', 'class']

In [27]:
print(df1.head(), df2.head())
print(df1.shape, df2.shape)

                                 name  \
1            PLATINUM CARGO LOGISTICS   
2  Tazmanian Freight Forwarding, Inc.   
3     Mesca (Ascent Global Logistics)   
4                         GlobalTranz   
5           A.P. MÃ¸ller - MÃ¦rsk A/S   

                                         description        class  
1  platinum cargo logistics['intelligent solution...  fulfillment  
2  tazmanian freight forwarding, inc.["when your ...  fulfillment  
3  mesca (ascent global logistics)['roadrunner tr...  fulfillment  
4  globaltranz['seamless warehouse, fulfillment a...  fulfillment  
5  a.p. mã¸ller - mã¦rsk a/s["with shopify, you c...  fulfillment                                       name  \
457              Synovus Financial Corp.   
947                        Zimmer Biomet   
788  Fresenius Medical Care AG & Co KGaA   
393                      Globe Life Inc.   
866            Novartis International AG   

                                           description  class  
457  synovus fin

In [28]:
df_nlp = pd.concat([df1, df2], axis=0)
df_nlp.head()

Unnamed: 0,name,description,class
1,PLATINUM CARGO LOGISTICS,platinum cargo logistics['intelligent solution...,fulfillment
2,"Tazmanian Freight Forwarding, Inc.","tazmanian freight forwarding, inc.[""when your ...",fulfillment
3,Mesca (Ascent Global Logistics),mesca (ascent global logistics)['roadrunner tr...,fulfillment
4,GlobalTranz,"globaltranz['seamless warehouse, fulfillment a...",fulfillment
5,A.P. MÃ¸ller - MÃ¦rsk A/S,"a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",fulfillment


In [29]:
# define custom functions for machine learning

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def crossvalidate_classifier(model, X, y, cm=False):
    
    scores = cross_val_score(model, X, y, scoring='f1_macro', cv=5)
    #scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    print("%0.2f f-1 score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
    y_pred = cross_val_predict(model, X, y, cv=5)

    if(cm):
        conf_mat = confusion_matrix(y, y_pred, labels=['fulfillment','other'])
        plot_confusion_matrix(conf_mat, classes = ['fulfillment','other'])
    
def evaluate_classifier(model, X_train, X_test, y_train, y_test):
      
    # ... fit your model here ...
    model.fit(X_train,y_train)

    # Run predict on your tfidf test data to get your predictions
    pred = model.predict(X_test)

    # Calculate your accuracy using the metrics module
    acc_score = metrics.accuracy_score(pred, y_test)
    print("Accuracy Score:   %0.3f" % acc_score)
    
    f1score = metrics.f1_score(pred, y_test, average='macro')
    print("F-1 Score:   %0.3f" % f1score)

    # Calculate the confusion matrices for the tfidf_svc model
    #svc_cm = metrics.confusion_matrix(y_test, pred, labels=['fulfillment','other'])

    # Plot the confusion matrix using the plot_confusion_matrix function
    #plot_confusion_matrix(svc_cm, classes = ['fulfillment','other'], title="Confusion Matrix")
    
    return model

def best_hyperparam(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=5, scoring_fit='f1_score',
                       do_probabilities = False):
    gs = RandomizedSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data, epochs=100)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [30]:
df_nlp = df_nlp[~df_nlp.description.isna()]
df_nlp

Unnamed: 0,name,description,class
1,PLATINUM CARGO LOGISTICS,platinum cargo logistics['intelligent solution...,fulfillment
2,"Tazmanian Freight Forwarding, Inc.","tazmanian freight forwarding, inc.[""when your ...",fulfillment
3,Mesca (Ascent Global Logistics),mesca (ascent global logistics)['roadrunner tr...,fulfillment
4,GlobalTranz,"globaltranz['seamless warehouse, fulfillment a...",fulfillment
5,A.P. MÃ¸ller - MÃ¦rsk A/S,"a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",fulfillment
...,...,...,...
68,Tarkett USA Inc,"tarkett usa inc['', 'see available quickship s...",other
786,"Deutsche Telekom, Inc.","deutsche telekom, inc.['life is for sharing. c...",other
888,"Niagara Bottling, LLC","niagara bottling, llc['utilizing state-of-the-...",other
980,Xerox Corporation,"xerox corporation[""*company separation\r\n\t\t...",other


In [34]:
df_nlp= pd.read_csv('TI_nlp.csv')
df_nlp

Unnamed: 0.1,Unnamed: 0,name,description,class,tokens
0,1,PLATINUM CARGO LOGISTICS,platinum cargo logistics['intelligent solution...,fulfillment,"['platinum', 'cargo', 'logist', 'intellig', 's..."
1,2,"Tazmanian Freight Forwarding, Inc.","tazmanian freight forwarding, inc.[""when your ...",fulfillment,"['tazmanian', 'freight', 'forward', 'ship', 'n..."
2,3,Mesca (Ascent Global Logistics),mesca (ascent global logistics)['roadrunner tr...,fulfillment,"['mesca', 'ascent', 'global', 'logist', 'roadr..."
3,4,GlobalTranz,"globaltranz['seamless warehouse, fulfillment a...",fulfillment,"['globaltranz', 'seamless', 'warehous', 'fulfi..."
4,5,A.P. MÃ¸ller - MÃ¦rsk A/S,"a.p. mã¸ller - mã¦rsk a/s[""with shopify, you c...",fulfillment,"['mã', 'ller', 'mã', 'rsk', 'shopifi', 'use', ..."
...,...,...,...,...,...
1697,1215,"Applied Industrial Technologies, Inc.","applied industrial technologies, inc.['have an...",other,"['appli', 'industri', 'technolog', 'account', ..."
1698,711,TERUMO CORPORATION,terumo corporation['terumo\'s response to covi...,other,"['terumo', 'corpor', 'terumo', 'respons', 'cov..."
1699,798,Abbott Laboratories,abbott laboratories['investors newsroom respon...,other,"['abbott', 'laboratori', 'investor', 'newsroom..."
1700,576,Emerson Electric Co.,"emerson electric co.[""product(s) in your cart ...",other,"['emerson', 'electr', 'product', 'cart', 'avai..."


In [35]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\10inm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

In [37]:
df_nlp['tokens'] = df_nlp['description'].map(preprocess)

In [38]:
df_nlp.head()
df_nlp['class'].value_counts()

fulfillment    1002
other           700
Name: class, dtype: int64

In [39]:
X = df_nlp['tokens']
y= df_nlp['class']

In [40]:
# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53, test_size=0.25, shuffle=True, stratify=y )

In [41]:
y_test.head()

797     fulfillment
865     fulfillment
309     fulfillment
807     fulfillment
1676          other
Name: class, dtype: object

In [42]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=0.05)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [43]:
print(count_train.shape, tfidf_train.shape)

(1276, 965) (1276, 965)


In [44]:
print(count_vectorizer.vocabulary_)

{'intern': 449, 'transport': 900, 'air': 30, 'freight': 360, 'ocean': 582, 'custom': 207, 'brokerag': 95, 'servic': 788, 'warehous': 942, 'distribut': 242, 'shipment': 796, 'good': 379, 'usa': 920, 'copyright': 191, 'public': 684, 'compani': 160, 'home': 408, 'continu': 184, 'click': 142, 'box': 88, 'let': 488, 'know': 469, 'robot': 750, 'sure': 864, 'browser': 96, 'support': 863, 'javascript': 459, 'cooki': 189, 'block': 83, 'load': 503, 'inform': 436, 'review': 745, 'term': 877, 'polici': 638, 'inquiri': 440, 'relat': 721, 'messag': 541, 'contact': 181, 'team': 871, 'provid': 683, 'refer': 714, 'id': 415, 'global': 376, 'site': 808, 'world': 957, 'healthi': 398, 'categori': 115, 'best': 79, 'american': 34, 'live': 500, 'award': 72, 'hous': 411, 'select': 778, 'highest': 404, 'class': 139, 'sustain': 865, 'launch': 480, 'channel': 128, 'asia': 55, 'pacif': 603, 'indic': 432, 'repres': 730, 'industri': 434, 'leader': 483, 'open': 589, 'arriv': 52, 'high': 402, 'energi': 280, 'brand': 9

In [45]:
crossvalidate_classifier(RandomForestClassifier(), tfidf_train, y_train)

0.99 f-1 score with a standard deviation of 0.00


In [46]:
crossvalidate_classifier(GradientBoostingClassifier(), tfidf_train, y_train)

0.99 f-1 score with a standard deviation of 0.00


References: Analytics Vidya blogs, 