<center><h1 style="color: #555555;">Experience Script Documentation</h1></center>

<div align="center">
    <table style="width: 80%; border-collapse: collapse;">
        <tr>
            <th style="background-color: #6AAFE6; color: #ffffff; padding: 10px;">Context</th>
            <th style="background-color: #6AAFE6; color: #ffffff; padding: 10px;">Approach</th>
            <th style="background-color: #6AAFE6; color: #ffffff; padding: 10px;">Value Created</th>
        </tr>
        <tr>
            <td style="background-color: #E8F4FC; padding: 10px;">Provide the background and purpose of the script.</td>
            <td style="background-color: #E8F4FC; padding: 10px;">Describe the libraries, methods, and thought process.</td>
            <td style="background-color: #E8F4FC; padding: 10px;">Highlight the outcomes, improvements, and conclusions.</td>
        </tr>
    </table>
</div>

In [None]:
import pandas as pd

# Read both supplier-name columns as text so pandas does not infer other types.
dtype = dict.fromkeys(('SUPPLIER_ERP', 'SUPPLIER_NORMALIZED'), 'str')

# Earlier input, kept for reference:
#   invoice_data = pd.read_csv('sap supplier list apr-jun23.csv', encoding='UTF-8-SIG', dtype=dtype)
#
# Load the supplier list, trim surrounding whitespace from both name columns,
# then discard every row where either name is missing.
df = (
    pd.read_csv('hpo supplier list dec23 v.1.csv', encoding='UTF-8-SIG', dtype=dtype)
    .assign(
        SUPPLIER_ERP=lambda frame: frame['SUPPLIER_ERP'].str.strip(),
        SUPPLIER_NORMALIZED=lambda frame: frame['SUPPLIER_NORMALIZED'].str.strip(),
    )
    .dropna(subset=['SUPPLIER_ERP', 'SUPPLIER_NORMALIZED'], how='any')
)



In [None]:
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Custom TF-IDF vectorizer that boosts the TF-IDF scores of the words of interest


class CustomTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer that multiplies the scores of selected vocabulary terms.

    After the regular TF-IDF matrix is computed, every column whose vocabulary
    term appears in ``keyword_list`` is multiplied by ``boost_val``. The boost
    is applied by both ``fit_transform`` and ``transform`` so that the training
    matrix and any later-transformed documents live in the same scaled space.

    Parameters
    ----------
    keyword_list : iterable of str
        Terms whose TF-IDF columns are boosted. A term is boosted only if it is
        an exact key of the fitted ``vocabulary_``. Note that this class
        defaults to ``lowercase=False``, so matching is case-sensitive, and
        with the default ``ngram_range=(1, 1)`` a multi-word entry can never be
        a vocabulary key and is therefore silently skipped.
    boost_val : float
        Multiplicative factor applied to each matching column. A term listed
        more than once in ``keyword_list`` is multiplied once per occurrence.

    All remaining parameters are passed unchanged to ``TfidfVectorizer``.
    ``dtype`` defaults to ``np.float64``: TF-IDF weights are floats, and
    scikit-learn converts an integer dtype to float64 (with a warning) anyway.
    """

    def __init__(self, keyword_list, boost_val,
                 input='content', encoding='utf-8', decode_error='strict',
                 strip_accents=None, lowercase=False, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.float64, norm='l2', use_idf=True,
                 smooth_idf=True, sublinear_tf=False):

        # Parameters added by this subclass (stored verbatim, as scikit-learn
        # estimators are expected to do with constructor arguments).
        self.keyword_list = keyword_list
        self.boost_val = boost_val

        # Everything else is handled by the stock TfidfVectorizer.
        super().__init__(input=input, encoding=encoding, decode_error=decode_error,
                         strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor,
                         tokenizer=tokenizer, analyzer=analyzer, stop_words=stop_words,
                         token_pattern=token_pattern, ngram_range=ngram_range,
                         max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary,
                         binary=binary, dtype=dtype, norm=norm, use_idf=use_idf,
                         smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)

    def _boost_keywords(self, X):
        """Return ``X`` with the keyword columns multiplied by ``boost_val``."""
        scale = np.ones(X.shape[1], dtype=np.float64)
        found_any = False
        for word in self.keyword_list:
            column = self.vocabulary_.get(word)
            if column is not None:
                scale[column] *= self.boost_val
                found_any = True
        if not found_any:
            # Nothing to boost; hand back the matrix untouched.
            return X
        # Right-multiplying by a diagonal matrix scales each column in one
        # sparse operation instead of assigning into CSR columns one at a time.
        return (X @ scipy.sparse.diags(scale)).tocsr()

    def fit_transform(self, raw_documents, y=None):
        """Learn vocabulary and IDF, then return the boosted document-term matrix.

        Overridden because the parent ``fit_transform`` does not go through
        ``transform``; without this override the training matrix would not be
        boosted.
        """
        X = super().fit_transform(raw_documents, y)
        return self._boost_keywords(X)

    def transform(self, raw_documents, copy=True):
        """Return the boosted TF-IDF matrix for ``raw_documents``.

        ``copy`` is accepted for backward compatibility and ignored; it is not
        forwarded because recent scikit-learn releases no longer accept it on
        ``TfidfVectorizer.transform``.
        """
        X = super().transform(raw_documents)
        return self._boost_keywords(X)


In [None]:
# Supplier names that should be emphasized when clustering.
keywords = [
    "AMAZON",
    "BJS WHOLESALE CLUB",
    "BRUEGGERS",
    "DUNKIN DONUTS",
    "MICROSOFT",
    "SAMS CLUB",
    "TWILIO",
    "UBER",
]

In [None]:
# Create a custom TF-IDF vectorizer that up-weights the keyword terms.
vectorizer = CustomTfidfVectorizer(keyword_list=keywords, boost_val=8.0, stop_words="english")

# Fit the vectorizer on the supplier names and compute the TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(df["SUPPLIER_NORMALIZED"])

# Project the keywords into the same TF-IDF space.
keywords_tfidf = vectorizer.transform(keywords)

# Create the KMeans clustering model.
# - random_state makes the clustering (and everything derived from it)
#   reproducible across Restart & Run All.
# - n_init=10 spells out the value scikit-learn was already using by default,
#   which silences the FutureWarning about that default changing.
n_clusters = 80  # Adjust this as per your requirements.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

# Cluster the data
kmeans.fit(tfidf_matrix)

# Get cluster labels and add them back to the main dataframe
labels = kmeans.labels_
df['label'] = labels

# Cosine similarity of every supplier to the centroid of its own cluster.
# One batched call yields an (n_suppliers, n_clusters) matrix; the fancy index
# then picks, for each row, the column of the cluster that row was assigned to.
centroids = kmeans.cluster_centers_
similarity_to_all_centroids = cosine_similarity(tfidf_matrix, centroids)
similarities_to_centroid = similarity_to_all_centroids[np.arange(len(labels)), labels].tolist()

df['similarity'] = similarities_to_centroid

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Inertia of the KMeans model that was fitted in the clustering cell.
# The model is deliberately NOT refitted here: fitting a second KMeans would
# overwrite `kmeans` with a different clustering than the one that produced
# `labels` and `centroids`, so the reported inertia would describe clusters
# that are not the ones stored in the dataframe.
total_inertia = kmeans.inertia_
print(f"Total Inertia: {total_inertia}")

  super()._check_params_vs_input(X, default_n_init=10)


Total Inertia: 28.22430205932443


In [None]:
def find_representative_names(centroids, keywords, keywords_tfidf, tfidf_matrix,
                              supplier_names, threshold=0.75):
    """Pick one display name per cluster centroid.

    Parameters
    ----------
    centroids : array of shape (n_clusters, n_features)
        Cluster centers in TF-IDF space.
    keywords : sequence of str
        Keyword names, in the same order as the rows of ``keywords_tfidf``.
    keywords_tfidf : matrix of shape (n_keywords, n_features)
        TF-IDF vectors of ``keywords``.
    tfidf_matrix : matrix of shape (n_suppliers, n_features)
        TF-IDF vectors of all suppliers.
    supplier_names : pandas.Series
        Supplier names, positionally aligned with the rows of ``tfidf_matrix``.
    threshold : float, default 0.75
        A keyword is used as the cluster name only when its cosine similarity
        to the centroid is strictly greater than this value.

    Returns
    -------
    list of str
        One name per centroid, in centroid order: the best-matching keyword if
        it clears ``threshold``, otherwise the name of the supplier whose
        vector is most similar to the centroid.
    """
    has_keywords = keywords_tfidf.shape[0] > 0
    if has_keywords:
        # One batched call instead of one call per centroid.
        keyword_sims = cosine_similarity(centroids, keywords_tfidf)
        best_keyword = keyword_sims.argmax(axis=1)

    names = []
    for row, center in enumerate(centroids):
        if has_keywords and keyword_sims[row, best_keyword[row]] > threshold:
            names.append(keywords[best_keyword[row]])
            continue
        # No keyword is close enough: fall back to the closest supplier.
        supplier_sims = cosine_similarity(center.reshape(1, -1), tfidf_matrix)
        names.append(supplier_names.iloc[supplier_sims.argmax()])
    return names


# Find the representative name for each cluster centroid
representative_names = find_representative_names(
    centroids, keywords, keywords_tfidf, tfidf_matrix, df["SUPPLIER_NORMALIZED"]
)

In [None]:
# `representative_names` is computed in the previous cell. It is intentionally
# not recomputed here: appending to it again would make the list grow by one
# full set of names every time this cell is run.
if len(representative_names) != len(centroids):
    raise ValueError(
        f"Expected one representative name per cluster ({len(centroids)}), "
        f"found {len(representative_names)}; re-run the cell that builds representative_names."
    )

# Map cluster labels to the representative names
cluster_to_name_map = dict(enumerate(representative_names))

# Add a new column to the dataframe with the representative name of each row's cluster
df['normalized_name'] = df['label'].map(cluster_to_name_map)

In [None]:
# Write the clustered supplier table to CSV, keeping only the reporting columns
# (in this order) and omitting the dataframe index.
df.loc[
    :,
    [
        'label',
        'FILE_NAME',
        'SOURCE_DATA',
        'SUPPLIER_ERP',
        'SUPPLIER_NORMALIZED',
        'normalized_name',
        'SPEND_USD',
        'similarity',
    ],
].to_csv('hpo_clustered_suppliers_dec v.1.csv', index=False)