Customer Vendor Mapping

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist
import xgboost as xgb

In [None]:
# Input CSV locations (Colab-style absolute paths under /content).
customer_file, vendor_file = (
    "/content/customer_standard_format.csv",
    "/content/vendor_input_format.csv",
)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist
import xgboost as xgb

def load_data(customer_file, vendor_file):
    """Read the customer and vendor CSV files into DataFrames.

    Both files are decoded as ISO-8859-1.

    Args:
        customer_file: Path of the customer CSV.
        vendor_file: Path of the vendor CSV.

    Returns:
        A ``(customer_df, vendor_df)`` tuple of DataFrames, in that order.
    """
    # The customer file is read first, then the vendor file.
    customer_frame, vendor_frame = (
        pd.read_csv(csv_path, encoding="ISO-8859-1")
        for csv_path in (customer_file, vendor_file)
    )
    return customer_frame, vendor_frame

def preprocess_data(customer_df, vendor_df):
    """Normalise the text column of each frame and expose it as ``combined_text``.

    Missing values are replaced with empty strings. The customer
    'Business Definition' column and the vendor 'Business Name' column are
    lower-cased and stripped of surrounding whitespace, and the normalised
    text is also stored in a new 'combined_text' column.

    Args:
        customer_df: Customer frame; must contain 'Business Definition'.
        vendor_df: Vendor frame; must contain 'Business Name'.

    Returns:
        A ``(customer_df, vendor_df)`` tuple of new frames; the inputs are
        not modified.
    """
    customer_clean = customer_df.fillna('').copy()
    vendor_clean = vendor_df.fillna('').copy()

    text_sources = (
        (customer_clean, 'Business Definition'),
        (vendor_clean, 'Business Name'),
    )
    for frame, text_column in text_sources:
        normalised = frame[text_column].str.lower().str.strip()
        frame[text_column] = normalised
        # The matching text is currently just the single normalised column.
        frame['combined_text'] = frame[text_column]

    return customer_clean, vendor_clean

def generate_embeddings(customer_texts, vendor_texts, model_name='all-MiniLM-L6-v2'):
    """Encode customer and vendor texts with a SentenceTransformer model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call and
    used to encode the customer texts first, then the vendor texts, with
    ``convert_to_numpy=True``.

    Args:
        customer_texts: Texts describing the customer fields.
        vendor_texts: Texts describing the vendor fields.
        model_name: Model identifier handed to ``SentenceTransformer``.

    Returns:
        A ``(customer_embeddings, vendor_embeddings)`` tuple holding the two
        ``encode`` results.
    """
    encoder = SentenceTransformer(model_name)
    customer_vectors, vendor_vectors = (
        encoder.encode(batch, convert_to_numpy=True)
        for batch in (customer_texts, vendor_texts)
    )
    return customer_vectors, vendor_vectors

def extract_features(customer_df, vendor_df, customer_embeddings, vendor_embeddings, label_threshold=0.5):
    """Build one feature row and one pseudo-label per (vendor, customer) pair.

    Pairs are emitted vendor-major: all customers for the first vendor row,
    then all customers for the second vendor row, and so on. Rows are matched
    to embeddings by *position*, so the DataFrame index labels are irrelevant
    (the previous implementation used ``iterrows()`` labels as array indices,
    which breaks for any index other than 0..n-1).

    Each feature row is ``[cosine_similarity, len(vendor_text), len(customer_text)]``
    where the texts come from the 'combined_text' column of each frame.

    NOTE(review): the labels are derived from the similarity feature itself
    (``similarity >= label_threshold``), so they are pseudo-labels rather than
    ground truth; a model trained on them can at best reproduce this cut-off.

    Args:
        customer_df: Customer frame with a 'combined_text' column.
        vendor_df: Vendor frame with a 'combined_text' column.
        customer_embeddings: 2-D array, one row per customer row.
        vendor_embeddings: 2-D array, one row per vendor row.
        label_threshold: Similarity at or above which a pair is labelled 1.

    Returns:
        ``(features, labels, similarity_matrix)`` where ``features`` has shape
        ``(n_vendors * n_customers, 3)``, ``labels`` has one 0/1 entry per
        feature row and ``similarity_matrix`` has shape
        ``(n_vendors, n_customers)``.

    Raises:
        ValueError: If the number of embedding rows does not match the number
            of rows in the corresponding frame.
    """
    similarity_matrix = 1 - cdist(vendor_embeddings, customer_embeddings, metric='cosine')

    n_vendors, n_customers = similarity_matrix.shape
    if (n_vendors, n_customers) != (len(vendor_df), len(customer_df)):
        raise ValueError(
            "Embedding rows do not match frame rows: similarity matrix is "
            f"{similarity_matrix.shape}, frames are ({len(vendor_df)}, {len(customer_df)})"
        )

    vendor_lengths = np.array([len(text) for text in vendor_df['combined_text']], dtype=float)
    customer_lengths = np.array([len(text) for text in customer_df['combined_text']], dtype=float)

    # ravel() walks the matrix row by row (vendor-major); repeat/tile line the
    # length features up with that same pair ordering.
    features = np.column_stack([
        similarity_matrix.ravel(),
        np.repeat(vendor_lengths, n_customers),
        np.tile(customer_lengths, n_vendors),
    ])
    labels = (features[:, 0] >= label_threshold).astype(int)

    return features, labels, similarity_matrix

def train_xgboost_model(X_train, y_train):
    """Fit a binary XGBoost classifier on the pair features.

    The classifier uses the 'binary:logistic' objective, 100 estimators and
    log-loss as the evaluation metric.

    ``use_label_encoder`` is intentionally not passed: it is a deprecated
    argument, and the installed XGBoost reports it as an unused parameter
    ('Parameters: { "use_label_encoder" } are not used.') on every fit.

    Args:
        X_train: Feature matrix, one row per training pair.
        y_train: 0/1 label for each row of ``X_train``.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=100,
        eval_metric='logloss',
    )
    model.fit(X_train, y_train)
    return model

def compute_contextual_mappings(customer_df, vendor_df, customer_embeddings, vendor_embeddings, similarity_threshold=0.5):
    """Map every vendor row to the customer row with the highest model score.

    Pair features are built with ``extract_features``, a classifier is trained
    with ``train_xgboost_model`` and every (vendor, customer) pair is scored
    with ``predict_proba``. For each vendor row the customer row with the
    highest positive-class probability is selected (first one wins on ties);
    if no probability is greater than 0 the vendor row is left unmapped.

    Changes compared with the previous implementation:
      * ``similarity_threshold`` used to be accepted but ignored. It now sets
        the similarity cut-off for the training labels; the default of 0.5
        yields the same labels as before.
      * Rows are addressed by position, so frames whose index is not 0..n-1
        are handled correctly.
      * All pairs are scored in a single ``predict_proba`` call instead of one
        call per pair.
      * Empty inputs return unmapped rows instead of failing during training.

    NOTE(review): the labels are thresholded similarity scores, i.e. a function
    of the first feature, so the scores mostly reflect that cut-off.

    Args:
        customer_df: Customer frame with 'combined_text', 'Field Name' and
            'Business Definition' columns.
        vendor_df: Vendor frame with a 'combined_text' column.
        customer_embeddings: One embedding row per customer row.
        vendor_embeddings: One embedding row per vendor row.
        similarity_threshold: Similarity at or above which a pair is labelled
            as a match for training.

    Returns:
        A DataFrame with one row per vendor row: the vendor columns plus
        'Mapped Customer Field', 'Mapped Customer Business Definition' and
        'Prediction Score'.

    Raises:
        ValueError: If the feature matrix does not contain exactly one row per
            (vendor, customer) pair.
    """
    n_vendors, n_customers = len(vendor_df), len(customer_df)

    if n_vendors == 0 or n_customers == 0:
        # No pairs to train on or score: report every vendor row as unmapped.
        return pd.DataFrame([
            {
                **vendor_row.to_dict(),
                'Mapped Customer Field': None,
                'Mapped Customer Business Definition': None,
                'Prediction Score': 0,
            }
            for _, vendor_row in vendor_df.iterrows()
        ])

    X_train, _, _ = extract_features(customer_df, vendor_df, customer_embeddings, vendor_embeddings)
    X_train = np.asarray(X_train)
    if X_train.ndim != 2 or X_train.shape[0] != n_vendors * n_customers:
        raise ValueError(
            f"Expected {n_vendors * n_customers} feature rows, got array of shape {X_train.shape}"
        )

    # Column 0 is the cosine similarity; label here so the caller's threshold
    # is honoured regardless of the cut-off used inside extract_features.
    y_train = (X_train[:, 0] >= similarity_threshold).astype(int)
    model = train_xgboost_model(X_train, y_train)

    # Feature rows are vendor-major, so the scores reshape to (vendor, customer).
    pair_scores = model.predict_proba(X_train)[:, 1].reshape(n_vendors, n_customers)

    mapping_data = []
    for vendor_pos, (_, vendor_row) in enumerate(vendor_df.iterrows()):
        vendor_scores = pair_scores[vendor_pos]
        best_pos = int(np.argmax(vendor_scores))  # first maximum wins on ties
        best_score = vendor_scores[best_pos]

        if best_score > 0:
            matched_customer = customer_df.iloc[best_pos]
            mapped_field = matched_customer['Field Name']
            mapped_definition = matched_customer['Business Definition']
        else:
            best_score = 0
            mapped_field = None
            mapped_definition = None

        mapping_data.append({
            **vendor_row.to_dict(),
            'Mapped Customer Field': mapped_field,
            'Mapped Customer Business Definition': mapped_definition,
            'Prediction Score': best_score,
        })

    return pd.DataFrame(mapping_data)

def automate_customer_mapping(customer_file, vendor_file, output_file='automated_mapped_fields.csv', similarity_threshold=0.5):
    """Run the whole vendor-to-customer mapping pipeline and save the result.

    Steps: load both CSV files, preprocess them, embed their 'combined_text'
    columns, compute the mappings, write them to ``output_file`` without the
    index, and print a completion message.

    Args:
        customer_file: Path of the customer CSV.
        vendor_file: Path of the vendor CSV.
        output_file: Destination CSV path for the mapping table.
        similarity_threshold: Forwarded unchanged to
            ``compute_contextual_mappings``.

    Returns:
        The mapping DataFrame that was written to ``output_file``.
    """
    raw_customers, raw_vendors = load_data(customer_file, vendor_file)
    customers, vendors = preprocess_data(raw_customers, raw_vendors)

    customer_texts = customers['combined_text'].tolist()
    vendor_texts = vendors['combined_text'].tolist()
    customer_vectors, vendor_vectors = generate_embeddings(customer_texts, vendor_texts)

    mappings = compute_contextual_mappings(
        customers,
        vendors,
        customer_vectors,
        vendor_vectors,
        similarity_threshold=similarity_threshold,
    )

    mappings.to_csv(output_file, index=False)
    print(f"Mapping complete. Results saved as '{output_file}'")
    return mappings


In [None]:
# Run the full pipeline on the two input CSVs with a 0.6 similarity threshold.
customer_file_path, vendor_file_path = (
    "/content/customer_standard_format.csv",
    "/content/vendor_input_format.csv",
)

mapped_results = automate_customer_mapping(
    customer_file_path,
    vendor_file_path,
    similarity_threshold=0.6,
)



Parameters: { "use_label_encoder" } are not used.



Mapping complete. Results saved as 'automated_mapped_fields.csv'


In [None]:
# Show the mapping table returned by automate_customer_mapping as the cell output.
mapped_results

Unnamed: 0,Field Name,Business Name,Usage,Data Type,Min,Max,Valid Values,combined_text,Mapped Customer Field,Mapped Customer Business Definition,Prediction Score
0,Record ID,record id,R,CHAR,3,3,CLM,record id,RECORDID,the number assigned to the record for identifi...,0.994999
1,Record Number,record number,R,INT,1,10,,record number,RECORDID,the number assigned to the record for identifi...,0.994309
2,Claim ID,claim id,R,VARCHAR,1,50,,claim id,GROUPID,the identifier assigned to the third level of ...,0.999447
3,Original_Claim,original_claim,O,VARCHAR,1,50,,original_claim,PAMCCDE,the type of prior authorization associated wit...,0.99234
4,Group_ID,group_id,R,VARCHAR,1,50,,group_id,CLTPTPST,the code that identifies the final member pay ...,0.00016
5,PCN,pcn,O,VARCHAR,1,10,,pcn,CLTPTPST,the code that identifies the final member pay ...,0.00016
6,Employee_ID,employee_id,O,VARCHAR,1,50,,employee_id,CLTPTPST,the code that identifies the final member pay ...,8.9e-05
7,SSN,ssn,R,CHAR,9,9,,ssn,CLTPTPST,the code that identifies the final member pay ...,0.00016
8,Patient_ID,patient_id,R,VARCHAR,1,30,,patient_id,CARDHOLDER,the identification number assigned to the pers...,0.825512
9,Patient_Last_Name,patient_last_name,R,VARCHAR,1,50,,patient_last_name,PPRSLSTNME,the last name of the member's primary medical ...,0.310133
