### Parse Data

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Item number to compare
item_number_to_compare = 10090846

# Extracting the form/size of the specified item
specified_item_form_size = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == item_number_to_compare]['Form/Size'].iloc[0]

# Preparing the dataset for similarity comparison
comparison_data = data_lidocaine[['Item Number – 8 digit', 'Form/Size']].copy()

# Using TF-IDF Vectorizer to convert text data to vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(comparison_data['Form/Size'].fillna(""))

# Finding the vector for the specified item
specified_item_vector = tfidf_matrix[comparison_data['Item Number – 8 digit'] == item_number_to_compare]

# Calculating cosine similarity with other items
cosine_similarities = cosine_similarity(specified_item_vector, tfidf_matrix).flatten()

# Adding the similarity scores to the comparison data
comparison_data['Similarity'] = cosine_similarities

# Filtering items with similarity of 0.8 or more
similar_items = comparison_data[comparison_data['Similarity'] >= 0.9] # TODO: Idetify better number

similar_items = similar_items[similar_items['Item Number – 8 digit'] != item_number_to_compare]

# Displaying similar items
similar_items


Unnamed: 0,Item Number – 8 digit,Form/Size,Similarity
2632,10098876,TOPICAL CREAM (G) 4,1.0
3416,10103480,TOPICAL CREAM (G) 5,1.0
3580,10104122,HCl TOPICAL CREAM (G,0.922627
3581,10104123,HCl TOPICAL CREAM (G,0.922627
3628,10104403,TOPICAL CREAM (G) 5,1.0
3732,10104982,TOPICAL CREAM (G) 5,1.0
5235,10114467,HCl TOPICAL CREAM (G,0.922627
5256,10114680,HCl TOPICAL CREAM (G,0.922627
7412,10136471,TOPICAL CREAM (G) 4,1.0
14000,10166900,TOPICAL CREAM (G) 5,1.0


In [168]:
# Merging the similar items with the original dataset to get all the details
similar_items_details = pd.merge(similar_items, data_lidocaine, on=['Item Number – 8 digit', 'Form/Size'])

# Displaying the details of similar items
similar_items_details


Unnamed: 0,Item Number – 8 digit,Form/Size,Similarity,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,Contract Flag,WAC Price,Generic Name
0,10098876,TOPICAL CREAM (G) 4,1.0,24357070115,ANECREAM 4% 15 GM,15,1,1920,1093,1640,C,1600,lidocaine
1,10103480,TOPICAL CREAM (G) 5,1.0,496089230,RECTICARE 5% ANORECTAL CRM 30,30,1,2771,2540,3810,N,2309,lidocaine
2,10104122,HCl TOPICAL CREAM (G,0.922627,13925015901,LIDOCAINE HCL 3% CRM 28 GM,28,1,5835,4326,0,N,4685,lidocaine
3,10104123,HCl TOPICAL CREAM (G,0.922627,13925015903,LIDOCAINE HCL 3% CRM 85 GM,3,1,17505,12987,0,N,14065,lidocaine
4,10104403,TOPICAL CREAM (G) 5,1.0,24357070230,ANECREAM 5% 30 GM,30,1,4440,4070,6105,N,3700,lidocaine
5,10104982,TOPICAL CREAM (G) 5,1.0,24357070215,ANECREAM 5% 15 GM,15,1,2400,2200,3300,N,2000,lidocaine
6,10114467,HCl TOPICAL CREAM (G,0.922627,59088099707,LIDOCAINE HCL 3% CRM 85 GM PPR,85,1,12252,4726,0,C,10210,lidocaine
7,10114680,HCl TOPICAL CREAM (G,0.922627,59088099703,LIDOCAINE HCL 3% CRM 28 GM PPR,28,1,5160,1904,0,C,4300,lidocaine
8,10136471,TOPICAL CREAM (G) 4,1.0,24357070106,ANECREAM 4 % TUBE CRM 5X5 GM,5X5,1,2100,1439,2159,C,1750,lidocaine
9,10166900,TOPICAL CREAM (G) 5,1.0,69315030130,RECTASMOOTHE 5% CRM 30 GM,30,1,1922,1074,1611,C,1395,lidocaine


### Step 3
Filter based on description

In [169]:
def calculate_similarity(string1, string2):
    """
    Calculate the cosine similarity between two strings.

    Args:
    string1 (str): The first string.
    string2 (str): The second string.

    Returns:
    float: Cosine similarity between string1 and string2.
    """
    # Create a TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Vectorize the strings
    tfidf_matrix = tfidf_vectorizer.fit_transform([string1, string2])

    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0][0]

    return similarity

In [170]:
def divide_string_at_first_number(s):
    """
    Divide a string based on the appearance of the first number.
    """
    # Find the index of the first digit
    for index, char in enumerate(s):
        if char.isdigit():
            # Return the substring starting from the first digit
            return s[index:]
    
    return ""  # Return empty string if no digit is found

# Example usage
test_string = "LIDOCAINE HCL 3% CRM 85 GM PPR"
result = divide_string_at_first_number(test_string)
print(result)  # Output: 3% CRM 85 GM PPR


3% CRM 85 GM PPR


In [176]:
# Extracting the form/size of the specified item
specified_item_description = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == item_number_to_compare]['Description'].iloc[0]

specified_item_description = divide_string_at_first_number(specified_item_description)

new_data = specified_item_description

for index, row in similar_items_details.iterrows():
    s = divide_string_at_first_number(row['Description'])
    new_data[index]['Similarity'] = calculate_similarity(specified_item_description, s)
    

new_data

TypeError: 'str' object does not support item assignment