### Parse Data

In [207]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV file
file_path = 'Daily Snapshot.csv'
data = pd.read_csv(file_path)

# Removing specified columns and moving 'Item Number – 8 digit' column to the left
data.drop(columns=[
    "Item Number – 6 digit", 
    "UPC Number", 
    "Constant", 
    "Customer-Specific Item Number", 
    "Pack Size Divisor", 
    "RX/OTC Indicator"
], inplace=True)
column_to_move = data.pop("Item Number – 8 digit")
data.insert(0, "Item Number – 8 digit", column_to_move)

# Moving the price and contract flag columns to the right
columns_to_move = data[["AWP Price", "Acquisition Price", "Retail Price", "Contract Flag", "WAC Price"]]
data.drop(columns=["AWP Price", "Acquisition Price", "Retail Price", "Contract Flag", "WAC Price"], inplace=True)
data = pd.concat([data, columns_to_move], axis=1)

# Function to split the generic description based on the first uppercase letter
def split_description(desc):
    match = re.search(r'[A-Z]', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ""

# Applying the function to split the 'Generic Description' column
data['Generic Name'], data['Form/Size'] = zip(*data['Generic Description'].apply(split_description))
data.drop(columns=['Generic Description'], inplace=True)

# Creating a new dataset excluding items without a generic name
data_with_generic_name = data[data['Generic Name'].apply(lambda x: x.strip() != "")]

# Creating a new dataset with only the items with the exact generic name "lidocaine"
data_lidocaine = data_with_generic_name[data_with_generic_name['Generic Name'].str.lower() == "lidocaine"]

# Item number to compare
item_number_to_compare = 10188246

# Preparing the dataset for similarity comparison
comparison_data = data_lidocaine[['Item Number – 8 digit', 'Form/Size']].copy()

# Using TF-IDF Vectorizer to convert text data to vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(comparison_data['Form/Size'].fillna(""))

# Finding the vector for the specified item
specified_item_vector = tfidf_matrix[comparison_data['Item Number – 8 digit'] == item_number_to_compare]

# Calculating cosine similarity with other items
cosine_similarities = cosine_similarity(specified_item_vector, tfidf_matrix).flatten()

# Adding the similarity scores to the comparison data
comparison_data['Similarity'] = cosine_similarities

# Filtering items with similarity of 0.8 or more
similar_items = comparison_data[comparison_data['Similarity'] >= 0.9]
similar_items_filtered = similar_items[similar_items['Item Number – 8 digit'] != item_number_to_compare]

similar_items_filtered

Unnamed: 0,Item Number – 8 digit,Form/Size,Similarity
1917,10092805,HCl INJECTION VIAL 2,1.0
2360,10096941,HCl INJECTION VIAL 1,1.0
2832,10100527,HCl INJECTION VIAL 2,1.0
2853,10100726,HCl INJECTION VIAL 2,1.0
2886,10100872,HCl INJECTION VIAL 1,1.0
3433,10014485,HCl INJECTION VIAL 1,1.0
9524,10025368,HCl INJECTION VIAL 5,1.0
9526,10025369,HCl INJECTION VIAL 1,1.0
9529,10025371,HCl INJECTION VIAL 2,1.0
9532,10025374,HCl INJECTION VIAL 2,1.0


In [210]:
# Merging the similar items with the original dataset to get all the information for each item
full_data_similar_items = pd.merge(similar_items_filtered, data_with_generic_name, on='Item Number – 8 digit')

full_data_similar_items  # Displaying first few rows of the merged dataset



Unnamed: 0,Item Number – 8 digit,Form/Size_x,Similarity,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,Contract Flag,WAC Price,Generic Name,Form/Size_y
0,10092805,HCl INJECTION VIAL 2,1.0,63323048657,XYLOCAINE 2% MDV 25X50 ML,25X50,1,21390,16458,0,N,17825,lidocaine,HCl INJECTION VIAL 2
1,10096941,HCl INJECTION VIAL 1,1.0,63323048527,XYLOCAINE 1% MDV 25X20 ML,25X20,1,10170,4155,0,C,8475,lidocaine,HCl INJECTION VIAL 1
2,10100527,HCl INJECTION VIAL 2,1.0,63323048627,XYLOCAINE 2% MDV 25X20 ML,25X20,1,11520,3971,0,C,9600,lidocaine,HCl INJECTION VIAL 2
3,10100726,HCl INJECTION VIAL 2,1.0,63323048617,XYLOCAINE 2% MDV 25X10 ML,25X10,1,10350,4732,0,C,8625,lidocaine,HCl INJECTION VIAL 2
4,10100872,HCl INJECTION VIAL 1,1.0,63323048557,XYLOCAINE 1% VL 25X50 ML,25X50,1,17160,13204,0,N,14300,lidocaine,HCl INJECTION VIAL 1
5,10014485,HCl INJECTION VIAL 1,1.0,409427602,LIDOCAINE HCL 1% MDV 25X50 ML,25X50,1,9546,6935,0,C,7955,lidocaine,HCl INJECTION VIAL 1
6,10025368,HCl INJECTION VIAL 5,1.0,409427501,LIDOCAINE HCL 0.5% MDV 25X50 M,25X50,1,14212,10316,0,C,11843,lidocaine,HCl INJECTION VIAL 5
7,10025369,HCl INJECTION VIAL 1,1.0,409427601,LIDOCAINE HCL 1% MDV 25X20 ML,25X20,1,4757,3347,0,C,3964,lidocaine,HCl INJECTION VIAL 1
8,10025371,HCl INJECTION VIAL 2,1.0,409427701,LIDOCAINE HCL 2% MDV 25X20 ML,25X20,1,7012,4686,0,C,5843,lidocaine,HCl INJECTION VIAL 2
9,10025374,HCl INJECTION VIAL 2,1.0,409427702,LIDOCAINE HCL 2% MDV 25X50 ML,25X50,1,12570,9145,0,C,10475,lidocaine,HCl INJECTION VIAL 2


In [211]:
# Function to split the description based on the first number that appears
def split_description_by_number(desc):
    match = re.search(r'\d', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ""

# Applying the function to split the 'Description' column
full_data_similar_items['Description Only'], full_data_similar_items['Size'] = zip(*full_data_similar_items['Description'].apply(split_description_by_number))

full_data_similar_items.head()  # Displaying first few rows of the updated dataset



Unnamed: 0,Item Number – 8 digit,Form/Size_x,Similarity,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,Contract Flag,WAC Price,Generic Name,Form/Size_y,Description Only,Size
0,10092805,HCl INJECTION VIAL 2,1.0,63323048657,XYLOCAINE 2% MDV 25X50 ML,25X50,1,21390,16458,0,N,17825,lidocaine,HCl INJECTION VIAL 2,XYLOCAINE,2% MDV 25X50 ML
1,10096941,HCl INJECTION VIAL 1,1.0,63323048527,XYLOCAINE 1% MDV 25X20 ML,25X20,1,10170,4155,0,C,8475,lidocaine,HCl INJECTION VIAL 1,XYLOCAINE,1% MDV 25X20 ML
2,10100527,HCl INJECTION VIAL 2,1.0,63323048627,XYLOCAINE 2% MDV 25X20 ML,25X20,1,11520,3971,0,C,9600,lidocaine,HCl INJECTION VIAL 2,XYLOCAINE,2% MDV 25X20 ML
3,10100726,HCl INJECTION VIAL 2,1.0,63323048617,XYLOCAINE 2% MDV 25X10 ML,25X10,1,10350,4732,0,C,8625,lidocaine,HCl INJECTION VIAL 2,XYLOCAINE,2% MDV 25X10 ML
4,10100872,HCl INJECTION VIAL 1,1.0,63323048557,XYLOCAINE 1% VL 25X50 ML,25X50,1,17160,13204,0,N,14300,lidocaine,HCl INJECTION VIAL 1,XYLOCAINE,1% VL 25X50 ML


AttributeError: 'DataFrame' object has no attribute 'append'