In [2]:
# Colab-only helper: mounts Google Drive inside the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Work from the experiment folder on the mounted Drive so the relative CSV paths
# used by the loading cell resolve; `!pwd` echoes the directory as a sanity check.
%cd '/content/drive/MyDrive/Colab Notebooks/Experiment'
!pwd

/content/drive/MyDrive/Colab Notebooks/Experiment
/content/drive/MyDrive/Colab Notebooks/Experiment


In [None]:
!pip install datasketch

Collecting datasketch
  Using cached datasketch-1.6.5-py3-none-any.whl (89 kB)
Installing collected packages: datasketch
Successfully installed datasketch-1.6.5


In [None]:
import pandas as pd

# Load the datasets (paths are relative to the current working directory)
inventory = pd.read_csv('inventory.csv')
# 'id' is used as the index of product_master instead of staying a regular column
product_master = pd.read_csv('product.csv', index_col="id")
knowledge_base = pd.read_csv('knowledge.csv')


In [None]:
import re
def replace_special_characters(text):
    """Normalize a raw product name for token-based matching.

    Steps, in order:
      1. Brackets, quotes, backticks, ``-``, ``*``, ``^``, ``<``, ``>`` and ``;``
         are replaced by spaces.
      2. Periods are replaced by spaces unless they sit between two digits
         (i.e. decimal points are kept).
      3. ``<number>x<number>`` and ``<number>/<number>`` are padded with spaces
         and kept together as a single token.
      4. A space is inserted wherever a number touches a letter.
      5. Whitespace runs collapse to one space and the ends are stripped.

    No printable placeholder text is used internally, so names that literally
    contain ``DOT``, ``@`` or ``#`` pass through unchanged.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Temporary stand-in for the 'x' of a dimension such as 10x20, so the
    # number/letter splitting below does not break it apart. A control character
    # is used because, unlike a printable marker, it is not expected in product text.
    x_marker = '\x00'

    # Standard special characters become spaces
    cleaned_text = re.sub(r"[\[\]\(\)\{\}`\-\*\^\"\'\<\>\;]", ' ', text)

    # A period survives only when it has a digit on both sides
    cleaned_text = re.sub(r'(?<!\d)\.|\.(?!\d)', ' ', cleaned_text)

    # Pad number-x-number and number/number with spaces, keeping each as one token
    cleaned_text = re.sub(
        r'(\d+(?:\.\d+)?)x(\d+(?:\.\d+)?)',
        lambda match: ' ' + match.group(1) + x_marker + match.group(2) + ' ',
        cleaned_text,
    )
    cleaned_text = re.sub(r'(\d+(?:\.\d+)?)/(\d+(?:\.\d+)?)', r' \1/\2 ', cleaned_text)

    # Add space between numbers (including decimals) and immediately following letters
    cleaned_text = re.sub(r'(\d+(?:\.\d+)?)([A-Za-z])', r' \1 \2', cleaned_text)

    # Add space between letters and immediately following numbers (including decimals)
    cleaned_text = re.sub(r'([A-Za-z])(\d+(?:\.\d+)?)', r'\1 \2', cleaned_text)

    # Put the protected 'x' back
    cleaned_text = cleaned_text.replace(x_marker, 'x')

    # Replace multiple spaces with a single space, then trim both ends
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()


def replace_product_code(code, product):
    """Strip one fragment of a hyphenated product code out of ``product``.

    ``code`` is split on ``-`` and the fragments are tried from longest to
    shortest (ties keep their original order). The first fragment found in
    ``product`` has all of its occurrences removed; the remaining fragments are
    not tried. If no fragment occurs, ``product`` is returned unchanged.
    """
    fragments = sorted(code.split('-'), key=len, reverse=True)
    first_hit = next((fragment for fragment in fragments if fragment in product), None)
    return product if first_hit is None else product.replace(first_hit, '')

def has_bracket(word):
    """Return True when ``word`` contains a round, square, curly or angle bracket."""
    return re.search(r'[()\[\]{}<>]', word) is not None


In [None]:
# Name the raw knowledge-base columns. The assignment is positional, so the CSV
# must contain exactly these eight columns in this order.
knowledge_base.columns = ['inv_raw_pro_name', 'inv_mrp', 'inv_ptr', 'inv_div', 'inv_packing', 'x_pro_name', 'master_pro_name', 'master_id']
knowledge_base['inv_cleaned_pro_name'] = knowledge_base['inv_raw_pro_name'].apply(replace_special_characters)
# Left join keeps every knowledge-base row, matching master_id against product_master's 'id'
# (product_master is loaded with index_col="id", so 'id' refers to its index here).
knowledge_base = pd.merge(knowledge_base, product_master, left_on='master_id', right_on='id', how='left')
# NOTE: a former `knowledge_base.drop("master_pro_name", axis="columns")` call was removed.
# Its result was never assigned, so it dropped nothing; 'master_pro_name' is still present
# and is still listed in the positional rename below.
knowledge_base.columns = ['inv_raw_pro_name', 'inv_mrp', 'inv_ptr', 'inv_div', 'inv_packing', 'x_pro_name', 'master_pro_name', 'master_id', 'inv_cleaned_pro_name', 'master_name', 'master_manufacturer', 'master_mrp', 'master_pack']

In [None]:
import re
def combine_fields(df):
    """Join the non-missing, non-empty values of every row into one space-separated string.

    Each value is converted with ``str`` before joining; missing values and
    empty strings are skipped.
    """
    def _join_row(row):
        # Drop missing values first, then anything that stringifies to ''.
        as_text = row.dropna().astype(str)
        non_empty = as_text.replace('', float('nan')).dropna()
        return ' '.join(non_empty)

    return df.apply(_join_row, axis=1)

def remove_extra_space(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text).strip()

# exp_combo1: cleaned inventory name + MRP + packing + master name, joined with
# single spaces; exp_combo1_clean runs the same text through the name cleaner.
knowledge_base['exp_combo1'] = (
    combine_fields(
        knowledge_base[['inv_cleaned_pro_name', 'inv_mrp', 'inv_packing', 'master_name']]
    )
    .apply(remove_extra_space)
)
knowledge_base['exp_combo1_clean'] = knowledge_base['exp_combo1'].apply(replace_special_characters)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from datasketch import MinHash, MinHashLSH

# Working frame for the LSH experiment: the cleaned combo text and the product id
# it is labelled with, as an independent copy of the knowledge-base columns.
df = pd.DataFrame({
    'combo': knowledge_base['exp_combo1_clean'],
    'id': knowledge_base['master_id'],
})

In [None]:
# Function to generate n-grams
# One analyzer per n-gram size, built on first use and then reused.
_NGRAM_ANALYZERS = {}


def generate_ngrams(text, n):
    """Return the word n-grams of ``text`` as a list of strings.

    Tokenization and normalization are whatever ``CountVectorizer`` does with its
    default settings (only ``ngram_range`` is overridden to ``(n, n)``).

    The analyzer for each ``n`` is cached: building a ``CountVectorizer`` on every
    call is wasted work when the function is applied to every row of a large frame.

    Args:
        text: The string to split into n-grams.
        n: Number of words per n-gram.

    Returns:
        The list of n-grams produced by the analyzer.
    """
    analyzer = _NGRAM_ANALYZERS.get(n)
    if analyzer is None:
        analyzer = CountVectorizer(ngram_range=(n, n)).build_analyzer()
        _NGRAM_ANALYZERS[n] = analyzer
    return analyzer(text)

# Word 3-, 4- and 5-grams of the combo text, one list per row
df['trigrams'] = df['combo'].apply(generate_ngrams, n=3)
df['fourgrams'] = df['combo'].apply(generate_ngrams, n=4)
df['fivegrams'] = df['combo'].apply(generate_ngrams, n=5)

# Function to create MinHash object from n-grams
def create_minhash(ngrams, num_perm=256):
    """Build a MinHash signature from an iterable of n-gram strings.

    Args:
        ngrams: Iterable of strings; each one is UTF-8 encoded and fed to the
            signature. An empty iterable yields a signature that was never updated.
        num_perm: Number of permutations for the signature. Defaults to 256, the
            value the LSH indexes in this notebook are created with; it should
            match the ``num_perm`` of any index the signature is used with.

    Returns:
        The populated ``MinHash`` object.
    """
    signature = MinHash(num_perm=num_perm)
    for gram in ngrams:
        signature.update(gram.encode('utf8'))
    return signature

# One MinHash signature column per n-gram size
df['minhash_tri'], df['minhash_quad'], df['minhash_penta'] = (
    df[ngram_column].apply(create_minhash)
    for ngram_column in ('trigrams', 'fourgrams', 'fivegrams')
)

# One LSH index per n-gram size, all with the same threshold and permutation count
lsh_tri, lsh_quad, lsh_penta = (
    MinHashLSH(threshold=0.5, num_perm=256) for _ in range(3)
)

# Each signature is inserted under its row position (0, 1, 2, ...)
for lsh, column in (
    (lsh_tri, 'minhash_tri'),
    (lsh_quad, 'minhash_quad'),
    (lsh_penta, 'minhash_penta'),
):
    for position, signature in enumerate(df[column]):
        lsh.insert(position, signature)

# Querying the LSH index
def query_lsh(lsh, minhash):
    """Return whatever ``lsh.query`` reports for the signature ``minhash``."""
    candidates = lsh.query(minhash)
    return candidates

# For every row, the keys each index returns when queried with that row's own signature
df['lsh_buckets_tri'] = df['minhash_tri'].apply(lsh_tri.query)
df['lsh_buckets_quad'] = df['minhash_quad'].apply(lsh_quad.query)
df['lsh_buckets_penta'] = df['minhash_penta'].apply(lsh_penta.query)

df

Unnamed: 0,combo,id,trigrams,fourgrams,fivegrams,minhash_tri,minhash_quad,minhash_penta,lsh_buckets_tri,lsh_buckets_quad,lsh_buckets_penta
0,A B PHYLLINE SYP 178.0 100 ML AB Phylline Syrup,475,"[phylline syp 178, syp 178 100, 178 100 ml, 10...","[phylline syp 178 100, syp 178 100 ml, 178 100...","[phylline syp 178 100 ml, syp 178 100 ml ab, 1...",<datasketch.minhash.MinHash object at 0x7d2083...,<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,"[0, 2, 3, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, ...","[0, 2, 3, 6, 7, 8, 9, 10, 43, 11, 45, 13, 15, ...","[0, 2, 3, 6, 7, 8, 9, 10, 11, 43, 13, 45, 15, ..."
1,AB PHYLLINE SYP 0.0 100 ML AB Phylline Syrup,475,"[ab phylline syp, phylline syp 100, syp 100 ml...","[ab phylline syp 100, phylline syp 100 ml, syp...","[ab phylline syp 100 ml, phylline syp 100 ml a...",<datasketch.minhash.MinHash object at 0x7d2083...,<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,"[1, 12, 21, 14]","[1, 12, 14]","[1, 12, 14]"
2,AB PHYLLINE SYP 178.0 100 ML AB Phylline Syrup,475,"[ab phylline syp, phylline syp 178, syp 178 10...","[ab phylline syp 178, phylline syp 178 100, sy...","[ab phylline syp 178 100, phylline syp 178 100...",<datasketch.minhash.MinHash object at 0x7d2083...,<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,"[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16...","[0, 2, 3, 6, 7, 8, 9, 10, 11, 43, 13, 45, 15, ...","[0, 2, 3, 6, 7, 8, 9, 10, 11, 43, 13, 45, 15, ..."
3,AB PHYLLINE SYP 178.0 100 ML AB Phylline Syrup,475,"[ab phylline syp, phylline syp 178, syp 178 10...","[ab phylline syp 178, phylline syp 178 100, sy...","[ab phylline syp 178 100, phylline syp 178 100...",<datasketch.minhash.MinHash object at 0x7d2083...,<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,"[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16...","[0, 2, 3, 6, 7, 8, 9, 10, 11, 43, 13, 45, 15, ...","[0, 2, 3, 6, 7, 8, 9, 10, 11, 43, 13, 45, 15, ..."
4,AB PHYLLINE SYP 195.0 100 ML AB Phylline Syrup,475,"[ab phylline syp, phylline syp 195, syp 195 10...","[ab phylline syp 195, phylline syp 195 100, sy...","[ab phylline syp 195 100, phylline syp 195 100...",<datasketch.minhash.MinHash object at 0x7d2083...,<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 44, 13, 46, 1...","[4, 5, 44, 23]","[4, 5, 23]"
...,...,...,...,...,...,...,...,...,...,...,...
107111,ZYRTEC TABS 31.5 10 S Zyrtec Tablet,257402,"[zyrtec tabs 31, tabs 31 10, 31 10 zyrtec, 10 ...","[zyrtec tabs 31 10, tabs 31 10 zyrtec, 31 10 z...","[zyrtec tabs 31 10 zyrtec, tabs 31 10 zyrtec t...",<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,<datasketch.minhash.MinHash object at 0x7d206f...,"[107104, 107107, 107108, 107110, 107111, 10710...","[107104, 107107, 107108, 107110, 107111, 10710...","[107104, 107107, 107108, 107110, 107111, 10710..."
107112,ZYRTEC TABS 10 TABS 31.5 Zyrtec Tablet,257402,"[zyrtec tabs 10, tabs 10 tabs, 10 tabs 31, tab...","[zyrtec tabs 10 tabs, tabs 10 tabs 31, 10 tabs...","[zyrtec tabs 10 tabs 31, tabs 10 tabs 31 zyrte...",<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,<datasketch.minhash.MinHash object at 0x7d206f...,[107112],[107112],[107112]
107113,ZYRTEC TABS 10 S 31.5 Zyrtec Tablet,257402,"[zyrtec tabs 10, tabs 10 31, 10 31 zyrtec, 31 ...","[zyrtec tabs 10 31, tabs 10 31 zyrtec, 10 31 z...","[zyrtec tabs 10 31 zyrtec, tabs 10 31 zyrtec t...",<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,<datasketch.minhash.MinHash object at 0x7d206f...,[107113],[107113],[107113]
107114,ZYRTEC TABS 10 MG 31.5 15 Zyrtec Tablet,257402,"[zyrtec tabs 10, tabs 10 mg, 10 mg 31, mg 31 1...","[zyrtec tabs 10 mg, tabs 10 mg 31, 10 mg 31 15...","[zyrtec tabs 10 mg 31, tabs 10 mg 31 15, 10 mg...",<datasketch.minhash.MinHash object at 0x7d2075...,<datasketch.minhash.MinHash object at 0x7d2072...,<datasketch.minhash.MinHash object at 0x7d206f...,[107114],[107114],[107114]


In [None]:
# Map every product id to the list of row labels that carry it, in row order
group_map = {}

for index, group_id in df['id'].items():
    group_map.setdefault(group_id, []).append(index)

def calculate_accuracy(predicted, actual):
    """Fraction of the distinct ``actual`` items that also appear in ``predicted``.

    Duplicates in either argument are ignored. Items in ``predicted`` that are
    not in ``actual`` do not lower the score.

    Args:
        predicted: Iterable of predicted items.
        actual: Iterable of expected items.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``actual`` is empty instead
        of raising ``ZeroDivisionError``.
    """
    expected = set(actual)
    if not expected:
        return 0.0
    return len(expected.intersection(predicted)) / len(expected)

# For each row, look up the rows that share its product id ...
members_per_row = [group_map[group_id] for group_id in df['id']]

# ... and score each LSH result against that group
accuracy_tri = [
    calculate_accuracy(bucket, members)
    for bucket, members in zip(df['lsh_buckets_tri'], members_per_row)
]
accuracy_quad = [
    calculate_accuracy(bucket, members)
    for bucket, members in zip(df['lsh_buckets_quad'], members_per_row)
]
accuracy_penta = [
    calculate_accuracy(bucket, members)
    for bucket, members in zip(df['lsh_buckets_penta'], members_per_row)
]

# Add accuracies as new columns in the DataFrame
df['accuracy_tri'] = accuracy_tri
df['accuracy_quad'] = accuracy_quad
df['accuracy_penta'] = accuracy_penta

In [None]:
# Per-product descriptive statistics of each accuracy column
accuracy_by_product = df.groupby('id')
summary_stats_tri = accuracy_by_product['accuracy_tri'].describe()
summary_stats_quad = accuracy_by_product['accuracy_quad'].describe()
summary_stats_penta = accuracy_by_product['accuracy_penta'].describe()

In [None]:
# Display the per-product describe() table for the tri-gram accuracy column.
summary_stats_tri

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,109.0,0.218921,0.121983,0.009174,0.110092,0.275229,0.339450,0.348624
19,111.0,0.071991,0.045736,0.009009,0.045045,0.063063,0.126126,0.144144
20,73.0,0.161569,0.099935,0.013699,0.082192,0.136986,0.287671,0.315068
22,51.0,0.394848,0.228160,0.019608,0.176471,0.549020,0.588235,0.627451
23,47.0,0.143504,0.113062,0.021277,0.063830,0.085106,0.276596,0.319149
...,...,...,...,...,...,...,...,...
257402,48.0,0.118924,0.078888,0.020833,0.041667,0.125000,0.166667,0.229167
257417,35.0,0.337143,0.187842,0.028571,0.185714,0.400000,0.514286,0.514286
257513,52.0,0.346893,0.172172,0.038462,0.250000,0.365385,0.480769,0.711538
257741,51.0,0.076509,0.044584,0.019608,0.039216,0.078431,0.098039,0.176471
