# Active Code

In [None]:
import pandas as pd
import re

# Read the raw product list: one product name per line of the text file
file_path = 'Taha_Output.txt'
with open(file_path, 'r') as file:
    # Trim surrounding whitespace and skip lines that end up empty
    data = [name for name in (raw.strip() for raw in file) if name]

# Wrap the cleaned names in a single-column DataFrame
columns = ['Product']
df = pd.DataFrame(data, columns=columns)

# Define normalization and standardization functions
# Abbreviation -> canonical form factor.  Every abbreviation is listed exactly
# once; where the table used to list a key twice, the value that actually took
# effect at runtime (the later one) is the one kept here.
_FORM_FACTOR_RULES = {
    'inj': 'injection',
    'conc': 'concentrate',
    'impl': 'implant',
    'drops': 'solution',  # Eye/nose drops are essentially solutions
    'vial': 'injection',  # Vials are typically used for injections
    'implan': 'implant',
    'kit': 'implant kit',
    'caps': 'capsules',
    'tab': 'tablets',
    'caplet': 'tablet',  # Considering caplets and tablets equivalent for this context
    'ointment': 'cream',  # Simplification, though not strictly accurate
    'susp': 'suspension',
    'syringe': 'injection',  # Similarly, syringes imply injection
    'powd': 'powder',
    'aero': 'aerosol',
    'lozenge': 'tablet',  # Simplification, for oral solid dosage
    'shampoo': 'topical solution',  # Shampoos can be considered topical solutions
    'foam': 'topical foam',
    'gel': 'topical gel',
}


def _compile_form_factor_rule(key, value):
    """Return the whole-word, case-insensitive pattern for one rule."""
    guard = ''
    if value.lower().endswith(' ' + key.lower()):
        # The replacement is "<prefix> <key>" (e.g. 'gel' -> 'topical gel').
        # Skip occurrences already preceded by that prefix, otherwise
        # "topical gel" would turn into "topical topical gel".
        prefix = value[:-len(key)]
        guard = r'(?<!' + re.escape(prefix) + r')'
    return re.compile(guard + r'\b' + re.escape(key) + r'\b', re.IGNORECASE)


# Compiled once at import time instead of on every call.
_FORM_FACTOR_PATTERNS = [
    (_compile_form_factor_rule(key, value), value)
    for key, value in _FORM_FACTOR_RULES.items()
]


def normalize_form_factor(product: str) -> str:
    """
    Normalize form factors in product names.

    Each abbreviation in the rule table is replaced, as a whole word and
    regardless of case, by its canonical (lower-case) form factor.  Rules are
    applied one after another in table order.  A rule whose replacement ends
    with its own abbreviation is not applied where the text is already
    expanded, so normalizing an already-normalized name leaves it unchanged.

    Args:
        product: Product name to normalize.

    Returns:
        The product name with every recognised abbreviation expanded; text
        outside the matched words is left untouched.
    """
    for pattern, replacement in _FORM_FACTOR_PATTERNS:
        product = pattern.sub(replacement, product)
    return product

# Add a column holding each product name with its abbreviations expanded
df['Normalized_Product'] = df['Product'].map(normalize_form_factor)

# Take the final whitespace-separated word of each normalized name as its
# form factor (heuristic: the form factor is assumed to come last)
df['Form_Factor'] = df['Normalized_Product'].map(
    lambda name: name.rsplit(None, 1)[-1]
)

# Collect the original product names under each form factor; products that
# share a form factor are treated as potential replacements for one another
potential_replacements = {
    key: list(names)
    for key, names in df.groupby('Form_Factor')['Product']
}

# Print every form factor followed by up to 10 of its products
for form_factor, products in potential_replacements.items():
    print(
        f"Form Factor: {form_factor}",
        f"Example Products: {products[:10]}",
        "",
        sep="\n",
    )


In [None]:
import pandas as pd
import re

# Read the raw product list: one product name per line of the text file
file_path = 'Taha_Output.txt'
with open(file_path, 'r') as file:
    # Trim surrounding whitespace and skip lines that end up empty
    data = [name for name in (raw.strip() for raw in file) if name]

# Wrap the cleaned names in a single-column DataFrame
columns = ['Product']
df = pd.DataFrame(data, columns=columns)

# Define normalization and standardization functions
# Abbreviation -> canonical form factor.  Every abbreviation is listed exactly
# once; where the table used to list a key twice, the value that actually took
# effect at runtime (the later one) is the one kept here.
_FORM_FACTOR_RULES = {
    'inj': 'injection',
    'conc': 'concentrate',
    'impl': 'implant',
    'drops': 'solution',  # Eye/nose drops are essentially solutions
    'vial': 'injection',  # Vials are typically used for injections
    'implan': 'implant',
    'kit': 'implant kit',
    'caps': 'capsules',
    'tab': 'tablets',
    'caplet': 'tablet',  # Considering caplets and tablets equivalent for this context
    'ointment': 'cream',  # Simplification, though not strictly accurate
    'susp': 'suspension',
    'syringe': 'injection',  # Similarly, syringes imply injection
    'powd': 'powder',
    'aero': 'aerosol',
    'lozenge': 'tablet',  # Simplification, for oral solid dosage
    'shampoo': 'topical solution',  # Shampoos can be considered topical solutions
    'foam': 'topical foam',
    'gel': 'topical gel',
}


def _compile_form_factor_rule(key, value):
    """Return the whole-word, case-insensitive pattern for one rule."""
    guard = ''
    if value.lower().endswith(' ' + key.lower()):
        # The replacement is "<prefix> <key>" (e.g. 'gel' -> 'topical gel').
        # Skip occurrences already preceded by that prefix, otherwise
        # "topical gel" would turn into "topical topical gel".
        prefix = value[:-len(key)]
        guard = r'(?<!' + re.escape(prefix) + r')'
    return re.compile(guard + r'\b' + re.escape(key) + r'\b', re.IGNORECASE)


# Compiled once at import time instead of on every call.
_FORM_FACTOR_PATTERNS = [
    (_compile_form_factor_rule(key, value), value)
    for key, value in _FORM_FACTOR_RULES.items()
]


def normalize_form_factor(product: str) -> str:
    """
    Normalize form factors in product names.

    Each abbreviation in the rule table is replaced, as a whole word and
    regardless of case, by its canonical (lower-case) form factor.  Rules are
    applied one after another in table order.  A rule whose replacement ends
    with its own abbreviation is not applied where the text is already
    expanded, so normalizing an already-normalized name leaves it unchanged.

    Args:
        product: Product name to normalize.

    Returns:
        The product name with every recognised abbreviation expanded; text
        outside the matched words is left untouched.
    """
    for pattern, replacement in _FORM_FACTOR_PATTERNS:
        product = pattern.sub(replacement, product)
    return product

# Expand form-factor abbreviations in every product name
df['Normalized_Product'] = df['Product'].map(normalize_form_factor)

# Form factors that normalized product names are checked against
known_form_factors = [
    'injection',
    'concentrate',
    'implant',
    'drops',
    'injection vial',
    'implant kit',
    'tablet',
    'cream',
    'syrup',
    'capsule',
]

def find_form_factor(product, form_factors):
    """
    Find the form factor in a product name based on known form factors.

    Candidates are checked in the order given and matched as plain
    substrings of ``product`` (so 'tablet' also matches 'tablets').  The
    first match wins, except that a later candidate which contains the
    current match and is longer (e.g. 'injection vial' after 'injection')
    replaces it; without this, such more specific entries could never be
    returned when listed after their shorter counterpart.

    Args:
        product: Product name to search.  Matching is case-sensitive, so
            callers should normalise case beforehand if needed.
        form_factors: Iterable of candidate form-factor strings, in
            priority order.

    Returns:
        The matching form factor, or 'unknown' if no candidate occurs in
        ``product``.
    """
    best = None
    for form_factor in form_factors:
        if form_factor not in product:
            continue
        if best is None:
            best = form_factor
        elif best in form_factor and len(form_factor) > len(best):
            # More specific variant of what was already matched.
            best = form_factor
    return 'unknown' if best is None else best

# Classify each product by searching its lower-cased normalized name for a
# known form factor
df['Form_Factor'] = df['Normalized_Product'].map(
    lambda name: find_form_factor(name.lower(), known_form_factors)
)

# Collect the original product names under each form factor; products that
# share a form factor are treated as potential replacements for one another
potential_replacements = {
    key: list(names)
    for key, names in df.groupby('Form_Factor')['Product']
}

# Print every form factor followed by up to 10 of its products
for form_factor, products in potential_replacements.items():
    print(
        f"Form Factor: {form_factor}",
        f"Example Products: {products[:10]}",
        "",
        sep="\n",
    )