<a href="https://colab.research.google.com/github/EvgeniaKantor/DI-Bootcamp/blob/main/Week12/Product_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import google.generativeai as genai
from google.colab import userdata
import os

# Load the combined product dataset; the rest of this cell reads its
# 'product_title' column.
df = pd.read_excel('df_combined.xlsx')

# Read the Gemini API key through google.colab's userdata helper so the key
# is never written into the notebook itself.
GOOGLE_API_KEY = userdata.get('gemini_key')

# Configure the generative AI client
genai.configure(api_key=GOOGLE_API_KEY)

# Also expose the API key through an environment variable
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# Load the generative model
model = genai.GenerativeModel('gemini-pro')

# Maximum number of titles sent to the model in a single request
chunk_size = 10

# Slice the 'product_title' column positionally into consecutive chunks.
# Every chunk is a pandas Series of at most chunk_size titles (only the last
# one can be shorter), and an empty frame yields no chunks at all, so no
# request is ever built for zero titles.
title_chunks = [
    df['product_title'].iloc[start:start + chunk_size]
    for start in range(0, len(df), chunk_size)
]

# Closed vocabulary of product types the model is asked to choose from.
# Entries are in alphabetical order, laid out one product family per line.
product_types_list = [
    "Acne Patches", "Acne Treatment",
    "After Shave",
    "Anti Chafe",
    "Anti-aging Cream", "Anti-aging Mask", "Anti-aging Serum",
    "Beauty Tools",
    "Body Cream", "Body Lotion", "Body Powder", "Body Scrub", "Body Soap", "Body Wash",
    "Collagen Powder",
    "Exfoliating Mask",
    "Eye Cream", "Eye Gel", "Eye Mask", "Eye Patches",
    "Face Cleanser", "Face Cream", "Face Exfoliator", "Face Foam", "Face Mask",
    "Face Mist", "Face Moisturizer", "Face Oil", "Face Patches", "Face Scrub",
    "Face Serum", "Face Soap", "Face Sunscreen", "Face Toner", "Face Wash", "Face Wipes",
    "Hair Care", "Hair Removal",
    "Hand Cream", "Hand Soap",
    "Head Band",
    "Kids Care",
    "Lip Balm", "Lip Mask", "Lip Oil",
    "Makeup Remover",
    "Neck Cream",
    "Sheet Mask",
    "Skincare Set",
    "Sunscreen Cream", "Sunscreen Spray",
    "Tanning Lotion",
]

# Accumulator for the per-chunk result frames parsed from the model replies
dfs = list()

# Function to clean and standardize product titles
def clean_title(title):
    """Normalize a product title so it can serve as a stable merge key.

    The title is lower-cased, the HTML entity ``&amp;`` is decoded to ``&``,
    commas and pipe characters are removed, and whitespace is collapsed and
    trimmed *after* those removals. Trimming last matters: removing a
    trailing ``,`` or ``|`` would otherwise leave a stray space, and cleaning
    the same title a second time (as happens to titles echoed back by the
    model) would then produce a different key than cleaning it once.

    Args:
        title: Raw title. ``None`` and float NaN are treated as an empty
            title; any other non-string value is converted with ``str()``.

    Returns:
        The normalized title as a ``str``.
    """
    # Missing values (None / NaN) carry no text to clean.
    if title is None or (isinstance(title, float) and title != title):
        return ''
    text = str(title).lower().replace('&amp;', '&').replace(',', '').replace('|', '')
    # Removing ',' and '|' can leave doubled or trailing spaces behind.
    return ' '.join(text.split())

# Parse the Markdown pipe table in a model reply into a two-column DataFrame.
def _parse_response_table(response_content):
    """Extract product titles and types from a Markdown table in a model reply.

    The header row is located by content (the first pipe-delimited line that
    names both 'product_title' and 'product_type'), so text or code fences
    before the table do not break parsing. Separator rows, rows cut off
    before the type cell, and rows with an empty title or type are skipped.

    Args:
        response_content: Text returned by the model.

    Returns:
        A DataFrame with columns 'product_title' and 'product_type' whose
        cell values are stripped of surrounding whitespace, or None when no
        table header naming both columns is present.
    """
    column_positions = None
    records = []
    for raw_line in response_content.split('\n'):
        line = raw_line.strip()
        if not line.startswith('|'):
            continue
        cells = [cell.strip() for cell in line.strip('|').split('|')]

        if column_positions is None:
            names = [cell.replace('&amp;', '&') for cell in cells]
            if 'product_title' in names and 'product_type' in names:
                column_positions = (names.index('product_title'), names.index('product_type'))
            continue

        # Separator row such as "| - | - |" or "|:---|---:|"
        if set(''.join(cells)) <= set('-: '):
            continue
        # A reply that was cut off mid-row has too few cells to be usable
        if len(cells) <= max(column_positions):
            continue

        title = cells[column_positions[0]]
        product_type = cells[column_positions[1]]
        if title and product_type:
            records.append({'product_title': title, 'product_type': product_type})

    if column_positions is None:
        return None
    return pd.DataFrame(records, columns=['product_title', 'product_type'])


# Iterate over title chunks and collect one parsed DataFrame per chunk
for i, titles_chunk in enumerate(title_chunks):
    # Nothing to ask the model about
    if len(titles_chunk) == 0:
        continue

    # Clean and standardize titles
    cleaned_titles = titles_chunk.apply(clean_title)

    # Construct the query for the chunk
    query = "From the titles:\n"
    query += ''.join(f"{title}\n" for title in cleaned_titles)
    query += "\nextract product type from each title, use only one of the following types:\n"
    query += ', '.join(product_types_list)
    query += ". The result must be in a dataframe with two columns: 'product_title' and 'product_type'."

    # Generate content using Gemini
    response = model.generate_content(query)

    # Extract the text content from the response. A malformed reply only
    # costs this chunk: aborting here would throw away every chunk already
    # parsed, and titles without a type are picked up by the later fill step.
    try:
        response_content = response.candidates[0].content.parts[0].text
    except (IndexError, AttributeError):
        print(f"Unexpected response format from the generative model for chunk {i+1}. Skipping.")
        continue

    # Print the response
    print(f"Response for Chunk {i+1}:\n{response_content}\n")

    # Only replies containing a pipe table with both columns can be parsed
    response_df = _parse_response_table(response_content)
    if response_df is None:
        print(f"No 'product_title' / 'product_type' table found in chunk {i+1}. Skipping.")
        continue

    # Print the content of response_df
    print(f"Content of response_df:\n{response_df}\n")

    if response_df.empty:
        print("DataFrame is empty. Skipping.")
        continue

    # Re-clean the echoed titles so they use the same key format as df
    response_df['product_title'] = response_df['product_title'].apply(clean_title)

    # Append the DataFrame to the list
    dfs.append(response_df)

# Concatenate DataFrames; pd.concat raises on an empty list, so fall back to
# an empty frame with the expected columns when no chunk could be parsed.
if dfs:
    final_df_new = pd.concat(dfs, ignore_index=True)
else:
    final_df_new = pd.DataFrame(columns=['product_title', 'product_type'])


Response for Chunk 1:
```
                                product_title product_type
0  under eye patches - 60 pcs - 24k gold eye mask- puffy eyes & ...  Eye Patches
1  pimple patches for face hydrocolloid acne patches cute star ...  Acne Patches
2                                 sleeping lip mask nourish & ...    Lip Mask
3  glow recipe pha + bha face toner - skin care toner pore mini...  Face Toner
4  under eye patches (30 pairs) gold eye mask and hyaluronic a...  Eye Patches
5  bum cream moisturizing body cream body skin repair bum crea...  Body Cream
6  good molecules hyaluronic acid serum - hydrating non-greasy f...  Face Serum
7  bubble skincare slam dunk face moisturizer - hydrating face...  Face Moisturizer
8  good molecules yerba mate wake up eye gel - yerba mate hyal...  Eye Gel
9  eos shea better body lotion- coconut waters 24-hour moistur...  Body Lotion
```

Response for Chunk 2:
| product_title | product_type |
| - | - |
| tree hut vitamin c shea sugar scrub 18 oz ultra h

In [None]:
# Clean and standardize the 'product_title' column in df so it uses the same
# key format as the titles stored in final_df_new
df['product_title'] = df['product_title'].apply(clean_title)

# Keep a single product_type per title. Without this, a title that appears
# more than once on both sides would be paired with every one of its
# duplicates and the merge would add rows to df.
title_to_type = final_df_new[['product_title', 'product_type']].drop_duplicates(subset='product_title')

# Left-merge so every row of df is kept; titles the model did not return get
# NaN. If df already has a 'product_type' column, pandas suffixes the two
# columns as 'product_type_x' (from df) and 'product_type_y' (from the model).
df = df.merge(title_to_type, on='product_title', how='left', validate='many_to_one')

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_title               369 non-null    object 
 1   sales_volume_in_past_month  369 non-null    int64  
 2   product_star_rating         369 non-null    float64
 3   product_num_ratings         369 non-null    int64  
 4   product_price               369 non-null    float64
 5   item_form                   369 non-null    object 
 6   special_item_form           369 non-null    object 
 7   product_type_x              1 non-null      object 
 8   product_type_y              251 non-null    object 
dtypes: float64(2), int64(2), object(5)
memory usage: 26.1+ KB


In [None]:
# Keep only the merged-in type column: discard 'product_type_x' and give
# 'product_type_y' its final name, in one chained expression.
df = (
    df
    .drop(columns=['product_type_x'])
    .rename(columns={'product_type_y': 'product_type'})
)

# Summarize the resulting columns and their non-null counts
df.info()

# Persist this intermediate result
df.to_excel('df_with_product_type.xlsx', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_title               369 non-null    object 
 1   sales_volume_in_past_month  369 non-null    int64  
 2   product_star_rating         369 non-null    float64
 3   product_num_ratings         369 non-null    int64  
 4   product_price               369 non-null    float64
 5   item_form                   369 non-null    object 
 6   special_item_form           369 non-null    object 
 7   product_type                251 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 23.2+ KB


In [None]:
import time
from google.api_core.exceptions import TooManyRequests

# Define a function to fill empty cells in 'product_type'
def fill_empty_cells(df, max_retries=3, wait_seconds=60):
    """Fill missing 'product_type' values by querying the model one title at a time.

    Makes up to ``max_retries`` passes over the rows whose 'product_type' is
    null. For each such row the module-level ``model`` is asked to pick one
    type from the module-level ``product_types_list``, and the stripped
    answer is written into the frame. When the API reports a rate limit the
    function waits and retries the *same* title (up to ``max_retries``
    attempts per pass) instead of moving on, so a rate-limited title does not
    have to wait for a later pass. Replies with an unexpected structure or an
    empty answer leave the cell null so a later pass can try again.

    Args:
        df: DataFrame with 'product_title' and 'product_type' columns.
            It is modified in place.
        max_retries: Maximum number of passes over the empty cells, and
            maximum request attempts per title within a pass.
        wait_seconds: Seconds to sleep after a rate-limit error.

    Returns:
        The same DataFrame object that was passed in.
    """
    for _ in range(max_retries):
        # Get indices of cells that are still empty
        empty_indices = df.index[df['product_type'].isnull()]
        if len(empty_indices) == 0:
            break

        # The list of allowed types is identical for every query in this pass
        type_options = ', '.join(product_types_list)

        for index in empty_indices:
            product_title = df.loc[index, 'product_title']

            # Construct the query for the product title
            query = f"From the title:\n{product_title}\nextract product type, use only one of the following types:\n"
            query += type_options
            query += ". The result must be a single product type."

            # Generate content using Gemini, retrying this title after a rate limit
            response = None
            for _attempt in range(max_retries):
                try:
                    response = model.generate_content(query)
                    break
                except TooManyRequests:
                    print("Rate limit exceeded. Waiting before retrying...")
                    time.sleep(wait_seconds)
            if response is None:
                # Still rate-limited; the cell stays null for the next pass
                continue

            # Extract the product type from the response
            try:
                product_type = response.candidates[0].content.parts[0].text.strip()
            except (IndexError, AttributeError):
                print("Unexpected response format from the generative model.")
                continue

            # An empty answer would be stored as '' and wrongly count as filled
            if not product_type:
                continue

            # Update the dataframe with the generated product type
            df.loc[index, 'product_type'] = product_type

            print(f"Filled empty cell at index {index} with product type: {product_type}")

    return df

# Fill empty cells in 'product_type' of df with retry mechanism.
# fill_empty_cells writes into the frame it is given and returns that same
# object, so final_df_filled and df refer to one DataFrame after this call.
final_df_filled = fill_empty_cells(df)

# Save the changes to the dataframe
final_df_filled.to_excel('final_df_filled.xlsx', index=False)


Filled empty cell at index 27 with product type: Face Toner
Filled empty cell at index 29 with product type: Face Oil
Filled empty cell at index 30 with product type: Head Band
Filled empty cell at index 41 with product type: Face Moisturizer
Filled empty cell at index 42 with product type: Eye Mask
Filled empty cell at index 43 with product type: Face Cleanser
Filled empty cell at index 44 with product type: Face Serum
Filled empty cell at index 45 with product type: Face Cream
Filled empty cell at index 46 with product type: Face Serum
Filled empty cell at index 47 with product type: Skincare Set
Filled empty cell at index 48 with product type: Skincare Set
Filled empty cell at index 49 with product type: Anti-aging Serum
Filled empty cell at index 50 with product type: Face Cream
Filled empty cell at index 82 with product type: Skincare Set
Filled empty cell at index 84 with product type: Face Cleanser
Filled empty cell at index 86 with product type: Body Cream




Rate limit exceeded. Waiting before retrying...
Filled empty cell at index 88 with product type: Acne Patches
Filled empty cell at index 89 with product type: Face Mask
Filled empty cell at index 90 with product type: Face Cleanser
Filled empty cell at index 91 with product type: Face Foam
Filled empty cell at index 96 with product type: Beauty Tools
Filled empty cell at index 102 with product type: Eye Cream
Filled empty cell at index 103 with product type: Face Cleanser
Filled empty cell at index 104 with product type: Skincare Set
Filled empty cell at index 105 with product type: Face Wash
Filled empty cell at index 106 with product type: Anti-aging Serum
Filled empty cell at index 107 with product type: Exfoliating Mask
Filled empty cell at index 108 with product type: Face Serum
Filled empty cell at index 109 with product type: Face Moisturizer
Filled empty cell at index 110 with product type: Face Exfoliator
Filled empty cell at index 111 with product type: Anti-aging Serum




Rate limit exceeded. Waiting before retrying...
Filled empty cell at index 113 with product type: Lip Mask
Filled empty cell at index 114 with product type: Face Moisturizer
Filled empty cell at index 115 with product type: Face Moisturizer
Filled empty cell at index 116 with product type: Skincare Set
Filled empty cell at index 117 with product type: Face Scrub
Filled empty cell at index 118 with product type: Eye Patches
Filled empty cell at index 119 with product type: Sheet Mask
Filled empty cell at index 120 with product type: Eye Mask
Filled empty cell at index 121 with product type: Acne Treatment
Filled empty cell at index 162 with product type: Face Wash
Filled empty cell at index 163 with product type: Face Mask
Filled empty cell at index 164 with product type: Face Serum
Filled empty cell at index 165 with product type: Kids Care
Filled empty cell at index 166 with product type: Face Foam
Filled empty cell at index 167 with product type: Face Moisturizer
Filled empty cell at



Rate limit exceeded. Waiting before retrying...
Filled empty cell at index 207 with product type: Lip Mask
Filled empty cell at index 208 with product type: Face Mask
Filled empty cell at index 209 with product type: Face Patches
Filled empty cell at index 210 with product type: Eye Mask
Filled empty cell at index 211 with product type: Skincare Set
Filled empty cell at index 242 with product type: Face Wash
Filled empty cell at index 243 with product type: Face Serum
Filled empty cell at index 244 with product type: Face Serum
Filled empty cell at index 245 with product type: Face Toner
Filled empty cell at index 246 with product type: Anti-aging Serum
Filled empty cell at index 247 with product type: Face Wash
Filled empty cell at index 248 with product type: Face Cream
Filled empty cell at index 249 with product type: Face Mask
Filled empty cell at index 250 with product type: Face Exfoliator
Filled empty cell at index 251 with product type: Face Mask




Rate limit exceeded. Waiting before retrying...
Filled empty cell at index 283 with product type: Skincare Set
Filled empty cell at index 284 with product type: Face Wash
Filled empty cell at index 285 with product type: Face Moisturizer
Filled empty cell at index 286 with product type: Hair Removal
Filled empty cell at index 287 with product type: Sheet Mask
Filled empty cell at index 288 with product type: Face Moisturizer
Filled empty cell at index 289 with product type: Eye Cream
Filled empty cell at index 290 with product type: Face Moisturizer
Filled empty cell at index 291 with product type: Face Moisturizer
Filled empty cell at index 292 with product type: Scar Cream
Filled empty cell at index 293 with product type: Exfoliating Mask
Filled empty cell at index 294 with product type: Body Wash
Filled empty cell at index 295 with product type: Skincare Set
Filled empty cell at index 296 with product type: Anti-aging Cream
Filled empty cell at index 297 with product type: Acne Trea



Rate limit exceeded. Waiting before retrying...
Filled empty cell at index 282 with product type: Face Moisturizer


In [None]:
# Summarize final_df_filled; the 'product_type' non-null count shows whether
# any cells are still empty after the fill step.
final_df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369 entries, 0 to 368
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_title               369 non-null    object 
 1   sales_volume_in_past_month  369 non-null    int64  
 2   product_star_rating         369 non-null    float64
 3   product_num_ratings         369 non-null    int64  
 4   product_price               369 non-null    float64
 5   item_form                   369 non-null    object 
 6   special_item_form           369 non-null    object 
 7   product_type                369 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 23.2+ KB


In [None]:
# Verify that every 'product_type' value belongs to the allowed vocabulary
all_values_matched = final_df_filled['product_type'].isin(product_types_list).all()

print(
    "All values in 'product_type' match the values in product_types_list."
    if all_values_matched
    else "Some values in 'product_type' do not match the values in product_types_list."
)


Some values in 'product_type' do not match the values in product_types_list.


In [None]:
# Collect the distinct 'product_type' values that fall outside product_types_list
is_known_type = final_df_filled['product_type'].isin(product_types_list)
invalid_values = final_df_filled.loc[~is_known_type, 'product_type'].unique()

print("Invalid values in 'product_type':")
print(invalid_values)

Invalid values in 'product_type':
[' Body Scrub ' ' Beauty Tools ' ' Face Mist ' ' Face Cream '
 ' Acne Patches ' ' Lip Oil ' ' Face Wash ' ' Face Serum ' ' Body Cream '
 ' Makeup Remover ' ' Skincare Set ' ' Anti-aging Cream ' ' Lip Mask '
 ' Face Moisturizer ' ' Body Lotion ' ' Face Toner ' ' Face Cleanser '
 ' Anti-aging Serum ' ' Face Sunscreen ' ' Body Soap ' ' Sunscreen Cream '
 ' Face Oil ' ' Face Mask ' ' Eye Patches ' ' Skin Care Set '
 ' Eye Cream ' ' Acne Treatment ' ' Face Exfoliator ' ' Collagen Powder '
 ' Sheet Mask ' ' Kids Care ' ' Face Wipes ' ' Hair Care ' ' After Shave '
 ' Hand Cream ' ' Body Wash ' 'Scar Cream' ' Body Powder '
 ' Tanning Lotion ' ' Anti Chafe ' ' Lip Balm ' ' Sunscreen Spray '
 ' Hand Soap ' ' Face Foam ' ' Body Moisturizer ' ' Sunscreen Lotion ']


In [None]:
# Manual corrections for model answers that are not in product_types_list
type_corrections = {
    'Skin Care Set': 'Skincare Set',        # spelling variant of an allowed type
    'Scar Cream': 'Acne Treatment',         # assumption: closest allowed type
    'Body Moisturizer': 'Body Lotion',      # assumption: closest allowed type
    'Sunscreen Lotion': 'Sunscreen Cream',  # assumption: closest allowed type
}

# Trim surrounding whitespace first, then apply the exact-match corrections
final_df_filled['product_type'] = (
    final_df_filled['product_type']
    .str.strip()
    .replace(type_corrections)
)

# Membership mask, reused for both the overall check and the list of leftovers
is_known_type = final_df_filled['product_type'].isin(product_types_list)
all_values_matched = is_known_type.all()

print(
    "All values in 'product_type' match the values in product_types_list."
    if all_values_matched
    else "Some values in 'product_type' do not match the values in product_types_list."
)

# Distinct values that are still outside product_types_list
invalid_values = final_df_filled.loc[~is_known_type, 'product_type'].unique()

print("\nInvalid values in 'product_type' after cleaning:")
print(invalid_values)

All values in 'product_type' match the values in product_types_list.

Invalid values in 'product_type' after cleaning:
[]


In [None]:
# Merge final_df_filled with df based on 'product_title'.
# Titles are not unique in these frames, so the lookup side is first reduced
# to one row per title; joining the raw frames would pair every duplicate
# title with each of its duplicates and inflate the row count of df.
type_lookup = final_df_filled[['product_title', 'product_type']].drop_duplicates(subset='product_title')
df = df.merge(type_lookup, on='product_title', how='left', validate='many_to_one')

# Display the first few rows of the merged DataFrame
print(df.head())


                                       product_title  \
0  under eye patches - 60 pcs - 24k gold eye mask...   
1  pimple patches for face hydrocolloid acne patc...   
2  sleeping lip mask nourish & hydrate lip mask w...   
3  glow recipe pha + bha face toner - skin care t...   
4  under eye patches (30 pairs) gold eye mask and...   

   sales_volume_in_past_month  product_star_rating  product_num_ratings  \
0                       20000                  4.2                 8653   
1                       10000                  4.4                 2053   
2                        1000                  4.3                  205   
3                        4000                  4.6                 6075   
4                       30000                  4.5                 3662   

   product_price item_form special_item_form product_type_x product_type_y  
0           8.99       oil           eye oil    Eye Patches    Eye Patches  
1           9.99       oil          face oil   Acne Patche

In [None]:
# Inspect final_df_filled: column labels first, then a preview of the data
print("Columns of final_df_filled:", final_df_filled.columns, sep="\n")
print("\nFirst few rows of final_df_filled:", final_df_filled.head(), sep="\n")

Columns of final_df_filled:
Index(['product_title', 'sales_volume_in_past_month', 'product_star_rating',
       'product_num_ratings', 'product_price', 'item_form',
       'special_item_form', 'product_type'],
      dtype='object')

First few rows of final_df_filled:
                                       product_title  \
0  under eye patches - 60 pcs - 24k gold eye mask...   
1  pimple patches for face hydrocolloid acne patc...   
2  sleeping lip mask nourish & hydrate lip mask w...   
3  glow recipe pha + bha face toner - skin care t...   
4  under eye patches (30 pairs) gold eye mask and...   

   sales_volume_in_past_month  product_star_rating  product_num_ratings  \
0                       20000                  4.2                 8653   
1                       10000                  4.4                 2053   
2                        1000                  4.3                  205   
3                        4000                  4.6                 6075   
4                   

In [None]:
# Inspect df: column labels first, then a preview of the data
print("Columns of df:", df.columns, sep="\n")
print("\nFirst few rows of df:", df.head(), sep="\n")

Columns of df:
Index(['product_title', 'sales_volume_in_past_month', 'product_star_rating',
       'product_num_ratings', 'product_price', 'item_form',
       'special_item_form', 'product_type_x', 'product_type_y'],
      dtype='object')

First few rows of df:
                                       product_title  \
0  under eye patches - 60 pcs - 24k gold eye mask...   
1  pimple patches for face hydrocolloid acne patc...   
2  sleeping lip mask nourish & hydrate lip mask w...   
3  glow recipe pha + bha face toner - skin care t...   
4  under eye patches (30 pairs) gold eye mask and...   

   sales_volume_in_past_month  product_star_rating  product_num_ratings  \
0                       20000                  4.2                 8653   
1                       10000                  4.4                 2053   
2                        1000                  4.3                  205   
3                        4000                  4.6                 6075   
4                       30

In [None]:
# Merge final_df_filled with df based on 'product_title'.
# NOTE(review): the same merge already appears in an earlier cell; confirm
# whether both are needed.
# Titles are not unique in these frames, so the lookup side is first reduced
# to one row per title; joining the raw frames would pair every duplicate
# title with each of its duplicates and inflate the row count of df.
type_lookup = final_df_filled[['product_title', 'product_type']].drop_duplicates(subset='product_title')
df = df.merge(type_lookup, on='product_title', how='left', validate='many_to_one')

# Display the first few rows of the merged DataFrame
print(df.head())


                                       product_title  \
0  under eye patches - 60 pcs - 24k gold eye mask...   
1  pimple patches for face hydrocolloid acne patc...   
2  sleeping lip mask nourish & hydrate lip mask w...   
3  glow recipe pha + bha face toner - skin care t...   
4  under eye patches (30 pairs) gold eye mask and...   

   sales_volume_in_past_month  product_star_rating  product_num_ratings  \
0                       20000                  4.2                 8653   
1                       10000                  4.4                 2053   
2                        1000                  4.3                  205   
3                        4000                  4.6                 6075   
4                       30000                  4.5                 3662   

   product_price item_form special_item_form product_type_x product_type_y  \
0           8.99       oil           eye oil    Eye Patches    Eye Patches   
1           9.99       oil          face oil   Acne Patc

In [None]:
# Summarize df after the merges: row count, columns and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567 entries, 0 to 566
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_title               567 non-null    object 
 1   sales_volume_in_past_month  567 non-null    int64  
 2   product_star_rating         567 non-null    float64
 3   product_num_ratings         567 non-null    int64  
 4   product_price               567 non-null    float64
 5   item_form                   567 non-null    object 
 6   special_item_form           567 non-null    object 
 7   product_type_x              567 non-null    object 
 8   product_type_y              567 non-null    object 
 9   product_type                567 non-null    object 
dtypes: float64(2), int64(2), object(6)
memory usage: 44.4+ KB


In [None]:
# Keep only the first row for each 'product_title'
df = df.drop_duplicates(subset='product_title', keep='first')

# Drop the suffixed columns 'product_type_y' and 'product_type_x' left behind
# by the merges. errors='ignore' keeps this cell re-runnable: on a second
# execution the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['product_type_y', 'product_type_x'], errors='ignore')

df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 357 entries, 0 to 566
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_title               357 non-null    object 
 1   sales_volume_in_past_month  357 non-null    int64  
 2   product_star_rating         357 non-null    float64
 3   product_num_ratings         357 non-null    int64  
 4   product_price               357 non-null    float64
 5   item_form                   357 non-null    object 
 6   special_item_form           357 non-null    object 
 7   product_type                357 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 25.1+ KB


In [None]:
# Save the changes to the dataframe.
# NOTE: 'final_df_filled.xlsx' is the same filename an earlier cell wrote
# final_df_filled to, so that file is overwritten with this df.
df.to_excel('final_df_filled.xlsx', index=False)