In [None]:
import pandas as pd
import numpy as np
import os 
import warnings
# NOTE(review): with no category or module filter this hides every warning
# raised for the rest of the session, not just noisy ones — including any
# pandas warnings triggered by the cells below.
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
# Load environment settings (the error below expects FOLDER_PATH to come
# from a .env file), then resolve the project base directory from them.
load_dotenv()

# Base directory; the notebook reads from and writes to its "data" subfolder.
FOLDER_PATH = os.environ.get("FOLDER_PATH")
if FOLDER_PATH is None or FOLDER_PATH == "":
    # Fail fast: every path used later is built from FOLDER_PATH.
    raise ValueError("FOLDER_PATH not set in .env file!")

In [None]:
# Location of the combined products/ingredients export.
products_csv = os.path.join(FOLDER_PATH, "data", "products_and_ingredients.csv")

# Keep every column except the first one of the file.
df = pd.read_csv(products_csv).iloc[:, 1:]

# Quick sanity check: dimensions and column names, then a preview of the rows.
print(df.shape, df.columns, sep="\n")
df.head()

In [None]:
def column_summary(df, top_n=10):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    top_n : int, default 10
        Maximum number of most frequent values listed per column in
        ``distinct_values_counts``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the fields
        ``col_name``, ``col_dtype``, ``num_of_nulls``, ``null%`` (rounded to
        two decimals), ``num_of_non_nulls``, ``num_of_distinct_values`` and
        ``distinct_values_counts`` (a ``{value: count}`` dict of the ``top_n``
        most frequent non-null values).
    """
    # Computed once and reused for both the absolute and the relative figure.
    null_counts = df.isnull().sum()

    summary_df = pd.DataFrame({
        'col_name': df.columns,
        'col_dtype': df.dtypes.values,
        'num_of_nulls': null_counts.values,
        'null%': ((null_counts / df.shape[0]) * 100).round(2).values,
        'num_of_non_nulls': df.count().values,
        'num_of_distinct_values': df.nunique().values,
    })

    distinct_values_counts = []
    for position in range(df.shape[1]):
        # Positional access always yields a Series, even when two columns
        # share a name (label access would return a DataFrame there).
        # head(top_n) covers both the "few distinct values" and the
        # "many distinct values" case, so no branching is needed.
        top_counts = df.iloc[:, position].value_counts().head(top_n)
        # to_list() yields native Python scalars, which keeps the dict
        # readable when the summary is displayed.
        distinct_values_counts.append(
            dict(zip(top_counts.index.to_list(), top_counts.to_list()))
        )

    summary_df['distinct_values_counts'] = distinct_values_counts

    return summary_df

In [None]:
# Profile every column of the loaded frame: dtype, null counts/percentage,
# number of distinct values and the most frequent values.
column_summary(df)

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not
# counted); 0 means the frame has no fully duplicated rows.
df.duplicated().sum()

### Data Cleaning:

In [None]:
# Merge skin type and concerns into one free-text profile of the form
# "<skin_type> concerns: <concerns>".
# Missing values are blanked with fillna('') *before* the string conversion
# instead of relying on astype(str) rendering them as the literal 'nan':
# that rendering depends on the column dtype and pandas version (string
# dtypes can keep missing values missing through astype(str), which would
# turn every partially-filled row into NaN).
skin_type_text = df['skin_type'].fillna('').astype(str)
concerns_text = df['concerns'].fillna('').astype(str)

# When both parts are empty only the bare separator is left; store those
# rows as missing rather than as a meaningless string.
df['skin_profile'] = (skin_type_text + ' concerns: ' + concerns_text).replace(' concerns: ', np.nan)

# The two source columns are now redundant. Reassign instead of dropping in
# place so the statement has no hidden side effect on other references.
df = df.drop(columns=['skin_type', 'concerns'])

### Separating Products and Ingredients Data:

In [None]:
# Preview the frame after skin_type and concerns were merged into skin_profile.
df.head()

In [None]:
import re
def clean_text(text):
    """Normalise a text value for use in a product description.

    Strings are lower-cased, stripped of surrounding whitespace, and then
    reduced to the characters a-z, 0-9 and whitespace; every other character
    is removed. Any non-string input (e.g. NaN or None) yields "".
    """
    if isinstance(text, str):
        lowered = text.lower().strip()
        # Delete everything that is not a lowercase ASCII letter, digit or whitespace.
        return re.sub(r'[^a-z0-9\s]', '', lowered)
    return ""

In [None]:
# Build the searchable product description "<brand> <product>": each part is
# normalised with clean_text on its own and the two are joined by one space.
brand_clean = df['brand_name'].apply(clean_text)
name_clean = df['prod_name'].apply(clean_text)
df['prod_descrp'] = brand_clean + ' ' + name_clean

# Show the resulting column set.
df.columns

In [None]:
# Turn the ingredient string into a list of ingredient names. The separator is
# comma + space, so a comma that is not followed by a space stays inside its
# token; rows with a missing `ingreds` value stay missing.
df['list_of_ingreds'] = df['ingreds'].str.split(", ")

In [None]:
# Number of ingredient rows to keep.
# NOTE(review): presumably the size of the exploded frame when the notebook
# was written (i.e. a full shuffle) — confirm.
INGREDS_SAMPLE_SIZE = 113224
# Fixed seed so the exported rows and their order are the same on every run.
INGREDS_SAMPLE_SEED = 42

# One row per (product, ingredient), keeping only the ingredient-level fields.
ingreds_exploded = (
    df.explode('list_of_ingreds')
    .rename(columns={'list_of_ingreds': 'ingred_name'})
    .loc[:, ['ingred_name', 'ratingscore', 'skin_profile']]
)

# Cap the request at the available row count: sampling without replacement
# raises ValueError when asked for more rows than exist.
ingreds_data = (
    ingreds_exploded
    .sample(
        n=min(INGREDS_SAMPLE_SIZE, len(ingreds_exploded)),
        random_state=INGREDS_SAMPLE_SEED,
    )
    .reset_index(drop=True)
)

In [None]:
# Product-level table. Take an explicit copy: the frame is modified in a later
# cell, and assigning into a bare column subset of `df` is a chained-assignment
# hazard (its warning is currently silenced by the global warnings filter).
prods_data = df[['brand_name', 'prod_name', 'prod_descrp', 'list_of_ingreds']].copy()

In [None]:
def _ingredients_to_text(ingredients):
    """Render one `list_of_ingreds` cell as plain "a, b, c" text.

    Lists (as produced by ``str.split``) are joined with ", ". A string is
    treated as a stringified list and has its brackets and single quotes
    removed. Anything else (e.g. a missing value) is returned unchanged.
    """
    if isinstance(ingredients, (list, tuple)):
        return ', '.join(str(item) for item in ingredients)
    if isinstance(ingredients, str):
        return ingredients.replace('[', '').replace(']', '').replace("'", "")
    return ingredients


# `list_of_ingreds` holds Python lists, which have no .replace method, so the
# cells are converted through the helper above. assign() returns a new frame
# rather than writing into a subset of `df`.
prods_data = prods_data.assign(
    list_of_ingreds=prods_data['list_of_ingreds'].apply(_ingredients_to_text)
)

In [None]:
# Persist both derived tables in the same "data" folder the input was read
# from, without writing the row index.
output_dir = os.path.join(FOLDER_PATH, 'data')
for table, file_name in (
    (ingreds_data, 'ingredients_data.csv'),
    (prods_data, 'products_data.csv'),
):
    table.to_csv(os.path.join(output_dir, file_name), index=False)