# Dataset exploration

In [25]:
import pandas as pd

# Load the dataset
file_path = 'nyka_top_brands_cosmetics_product_reviews.csv'
df = pd.read_csv(file_path)

# Number of Unique Products
unique_products_count = df['product_id'].nunique()

# Average Review Length (in words)
df['num_words_in_review'] = df['review_text'].fillna('').apply(lambda x: len(x.split()))
average_review_words_count = df['num_words_in_review'].mean()

# Number of Duplicate Reviews based on review_text
duplicate_reviews_count = df.duplicated(subset='review_text', keep=False).sum()

# Number of Unique Brands
unique_brands_count = df['brand_name'].nunique()

# Average Product Rating
average_product_rating_value = df['product_rating'].mean()

# Number of Verified Buyers
verified_buyer_count = df[df['is_a_buyer'] == True].shape[0]


# Convert review_date to a datetime format
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

# Drop rows with invalid or missing dates
df = df.dropna(subset=['review_date'])

# Calculate the date range of reviews
start_date = df['review_date'].min()
end_date = df['review_date'].max()

# Missing Values
missing_values_counts = df.isnull().sum()

# Distribution of Review Ratings
review_rating_distribution = df['review_rating'].value_counts()

# Print the extracted statistics
print("Number of Unique Products:", unique_products_count)
print("Average Review Length:", average_review_words_count)
print("Number of Duplicate Reviews:", duplicate_reviews_count)
print("Number of Unique Brands:", unique_brands_count)
print("Average Product Rating:", average_product_rating_value)
print("Number of Verified Buyers:", verified_buyer_count)
print(f"Date Range of Reviews: {start_date} to {end_date}")
print("\nNumber of Missing Values:\n", missing_values_counts)
print("\nDistribution of Review Ratings:\n", review_rating_distribution)

Number of Unique Products: 295
Average Review Length: 21.608560146204557
Number of Duplicate Reviews: 1967
Number of Unique Brands: 11
Average Product Rating: 4.099130278702435
Number of Verified Buyers: 48222
Date Range of Reviews: 2013-05-20 16:48:00 to 2022-10-22 18:12:00

Number of Missing Values:
 product_id                  0
brand_name                  0
review_id                   0
review_title                0
review_text                 9
author                      0
review_date                 0
review_rating               1
is_a_buyer                  0
pro_user                    0
review_label            13035
product_title               0
mrp                         0
price                       0
product_rating              0
product_rating_count        0
product_tags            47782
product_url                 0
num_words_in_review         0
dtype: int64

Distribution of Review Ratings:
 review_rating
5.0    41626
4.0    11322
3.0     3540
1.0     3077
2.0     1718


In [26]:
top_brands = df['brand_name'].value_counts().head(2)
top_brands

brand_name
Nykaa Cosmetics    17652
Kay Beauty         13788
Name: count, dtype: int64

# Filtering & Cleaning Lipsticks from Nykaa Brand

In [27]:
# Filter for Nykaa Cosmetics products
nykaa_df = df[df['brand_name'] == 'Nykaa Cosmetics']

# Identify lip-related products based on product titles containing "lip"
nykaa_lip_related_df = nykaa_df[nykaa_df['product_title'].str.contains('lip', case=False, na=False)]

nykaa_lip_related_df.shape[0]

5261

In [28]:
# Filter to keep only necessary columns
filtered_df = nykaa_lip_related_df[['product_id', 'review_text', 'review_date', 'review_rating', 'product_title', 'is_a_buyer']]

# Remove duplicates based on review_text
filtered_df = filtered_df.drop_duplicates(subset='review_text')

# Remove missing reviews (NaN in review_text or review_rating)
filtered_df = filtered_df.dropna(subset=['review_text', 'review_rating'])

# Keep only reviews where is_a_buyer is True (i.e verified buyers)
filtered_df = filtered_df[filtered_df['is_a_buyer']]

# Drop the is_a_buyer column as it's no longer needed
final_df = filtered_df[['product_id', 'review_text', 'review_date', 'review_rating', 'product_title']]


# Sort the final_df by review_rating from highest to lowest
final_df = final_df.sort_values(by='review_rating', ascending=False)
final_df.to_csv('nykaa_filtered_lip_products.csv', index=False)
final_df.head()

Unnamed: 0,product_id,review_text,review_date,review_rating,product_title
1860,950973,I loved it perfect for dusky skin,2021-07-22 15:11:00,5.0,Nykaa Matte to Last! Mini Metallic Liquid Lips...
9273,422905,Blueberry lipbalm has a milk fragrance but wor...,2020-06-06 03:00:00,5.0,Nykaa Serial Kisser Lip Balm - Raspberry
9253,422905,I liked it. Moisturize lips but i dont like it...,2020-08-10 21:05:00,5.0,Nykaa Serial Kisser Lip Balm - Raspberry
9252,422905,Very light to apply..n very nourishing..,2020-12-02 18:04:00,5.0,Nykaa Serial Kisser Lip Balm - Raspberry
9251,422905,I love this lip balm.It moisturizes my lips we...,2020-08-07 14:53:00,5.0,Nykaa Serial Kisser Lip Balm - Raspberry


# Filtering & Cleaning Lipsticks from Kay Beauty Brand

In [29]:
# Filter for Nykaa Cosmetics products
kay_df = df[df['brand_name'] == 'Kay Beauty']

# Identify lip-related products based on product titles containing "lip"
kay_lip_related_df = kay_df[kay_df['product_title'].str.contains('lip', case=False, na=False)]

kay_lip_related_df.shape[0]

2412

In [30]:
# Filter to keep only necessary columns
filtered_df = kay_lip_related_df[['product_id', 'review_text', 'review_date', 'review_rating', 'product_title', 'is_a_buyer']]

# Remove duplicates based on review_text
filtered_df = filtered_df.drop_duplicates(subset='review_text')

# Remove missing reviews (NaN in review_text or review_rating)
filtered_df = filtered_df.dropna(subset=['review_text', 'review_rating'])

# Keep only reviews where is_a_buyer is True (i.e verified buyers)
filtered_df = filtered_df[filtered_df['is_a_buyer']]

# Drop the is_a_buyer column as it's no longer needed
final_df = filtered_df[['product_id', 'review_text', 'review_date', 'review_rating', 'product_title']]

# Sort the final_df by review_rating from highest to lowest
final_df = final_df.sort_values(by='review_rating', ascending=False)
final_df.to_csv('kay_filtered_lip_products.csv', index=False)
final_df.head()

Unnamed: 0,product_id,review_text,review_date,review_rating,product_title
48632,573572,"Long-lasting, doesn't smudge",2020-03-24 09:43:00,5.0,Kay Beauty Matte Action Lip Liner
49964,573181,"I love the smell, consistency, hydrating textu...",2021-03-18 01:48:00,5.0,Kay Beauty Matteinee Matte Lip Crayon Lipstick
49977,573181,Amazing shade and glides extremely smooth,2021-02-08 16:35:00,5.0,Kay Beauty Matteinee Matte Lip Crayon Lipstick
49976,573181,Shade is beautiful and is matte! Go for the pr...,2021-02-10 10:28:00,5.0,Kay Beauty Matteinee Matte Lip Crayon Lipstick
49975,573181,It's a very beautiful brown shade which looks ...,2021-02-15 16:44:00,5.0,Kay Beauty Matteinee Matte Lip Crayon Lipstick
