# Dataset exploration

In [10]:
import pandas as pd

# Load the dataset
file_path = 'nyka_top_brands_cosmetics_product_reviews.csv'
df = pd.read_csv(file_path)

# Number of Unique Products
unique_product_count = df['product_id'].nunique()

# Average Review Length (in words)
df['num_words_in_review'] = df['review_text'].fillna('').apply(lambda x: len(x.split()))
average_review_word_count = df['num_words_in_review'].mean()

# Number of Duplicate Reviews based on review_text
duplicate_review_count = df.duplicated(subset='review_text', keep=False).sum()

# Number of Unique Brands
unique_brand_count = df['brand_name'].nunique()

# Average Product Rating
average_product_rating_value = df['product_rating'].mean()

# Number of Verified Buyers
verified_buyer_count = df[df['review_label'] == 'Verified Buyer'].shape[0]

# Convert review_date to a datetime format
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

# Drop rows with invalid or missing dates
df = df.dropna(subset=['review_date'])

# Calculate the date range of reviews
start_date = df['review_date'].min()
end_date = df['review_date'].max()

# Missing Values
missing_value_counts = df.isnull().sum()

# Distribution of Review Ratings
review_rating_distribution = df['review_rating'].value_counts()

# Print the extracted statistics
print("Number of Unique Products:", unique_product_count)
print("Average Review Length:", average_review_word_count)
print("Number of Duplicate Reviews:", duplicate_review_count)
print("Number of Unique Brands:", unique_brand_count)
print("Average Product Rating:", average_product_rating_value)
print("Number of Verified Buyers:", verified_buyer_count)
print(f"Date Range of Reviews: {start_date} to {end_date}")
print("\nNumber of Missing Values:\n", missing_value_counts)
print("\nDistribution of Review Ratings:\n", review_rating_distribution)

Number of Unique Products: 295
Average Review Length: 21.608560146204557
Number of Duplicate Reviews: 1967
Number of Unique Brands: 11
Average Product Rating: 4.099130278702435
Number of Verified Buyers: 47790
Date Range of Reviews: 2013-05-20 16:48:00 to 2022-10-22 18:12:00

Number of Missing Values:
 product_id                  0
brand_name                  0
review_id                   0
review_title                0
review_text                 9
author                      0
review_date                 0
review_rating               1
is_a_buyer                  0
pro_user                    0
review_label            13035
product_title               0
mrp                         0
price                       0
product_rating              0
product_rating_count        0
product_tags            47782
product_url                 0
num_words_in_review         0
dtype: int64

Distribution of Review Ratings:
 review_rating
5.0    41626
4.0    11322
3.0     3540
1.0     3077
2.0     1718
