In [6]:
# Import library yang diperlukan
import os
import pandas as pd

# Bikin variable pathnya biar gampang dipanggil
source_folder_path = r"C:\Users\N I T R O   5\Documents\Bootcamps\Purwadhika\Capstone\data_sources\data_sources\data_products"
output_folder_path = r"D:\PURWA\1"



In [7]:
# Fungsi buat extract data dari CSVnya dan di upload ke dataframe
def extract_data(file_path, delete_index=True):
    df = pd.read_csv(file_path)
    if delete_index:
        df.reset_index(drop=True, inplace=True)  # Remove index if required
    return df



In [8]:
# Fungsi buat transformasi data
def transform_data(df):
    # Delete simbol currency nya jadi cuma angka aja buat kolom actual price
    if 'actual_price' in df.columns:
        df['actual_price'] = df['actual_price'].replace('[^\d.]', '', regex=True)
        df['actual_price'] = pd.to_numeric(df['actual_price'], errors='coerce')  # Convert to numeric, kalau ada error dijadiin NaN
        df['actual_price'].fillna(0, inplace=True)  # ganti NaN dengan 0 
    
    # Delete simbol currency nya jadi cuma angka aja buat kolom discount price
    if 'discount_price' in df.columns:
        df['discount_price'] = df['discount_price'].replace('[^\d.]', '', regex=True)
        df['discount_price'] = pd.to_numeric(df['discount_price'], errors='coerce')  # Convert to float
        df['discount_price'].fillna(0, inplace=True)  # Replace NaN values with 0
    
    # Convert rating ke numeric
    if 'ratings' in df.columns:
        df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')  # Convert to numeric, kalau ada error dijadiin NaN
    
    # Transform the 'no_of_ratings' column
    if 'no_of_ratings' in df.columns:
    # Delete semua non numeric values, dan convert values nya ke numeric
        df['no_of_ratings'] = df['no_of_ratings'].replace('[^\d]', '', regex=True)
        df['no_of_ratings'] = pd.to_numeric(df['no_of_ratings'], errors='coerce').fillna(0)

    
    return df



In [9]:
# Fungsi untuk coba coba statistik simple
def basic_statistics(df):
    print("Basic Statistics:")
    print(df.describe())
    print("\n")

# Fungsi yang menganalisa values yang kosong
def missing_values_analysis(df):
    print("Missing Values Analysis:")
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df)) * 100
    print(pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage}))
    print("\n")

# Fungsi menganalisa distribusi kategori
def categorical_distribution(df, column_name):
    if column_name in df.columns:
        print(f"Distribution of {column_name}:")
        print(df[column_name].value_counts())
        print("\n")

# Fungsi untuk aggregasi atau penjumlahan datanya
def numeric_aggregation(df, column_name):
    if column_name in df.columns and pd.api.types.is_numeric_dtype(df[column_name]):
        print(f"Total {column_name}: {df[column_name].sum()}")
        print(f"Average {column_name}: {df[column_name].mean()}")
        print("\n")

#Fungsi untuk ngedisplay top N values di kolomnya
def top_n_analysis(df, column_name, n=5):
    if column_name in df.columns:
        print(f"Top {n} based on {column_name}:")
        print(df.nlargest(n, column_name))
        print("\n")

# Fungsi korelasi analisis
def correlation_analysis(df):
    print("Correlation Matrix:")
    correlation = df.corr()
    print(correlation)
    print("\n")

# # Fungsi untuk deteksi outlier
# def outlier_detection(df, column_name, threshold):
#     if column_name in df.columns and pd.api.types.is_numeric_dtype(df[column_name]):
#         outliers = df[df[column_name] > threshold]
#         print(f"Number of outliers in {column_name} greater than {threshold}: {len(outliers)}")
#         print(outliers)
#         print("\n")

# Fungsi untuk cek datanya lengkap atau engga
def data_completeness(df):
    total_rows = df.shape[0]
    if total_rows == 0:
        print("The DataFrame is empty, data completeness cannot be calculated.")
    else:
        complete_rows = df.dropna().shape[0]
        completeness_percentage = (complete_rows / total_rows) * 100
        print(f"Percentage of complete rows: {completeness_percentage:.2f}%")
    print("\n")


# Fungsi top 3 product dengan rating tertinggi
def top_3_highest_rating(df):
    if 'ratings' in df.columns:
        print("Top 3 Products with the Highest Rating:")
        top_3_ratings = df.nlargest(3, 'ratings')
        print(top_3_ratings[['name', 'ratings']])
        print("\n")

# Fungsi top 3 product dengan rating terbanyak
def top_3_most_ratings(df):
    if 'no_of_ratings' in df.columns:
        print("Top 3 Products with the Most Ratings:")
        top_3_most = df.nlargest(3, 'no_of_ratings')
        print(top_3_most[['name', 'no_of_ratings']])
        print("\n")


# Funsgi untuk melihat product dengan rating terbanyak dan juga di sort dari yang tertinggi ratingannya buat top 3 aja
def top_3_most_ratings_sorted(df):
    if 'no_of_ratings' in df.columns:
        print("Top 3 Products with the Most Ratings, Sorted by Highest:")
        sorted_ratings = df.sort_values(by='no_of_ratings', ascending=False).head(3)
        print(sorted_ratings[['name', 'no_of_ratings']])
        print("\n")



In [10]:
# Fungsi ETL utama dan analisis demography 
def main():
    # Get the list of all CSV files in the source folder
    files = [file for file in os.listdir(source_folder_path) if file.endswith('.csv')]

    # Create output directory if it doesn't exist
    os.makedirs(output_folder_path, exist_ok=True)

    # Process tiap file
    for file in files:
        file_path = os.path.join(source_folder_path, file)
        
        # Step 1: Data Extraction
        df = extract_data(file_path)
        
        # Step 2: Data Transformation
        df = transform_data(df)
        
        # Step 3: Data Demography Analysis
        print(f"=== Analysis for {file} ===")
        print("=== Basic Statistics ===")
        basic_statistics(df)
        
        print("=== Missing Values Analysis ===")
        missing_values_analysis(df)
        
        # print("=== Categorical Data Distribution ===")
        # categorical_distribution(df, "category_column")  
        
        # print("=== Numeric Data Aggregation ===")
        # numeric_aggregation(df, "price_column")  
        
        # print("=== Top N Analysis ===")
        # top_n_analysis(df, "price_column", 5) 
        
        print("=== Correlation Analysis ===")
        correlation_analysis(df)
        
        # print("=== Outlier Detection ===")
        # outlier_detection(df, "price_column", 10000)  
        
        print("=== Data Completeness ===")
        data_completeness(df)
        
        print("=== Top 3 Products with the Highest Rating ===")
        top_3_highest_rating(df)
        
        print("=== Top 3 Products with the Most Ratings ===")
        top_3_most_ratings(df)
        
        print("=== Top 3 Products with the Most Ratings, Sorted ===")
        top_3_most_ratings_sorted(df)
        
        # Save the transformed data
        output_file_path = os.path.join(output_folder_path, f"Cleaned_{file}")
        df.to_csv(output_file_path, index=False)
        print(f"Transformed data saved for {file}.\n")

# Run the main function
if __name__ == "__main__":
    main()


=== Analysis for Air Conditioners.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price   actual_price
count  423.000000     720.000000      720.000000     720.000000
mean     3.809693     144.087500    25868.743056   38066.130028
std      0.789610     564.281521    22969.750811   31589.188305
min      1.000000       0.000000        0.000000       0.000000
25%      3.500000       0.000000        0.000000       0.000000
50%      4.000000       2.000000    32999.000000   47045.000000
75%      4.200000      40.000000    41999.250000   61990.000000
max      5.000000    9577.000000   128800.000000  149000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0        0.00
main_category                0        0.00
sub_category                 0        0.00
image                        0        0.00
link                         0        0.00
ratings               

  correlation = df.corr()


                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                    538    5.618212
no_of_ratings                0    0.000000
discount_price               0    0.000000
actual_price                 0    0.000000


=== Correlation Analysis ===
Correlation Matrix:
                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.060954        0.114363      0.114453
no_of_ratings   0.060954       1.000000        0.008429      0.003657
discount_price  0.114363       0.008429        1.000000      0.964601
actual_price    0.114453       0.003657        0.964601      1.000000


=== Data Completeness ===
Percentage of complete rows: 94.38%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                

  correlation = df.corr()


Transformed data saved for All Appliances.csv.

=== Analysis for All Books.csv ===
=== Basic Statistics ===
Basic Statistics:
       ratings  no_of_ratings  discount_price  actual_price
count      0.0            0.0             0.0           0.0
mean       NaN            NaN             NaN           NaN
std        NaN            NaN             NaN           NaN
min        NaN            NaN             NaN           NaN
25%        NaN            NaN             NaN           NaN
50%        NaN            NaN             NaN           NaN
75%        NaN            NaN             NaN           NaN
max        NaN            NaN             NaN           NaN


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                       0.0         NaN
main_category              0.0         NaN
sub_category               0.0         NaN
image                      0.0         NaN
link                       0.0         NaN
ratings          

  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for All Car and Motorbike Products.csv.

=== Analysis for All Electronics.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  9481.000000    9600.000000     9600.000000    9600.000000
mean      4.077576    8771.017396     2826.221698    4973.169204
std       0.377189   27962.923548     8663.594724   12171.869818
min       1.000000       0.000000        0.000000       0.000000
25%       3.900000     220.750000      270.000000     799.000000
50%       4.100000    1136.000000      499.000000    1399.500000
75%       4.300000    4992.000000     1499.000000    3299.000000
max       5.000000  437651.000000   134999.000000  161999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for All Electronics.csv.

=== Analysis for All English.csv ===
=== Basic Statistics ===
Basic Statistics:
       ratings  no_of_ratings  discount_price  actual_price
count      0.0            0.0             0.0           0.0
mean       NaN            NaN             NaN           NaN
std        NaN            NaN             NaN           NaN
min        NaN            NaN             NaN           NaN
25%        NaN            NaN             NaN           NaN
50%        NaN            NaN             NaN           NaN
75%        NaN            NaN             NaN           NaN
max        NaN            NaN             NaN           NaN


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                       0.0         NaN
main_category              0.0         NaN
sub_category               0.0         NaN
image                      0.0         NaN
link                       0.0         NaN
ratings       

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Empty DataFrame
Columns: [name, no_of_ratings]
Index: []


=== Top 3 Products with the Most Ratings, Sorted ===
Top 3 Products with the Most Ratings, Sorted by Highest:
Empty DataFrame
Columns: [name, no_of_ratings]
Index: []


Transformed data saved for All Movies and TV Shows.csv.

=== Analysis for All Music.csv ===
=== Basic Statistics ===
Basic Statistics:
       ratings  no_of_ratings  discount_price  actual_price
count      0.0            0.0             0.0           0.0
mean       NaN            NaN             NaN           NaN
std        NaN            NaN             NaN           NaN
min        NaN            NaN             NaN           NaN
25%        NaN            NaN             NaN           NaN
50%        NaN            NaN             NaN           NaN
75%        NaN            NaN             NaN           NaN
max        NaN            NaN             NaN           NaN


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percen

  correlation = df.corr()
  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.084901        0.120537      0.078588
no_of_ratings   0.084901       1.000000       -0.013662     -0.041432
discount_price  0.120537      -0.013662        1.000000      0.823060
actual_price    0.078588      -0.041432        0.823060      1.000000


=== Data Completeness ===
Percentage of complete rows: 91.07%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                  name  ratings
299  Saanchi Ecommerce Women's Georgette Sarees ($$...      5.0
329  DIY Crafts Watch Spring Bars, 360Pcs 8-25mm Pr...      5.0
461   Trolls: Water-Color!: Talent Show (Media Tie-In)      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                   name  no_of_ratings
1112              crocs Unisex Crocband Clogs and Mules       157480.0
41    Pigeon b

  correlation = df.corr()


Percentage of complete rows: 67.00%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                 name  ratings
58  Hitachi Split Ac - 1.5 Ton Kiyora 5200Fx I Fre...      5.0
89  Panasonic 2 Ton 5 Star Wi-Fi Inverter Smart Sp...      5.0
97  Panasonic 1.5 Ton 3 Star Hot and Cold Wi-Fi In...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                    name  no_of_ratings
60737  SanDisk 128GB Class 10 microSDXC Memory Card w...       589547.0
60917  SanDisk 16GB Ultra MicroSDHC Memory Card (SDSQ...       589547.0
61071  SanDisk Ultra 64GB UHS-I Class 10 Micro SD Mem...       589547.0


=== Top 3 Products with the Most Ratings, Sorted ===
Top 3 Products with the Most Ratings, Sorted by Highest:
                                                    name  no_of_ratings
71417  SanDisk UHS-I A1 98Mbps 32GB Ultra MicroSD Mem...       589

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Baby Fashion.csv.

=== Analysis for Baby Products.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1040.000000    1056.000000     1056.000000   1056.000000
mean      4.089423    6274.969697      614.920862   1180.159091
std       0.400750   23666.455532      730.444253   1272.471373
min       1.000000       0.000000        0.000000      0.000000
25%       3.900000      66.000000      246.750000    470.000000
50%       4.100000     378.000000      429.000000    890.000000
75%       4.300000    2439.000000      699.000000   1458.750000
max       5.000000  215914.000000     8798.000000  14999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                  

  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Bags and Luggage.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  16709.000000   19152.000000    19152.000000  19152.000000
mean       4.002645     332.915518     1019.017214   2506.349286
std        0.590637    2127.942525     1524.669210   3501.504748
min        1.000000       0.000000        0.000000      0.000000
25%        3.800000       3.000000      299.000000    799.000000
50%        4.000000      23.000000      541.000000   1499.000000
75%        4.300000     133.000000     1099.000000   2749.000000
max        5.000000  182448.000000    33540.000000  49990.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings      

  correlation = df.corr()


Transformed data saved for Bags and Luggage.csv.

=== Analysis for Ballerinas.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  462.000000    1392.000000     1392.000000   1392.000000
mean     3.541558      26.879310      999.410761   2869.388441
std      1.053743     270.892648     2959.690382   6280.061541
min      1.000000       0.000000        0.000000      0.000000
25%      3.000000       0.000000        0.000000    679.000000
50%      3.700000       0.000000      420.000000    999.000000
75%      4.175000       2.000000      749.000000   2199.000000
max      5.000000    8483.000000    43759.000000  87518.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


                Missing Values  Percentage
name                       0.0         NaN
main_category              0.0         NaN
sub_category               0.0         NaN
image                      0.0         NaN
link                       0.0         NaN
ratings                    0.0         NaN
no_of_ratings              0.0         NaN
discount_price             0.0         NaN
actual_price               0.0         NaN


=== Correlation Analysis ===
Correlation Matrix:
                ratings  no_of_ratings  discount_price  actual_price
ratings             NaN            NaN             NaN           NaN
no_of_ratings       NaN            NaN             NaN           NaN
discount_price      NaN            NaN             NaN           NaN
actual_price        NaN            NaN             NaN           NaN


=== Data Completeness ===
The DataFrame is empty, data completeness cannot be calculated.


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest R

  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.058690        0.134935      0.138743
no_of_ratings   0.058690       1.000000        0.019926      0.044584
discount_price  0.134935       0.019926        1.000000      0.833181
actual_price    0.138743       0.044584        0.833181      1.000000


=== Data Completeness ===
Percentage of complete rows: 83.36%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                  name  ratings
36   Portronics Clean M Multifunctional 8-in-1 Gadg...      5.0
92   MKY Tripod 3110 Stand with 3-Way Head Tripod 3...      5.0
141       Marcelle microSDXC Ultra Card 128GB, 140MB/s      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                  name  no_of_ratings
377  SanDisk 128GB Class 10 microSDXC Memory Card w...       589547.0
557  SanDisk 16G

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Camping and Hiking.csv.

=== Analysis for Car Accessories.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1101.000000    1392.000000     1392.000000   1392.000000
mean      3.804541      77.816810      914.004088   1760.084411
std       0.748135     424.932833     1066.154567   1731.957173
min       1.000000       0.000000        0.000000      0.000000
25%       3.400000       1.000000      308.500000    699.000000
50%       3.800000       7.000000      649.000000   1249.000000
75%       4.200000      36.000000     1156.500000   2025.000000
max       5.000000    8798.000000    17500.000000  19900.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link          

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Car Parts.csv.

=== Analysis for Cardio Equipment.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price   actual_price
count  136.000000     240.000000      240.000000     240.000000
mean     3.869853      46.104167    23414.405583   38727.987500
std      0.783943     142.120704    36430.989256   56335.848801
min      1.000000       0.000000        0.000000       0.000000
25%      3.600000       0.000000      757.250000    1416.000000
50%      4.000000       1.000000     8599.000000   17416.000000
75%      4.300000      25.500000    30367.250000   53150.000000
max      5.000000    1103.000000   225250.000000  365000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                  

  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Casual Shoes.csv.

=== Analysis for Childrens Books.csv ===
=== Basic Statistics ===
Basic Statistics:
       ratings  no_of_ratings  discount_price  actual_price
count      0.0            0.0             0.0           0.0
mean       NaN            NaN             NaN           NaN
std        NaN            NaN             NaN           NaN
min        NaN            NaN             NaN           NaN
25%        NaN            NaN             NaN           NaN
50%        NaN            NaN             NaN           NaN
75%        NaN            NaN             NaN           NaN
max        NaN            NaN             NaN           NaN


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                       0.0         NaN
main_category              0.0         NaN
sub_category               0.0         NaN
image                      0.0         NaN
link                       0.0         NaN
ratings      

  correlation = df.corr()


Transformed data saved for Clothing.csv.

=== Analysis for Coffee Tea and Beverages.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1121.000000    1296.000000     1296.000000   1296.000000
mean      4.073595     310.781636      396.181674    634.135802
std       0.600148    1825.351012      574.102833    828.454674
min       1.000000       0.000000        0.000000      0.000000
25%       3.800000       3.000000        0.000000    268.750000
50%       4.100000      18.000000      278.500000    420.000000
75%       4.400000     100.250000      499.000000    699.000000
max       5.000000   50629.000000     7900.000000  12000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link           

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Diapers.csv.

=== Analysis for Diet and Nutrition.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1132.000000    1200.000000     1200.000000   1200.000000
mean      4.102473     357.945000      672.544033   1103.720000
std       0.434979    2413.715502      903.224666   1455.136957
min       1.000000       0.000000        0.000000      0.000000
25%       3.900000      11.000000      192.750000    299.000000
50%       4.100000      50.500000      378.000000    600.000000
75%       4.300000     171.250000      761.250000   1296.750000
max       5.000000   75344.000000     8799.000000  16999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                  

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Ethnic Wear.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  16034.000000   19056.000000    19056.000000  19056.000000
mean       3.728321     154.670235      782.920510   2502.175970
std        0.678725     690.165623      632.154627   1861.355952
min        1.000000       0.000000        0.000000      0.000000
25%        3.400000       2.000000      399.000000   1299.000000
50%        3.800000      12.000000      649.000000   1999.000000
75%        4.000000      72.000000      959.000000   2999.000000
max        5.000000   39722.000000    22999.000000  49999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings           

  correlation = df.corr()
  correlation = df.corr()


Empty DataFrame
Columns: [name, no_of_ratings]
Index: []


=== Top 3 Products with the Most Ratings, Sorted ===
Top 3 Products with the Most Ratings, Sorted by Highest:
Empty DataFrame
Columns: [name, no_of_ratings]
Index: []


Transformed data saved for Exam Central.csv.

=== Analysis for Fashion and Silver Jewellery.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price    actual_price
count  15335.000000   19104.000000    19104.000000    19104.000000
mean       3.898259      95.838673      822.673677     2286.463961
std        0.703292     406.723813     4625.125380     8498.504272
min        1.000000       0.000000        0.000000        0.000000
25%        3.600000       1.000000      271.000000      999.000000
50%        3.900000       7.000000      399.000000     1500.000000
75%        4.300000      40.000000      698.000000     2598.000000
max        5.000000   15966.000000   499999.000000  1000000.000000


=== Missing Values Analy

  correlation = df.corr()


Transformed data saved for Fashion and Silver Jewellery.csv.

=== Analysis for Fashion Sales and Deals.csv ===
=== Basic Statistics ===
Basic Statistics:
         ratings  no_of_ratings  discount_price  actual_price
count  43.000000      44.000000       44.000000     44.000000
mean    3.797674      54.022727      844.090909   3983.052500
std     0.527116     109.944698      697.259680   4030.036034
min     2.800000       0.000000        0.000000    899.000000
25%     3.450000      12.750000      342.500000   1499.000000
50%     3.700000      31.500000      573.500000   2999.000000
75%     4.000000      53.250000     1347.500000   3999.000000
max     5.000000     731.000000     2190.000000  21054.290000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link          

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.040518       -0.017884      0.016068
no_of_ratings   0.040518       1.000000       -0.024587     -0.013916
discount_price -0.017884      -0.024587        1.000000      0.756697
actual_price    0.016068      -0.013916        0.756697      1.000000


=== Data Completeness ===
Percentage of complete rows: 79.74%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                 name  ratings
33                      Captain Lead to Win sleeve+03      5.0
38            The Modern Soul Men's Jogger Track Pant      5.0
46  Dozit Sports Star PU Football with Free Pump a...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                              name  no_of_ratings
14   Allen Solly Men Cotton Hooded Neck Sweatshirt        11997.0
417                       ad

  correlation = df.corr()


=== Analysis for Furniture.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  1173.000000    1320.000000     1320.000000    1320.000000
mean      3.886701     248.552273     2456.171977    4857.717583
std       0.686518    1098.279440     4821.284465    9016.329314
min       1.000000       0.000000        0.000000       0.000000
25%       3.600000       4.000000      429.000000     999.000000
50%       4.000000      31.000000      901.070000    1999.000000
75%       4.300000     167.000000     2199.000000    4599.250000
max       5.000000   31979.000000    60500.000000  100000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings             

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Gold and Diamond Jewellery.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  1531.000000   19104.000000    19104.000000   19104.000000
mean      3.922730       1.942735    24311.139032   31723.842916
std       1.100056      42.004444    29033.986811   34517.556538
min       1.000000       0.000000        0.000000       0.000000
25%       3.400000       0.000000     6229.000000   11265.500000
50%       4.100000       0.000000    17800.000000   24505.500000
75%       5.000000       0.000000    30743.000000   41540.000000
max       5.000000    4392.000000   468012.000000  700300.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
rat

  correlation = df.corr()


Transformed data saved for Gold and Diamond Jewellery.csv.

=== Analysis for Handbags and Clutches.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  13387.000000   19104.000000    19104.000000  19104.000000
mean       3.904982      83.284966      960.798903   2514.129344
std        0.794102     698.929818     1211.202187   2444.681001
min        1.000000       0.000000        0.000000      0.000000
25%        3.600000       0.000000      398.000000    999.000000
50%        4.000000       3.000000      649.000000   1899.000000
75%        4.400000      19.000000     1189.000000   2999.000000
max        5.000000   67247.000000    41138.000000  43225.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0    

  correlation = df.corr()


Transformed data saved for Handbags and Clutches.csv.

=== Analysis for Headphones.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  6893.000000    9600.000000     9600.000000    9600.000000
mean      3.503177    3619.535417     1512.598909    3115.122499
std       0.896734   21695.761103     3988.408962    5737.462181
min       1.000000       0.000000        0.000000       0.000000
25%       3.100000       0.000000      330.000000     799.000000
50%       3.600000       6.000000      590.000000    1499.000000
75%       4.000000     118.000000     1192.500000    2999.000000
max       5.000000  375110.000000   139989.000000  154990.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link   

  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Headphones.csv.

=== Analysis for Health and Personal Care.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1098.000000    1104.000000     1104.000000   1104.000000
mean      4.174317    3590.625906      481.001350    747.379529
std       0.297690    8451.388251     1855.095522   2098.742418
min       2.400000       0.000000        0.000000      0.000000
25%       4.000000     350.750000      174.750000    250.000000
50%       4.200000    1259.000000      291.000000    412.000000
75%       4.400000    3346.500000      499.000000    799.000000
max       5.000000  155342.000000    56999.000000  61999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link         

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.089220        0.097349      0.106339
no_of_ratings   0.089220       1.000000        0.014908      0.037176
discount_price  0.097349       0.014908        1.000000      0.961286
actual_price    0.106339       0.037176        0.961286      1.000000


=== Data Completeness ===
Percentage of complete rows: 81.01%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                  name  ratings
209  LUCHILA Alert Mini Portable Air Cooler,Persona...      5.0
252  STARDOM Eagle Ultra High speed Smoke Brown 24 ...      5.0
302  Unleash Storm 6 inch Exhaust Fan For Kitchen, ...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                   name  no_of_ratings
1326    Pure Enrichment Ultrasonic Cool Mist Humidifier        83178.0
628   Bounty P

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Home Improvement.csv.

=== Analysis for Home Storage.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1087.000000    1224.000000     1224.000000   1224.000000
mean      3.939006     360.043301      682.226748   1460.792157
std       0.617580    1220.349784      958.077727   2109.234598
min       1.000000       0.000000        0.000000      0.000000
25%       3.700000       4.000000      249.000000    549.000000
50%       4.000000      31.000000      399.000000    998.500000
75%       4.300000     175.000000      749.000000   1499.250000
max       5.000000   16743.000000    11679.000000  37800.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0     0.00000
link               

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Indoor Lighting.csv.

=== Analysis for Industrial and Scientific Supplies.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  603.000000     624.000000      624.000000    624.000000
mean     3.969486     840.280449      725.020978   1601.051651
std      0.444267    3407.486334     1849.235952   3916.610357
min      2.500000       0.000000        0.000000      0.000000
25%      3.700000      23.750000      195.750000    423.750000
50%      4.000000      97.000000      318.500000    799.000000
75%      4.200000     438.500000      599.000000   1462.250000
max      5.000000   62649.000000    25959.000000  69900.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link   

  correlation = df.corr()


=== Analysis for Innerwear.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  11147.000000   19152.000000    19152.000000  19152.000000
mean       3.784005     128.835109      460.318818    968.505607
std        0.835840    1945.009240      612.087935   1140.520253
min        1.000000       0.000000        0.000000      0.000000
25%        3.400000       0.000000      229.000000    499.000000
50%        3.900000       1.000000      389.000000    719.000000
75%        4.200000      13.000000      579.000000    999.000000
max        5.000000  219589.000000    17546.190000  35092.380000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings             

  correlation = df.corr()


Transformed data saved for Innerwear.csv.

=== Analysis for International Music.csv ===
=== Basic Statistics ===
Basic Statistics:
       ratings  no_of_ratings  discount_price  actual_price
count      0.0            0.0             0.0           0.0
mean       NaN            NaN             NaN           NaN
std        NaN            NaN             NaN           NaN
min        NaN            NaN             NaN           NaN
25%        NaN            NaN             NaN           NaN
50%        NaN            NaN             NaN           NaN
75%        NaN            NaN             NaN           NaN
max        NaN            NaN             NaN           NaN


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                       0.0         NaN
main_category              0.0         NaN
sub_category               0.0         NaN
image                      0.0         NaN
link                       0.0         NaN
ratings     

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Jeans.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  5909.000000   19200.000000    19200.000000  19200.000000
mean      3.555830      37.254010     1285.302347   2674.690986
std       0.952459     954.044058     1116.283192   1859.596460
min       1.000000       0.000000        0.000000      0.000000
25%       3.100000       0.000000      685.000000   1599.000000
50%       3.600000       0.000000     1199.000000   2595.000000
75%       4.000000       1.000000     1765.515000   3499.000000
max       5.000000   94676.000000    25830.000000  40495.950000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                  13291   

  correlation = df.corr()


Transformed data saved for Jeans.csv.

=== Analysis for Jewellery.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  11552.000000   19152.000000    19152.000000  1.915200e+04
mean       3.813452      36.173820     1956.448994  4.089435e+03
std        0.975573     369.761778    10225.583672  1.981297e+04
min        1.000000       0.000000        0.000000  0.000000e+00
25%        3.300000       0.000000      289.000000  8.990000e+02
50%        4.000000       1.000000      499.000000  1.500000e+03
75%        4.500000       6.000000     1036.750000  2.999000e+03
max        5.000000   27672.000000   499999.000000  1.200000e+06


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0     0.00000
link                    

  correlation = df.corr()


Transformed data saved for Jewellery.csv.

=== Analysis for Kids Clothing.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1333.000000    2064.000000     2064.000000   2064.000000
mean      3.886197     119.046512      582.410635   1178.109956
std       0.661810     587.775580      507.072407    841.138362
min       1.000000       0.000000        0.000000      0.000000
25%       3.600000       0.000000      331.000000    699.000000
50%       4.000000       4.000000      499.000000    999.000000
75%       4.200000      45.250000      714.000000   1400.000000
max       5.000000   19564.000000    12350.000000  17649.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                     

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Kids Shoes.csv.

=== Analysis for Kids Watches.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  230.000000    2016.000000     2016.000000   2016.000000
mean     3.538261      18.156250      615.250382   1161.391096
std      0.980472     302.328574      402.089095    559.998977
min      1.000000       0.000000        0.000000      0.000000
25%      3.000000       0.000000      359.000000    990.000000
50%      3.700000       0.000000      599.000000    990.000000
75%      4.100000       0.000000      699.000000   1295.000000
max      5.000000    7703.000000     8236.770000   9499.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0     0.00000
link                         0    

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Kitchen and Home Appliances.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  8811.000000    9600.000000     9600.000000   9600.000000
mean      3.872773     721.000208     2781.317632   4743.826460
std       0.609389    3569.554436     5845.512531   9345.361636
min       1.000000       0.000000        0.000000      0.000000
25%       3.600000       8.000000      349.000000    800.000000
50%       3.900000      55.000000      890.000000   1799.000000
75%       4.200000     309.000000     2619.250000   4380.000000
max       5.000000  128941.000000    66990.000000  99999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0     0.00000
link                         0     0.00000
ratings    

  correlation = df.corr()
  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.072634        0.078412      0.046338
no_of_ratings   0.072634       1.000000       -0.020105     -0.048760
discount_price  0.078412      -0.020105        1.000000      0.845907
actual_price    0.046338      -0.048760        0.845907      1.000000


=== Data Completeness ===
Percentage of complete rows: 93.22%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                  name  ratings
166  KRISHTAL TRADING Unbreakable Water Bottle 1 Li...      5.0
240  AB99 COLLECTION Steel 3-Tier Storage Corners R...      5.0
298  JUGTE Water Bottle for Kids Stainless Steel Bo...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                 name  no_of_ratings
40  Amazon Brand - Solimo Revolving Plastic Spice ...        21701.0
39  Treo By Milton

  correlation = df.corr()


Transformed data saved for Lingerie and Nightwear.csv.

=== Analysis for Luxury Beauty.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  516.000000     864.000000      864.000000    864.000000
mean     4.079457     319.538194      975.594722   2230.933461
std      0.739670    3132.846846     1598.816956   2320.355586
min      1.000000       0.000000        0.000000      0.000000
25%      3.875000       0.000000        0.000000    699.750000
50%      4.200000       2.000000      473.000000   1427.500000
75%      4.500000      72.250000     1200.000000   2800.000000
max      5.000000   84763.000000    13579.000000  18999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                 

  correlation = df.corr()
  correlation = df.corr()


Transformed data saved for Make-up.csv.

=== Analysis for Mens Fashion.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  17242.000000   19200.000000    19200.000000  1.920000e+04
mean       3.885373     564.600208     1139.508540  2.607665e+03
std        0.577820    2613.596938     5333.678049  1.297983e+04
min        1.000000       0.000000        0.000000  0.000000e+00
25%        3.600000       6.000000      287.000000  9.500000e+02
50%        3.900000      47.000000      499.000000  1.499000e+03
75%        4.200000     281.250000      931.000000  2.499000e+03
max        5.000000  157476.000000   406009.000000  1.200000e+06


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link               

  correlation = df.corr()


Transformed data saved for Mens Fashion.csv.

=== Analysis for Motorbike Accessories and Parts.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  990.000000    1224.000000     1224.000000   1224.000000
mean     3.732525     124.901961      645.531781   1352.700139
std      0.783121     858.565226      890.194350   1597.773409
min      1.000000       0.000000        0.000000      0.000000
25%      3.400000       1.000000      229.000000    500.000000
50%      3.800000       8.000000      358.500000    900.000000
75%      4.200000      40.000000      699.000000   1499.000000
max      5.000000   19554.000000     9909.000000  17999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link         

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Personal Care Appliances.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price   actual_price
count  356.000000    1224.000000     1224.000000    1224.000000
mean     3.455056     167.805556     1195.045980    2164.664273
std      1.140868    1663.687670     3772.704905    5076.593805
min      1.000000       0.000000        0.000000       0.000000
25%      3.000000       0.000000      299.000000     899.000000
50%      3.600000       0.000000      598.500000    1299.000000
75%      4.300000       2.000000     1041.500000    2093.000000
max      5.000000   43682.000000    95000.000000  120000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings       

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Running.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  763.000000     912.000000      912.000000  9.120000e+02
mean     3.811533     291.122807      835.751458  6.877669e+04
std      0.682591    1316.073132     1020.148739  2.022601e+06
min      1.000000       0.000000        0.000000  0.000000e+00
25%      3.600000       2.000000      339.000000  9.290000e+02
50%      3.900000      19.000000      538.000000  1.299000e+03
75%      4.100000     121.250000      899.000000  1.999000e+03
max      5.000000   26369.000000     8159.000000  6.108299e+07


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                    149   16.3377

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Security Cameras.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  4059.000000    9576.000000     9576.000000    9576.000000
mean      3.319143     175.525689     4270.130966    8178.820432
std       1.094034    3338.542993     7452.679279   13118.765746
min       1.000000       0.000000        0.000000       0.000000
25%       2.800000       0.000000      729.750000    1999.000000
50%       3.400000       0.000000     1868.500000    4000.000000
75%       4.000000       4.000000     4235.500000    7999.000000
max       5.000000   99243.000000   159999.000000  199999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings      

  correlation = df.corr()


                 ratings  no_of_ratings  discount_price  actual_price
ratings         1.000000       0.032649        0.025488      0.022564
no_of_ratings   0.032649       1.000000        0.027639      0.033665
discount_price  0.025488       0.027639        1.000000      0.901799
actual_price    0.022564       0.033665        0.901799      1.000000


=== Data Completeness ===
Percentage of complete rows: 85.38%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                 name  ratings
6   CHIMMET Double Sided Tape Heavy Duty - Multipu...      5.0
75  GOODIEBOX 5 Pieces Painting Knives Stainless S...      5.0
79  Indiginous Borosilate Glass Milk Lactometer fo...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                  name  no_of_ratings
814  Nano Double Sided Tape, EZlifego Traceless Was...        81051.0
21   Crafts 4 ALL Ac

  correlation = df.corr()


Transformed data saved for Shirts.csv.

=== Analysis for Shoes.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1429.000000    1872.000000      1872.00000   1872.000000
mean      3.770679     252.386218       901.68687   1899.282489
std       0.702078    1649.066677      1242.79835   2114.102015
min       1.000000       0.000000         0.00000      0.000000
25%       3.400000       1.000000       349.00000    799.000000
50%       3.800000       7.000000       550.50000   1066.000000
75%       4.100000      46.000000       899.67250   1999.000000
max       5.000000   46986.000000     12154.00000  19999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0     0.00000
main_category                0     0.00000
sub_category                 0     0.00000
image                        0     0.00000
link                         0     0

  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Speakers.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price   actual_price
count  3853.000000    9600.000000     9600.000000    9600.000000
mean      3.677135     637.500625     2591.245285    4370.693580
std       0.939584    5775.647028     8370.519472   12410.207761
min       1.000000       0.000000        0.000000       0.000000
25%       3.200000       0.000000      698.750000    1599.000000
50%       3.800000       0.000000      999.000000    1599.000000
75%       4.300000       6.000000      999.000000    2099.250000
max       5.000000  150099.000000   183980.000000  329999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings              

  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Sports Shoes.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  10971.000000   19200.000000    19200.000000  19200.000000
mean       3.718130     155.808333     1523.859749   3758.168953
std        0.825148    1468.764319     2190.660601   5699.255009
min        1.000000       0.000000        0.000000      0.000000
25%        3.400000       0.000000      499.000000    999.000000
50%        3.800000       1.000000      949.000000   1699.000000
75%        4.100000      22.000000     1619.000000   3999.000000
max        5.000000   86521.000000    66649.000000  66779.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings          

  correlation = df.corr()


=== Analysis for Sportswear.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  6437.000000    7371.000000     7371.000000   7371.000000
mean      4.110704     364.996880      918.900578  12485.000389
std       0.645866    1995.979502     2088.054529  11932.087111
min       1.000000       0.000000        0.000000      0.000000
25%       3.800000       3.000000        0.000000   2697.000000
50%       4.200000      23.000000        0.000000   9206.080000
75%       4.500000     136.000000     1168.500000  17996.400000
max       5.000000   71125.000000    26997.300000  63909.810000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                    9

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Suitcases and Trolley Bags.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price   actual_price
count  424.000000    1152.000000     1152.000000    1152.000000
mean     3.831604      76.315972     4203.062630   10214.749731
std      0.988153     844.467083     6431.934906   12481.894880
min      1.000000       0.000000        0.000000       0.000000
25%      3.500000       0.000000      381.000000    2166.000000
50%      4.000000       0.000000     2137.840000    5999.000000
75%      4.500000       5.000000     5750.000000   14709.750000
max      5.000000   24733.000000   129032.000000  190578.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings     

  correlation = df.corr()
  correlation = df.corr()


=== Analysis for T-shirts and Polos.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  12956.000000   19104.000000    19104.000000  19104.000000
mean       3.851829      74.646723      571.670442   1335.559333
std        0.784468     751.675312      392.931247    734.755285
min        1.000000       0.000000        0.000000      0.000000
25%        3.500000       0.000000      370.000000    899.000000
50%        3.900000       3.000000      499.000000   1199.000000
75%        4.300000      15.000000      699.000000   1599.000000
max        5.000000   60020.000000    10231.000000  15995.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings    

  correlation = df.corr()


Transformed data saved for T-shirts and Polos.csv.

=== Analysis for Televisions.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  584.000000    1104.000000    1.104000e+03  1.104000e+03
mean     4.047089    1189.620471    2.567412e+04  4.131827e+04
std      0.697625    5188.096279    5.705825e+04  8.070039e+04
min      1.000000       0.000000    0.000000e+00  0.000000e+00
25%      3.900000       0.000000    0.000000e+00  0.000000e+00
50%      4.200000       1.000000    9.999000e+03  1.999900e+04
75%      4.400000     138.000000    2.999900e+04  5.000000e+04
max      5.000000   47642.000000    1.249990e+06  1.594900e+06


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                       

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Toys and Games.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price  actual_price
count  876.000000     912.000000      912.000000    912.000000
mean     4.002397    1373.487939      681.959364   1319.165757
std      0.535766    6453.896796      917.977167   1453.514865
min      1.000000       0.000000        0.000000      0.000000
25%      3.800000      18.000000      258.000000    499.000000
50%      4.100000     122.000000      399.000000    999.000000
75%      4.300000     636.250000      699.000000   1499.000000
max      5.000000  157194.000000    11299.000000  16200.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                     36   

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


Percentage of complete rows: 72.64%


=== Top 3 Products with the Highest Rating ===
Top 3 Products with the Highest Rating:
                                                  name  ratings
100  DARKZONE Clear Transparent, Durable Wind-Resis...      5.0
105  Synaty Waist Bag for Men & Women, Slim Fanny P...      5.0
166  Stolenband® Shoe Bag for Travel & Storage Trav...      5.0


=== Top 3 Products with the Most Ratings ===
Top 3 Products with the Most Ratings:
                                                   name  no_of_ratings
5     3M 1110 Ear Plugs Corded, Extra Soft, Reusable...        33642.0
1607                     Under Armour Adult Sports Mask        23957.0
0     GLUN Bolt Electronic Portable Fishing Hook Typ...        18665.0


=== Top 3 Products with the Most Ratings, Sorted ===
Top 3 Products with the Most Ratings, Sorted by Highest:
                                                   name  no_of_ratings
5     3M 1110 Ear Plugs Corded, Extra Soft, Reusable...        3364

  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()
  correlation = df.corr()


=== Analysis for Washing Machines.csv ===
=== Basic Statistics ===
Basic Statistics:
          ratings  no_of_ratings  discount_price   actual_price
count  633.000000    1440.000000     1440.000000    1440.000000
mean     3.981201     161.206944    10883.757271   16135.267931
std      0.806958     956.951010    15533.087389   24303.569386
min      1.000000       0.000000        0.000000       0.000000
25%      3.800000       0.000000        0.000000     578.750000
50%      4.200000       0.000000     3999.000000    9999.000000
75%      4.400000      12.000000    17409.250000   23922.500000
max      5.000000   24086.000000   230000.000000  600000.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings               

  correlation = df.corr()


Transformed data saved for Watches.csv.

=== Analysis for Western Wear.csv ===
=== Basic Statistics ===
Basic Statistics:
            ratings  no_of_ratings  discount_price  actual_price
count  15646.000000   19200.000000    19200.000000  19200.000000
mean       3.774102     115.528542      605.543257   1616.823514
std        0.722897     490.934025      464.262938   1031.958070
min        1.000000       0.000000        0.000000      0.000000
25%        3.400000       1.000000      349.000000    999.000000
50%        3.800000       8.000000      499.000000   1400.000000
75%        4.100000      48.000000      750.000000   1999.000000
max        5.000000   25809.000000    14098.000000  49999.000000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link               

  correlation = df.corr()


=== Analysis for Womens Fashion.csv ===
=== Basic Statistics ===
Basic Statistics:
           ratings  no_of_ratings  discount_price  actual_price
count  1873.000000    2112.000000     2112.000000   2112.000000
mean      3.909130     429.457860      811.657401   2187.363480
std       0.527195    2111.540106     2229.037946   3063.651103
min       1.000000       0.000000        0.000000      0.000000
25%       3.600000       7.000000      289.000000    999.000000
50%       3.900000      51.000000      469.000000   1499.000000
75%       4.200000     288.000000      799.000000   2499.000000
max       5.000000   67255.000000    64111.000000  70522.100000


=== Missing Values Analysis ===
Missing Values Analysis:
                Missing Values  Percentage
name                         0    0.000000
main_category                0    0.000000
sub_category                 0    0.000000
image                        0    0.000000
link                         0    0.000000
ratings                 

  correlation = df.corr()
  correlation = df.corr()
