In [3]:
import pandas as pd
import numpy as np
import zipfile
from textblob import TextBlob

# Path to the zip file
zip_file_path = r'C:\Users\chatu\Downloads\archive.zip'

# Extracting the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('extracted_files')

# Assuming the extracted CSV file is named 'amazon.csv'
csv_file_path = 'extracted_files/amazon.csv'

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Data Cleaning: Removing non-numeric characters from price columns and converting them to numeric
df['discounted_price'] = df['discounted_price'].str.replace('₹', '').str.replace(',', '').astype(float)
df['actual_price'] = df['actual_price'].str.replace('₹', '').str.replace(',', '').astype(float)
df['discount_percentage'] = df['discount_percentage'].str.replace('%', '').astype(float)

# Handling invalid rating entries
df['rating'] = df['rating'].replace('|', np.nan).astype(float)

# Removing commas from rating_count and converting to numeric
# Fill NaNs with 0 before conversion
df['rating_count'] = df['rating_count'].str.replace(',', '').fillna(0).astype(int)

# Checking for missing values
missing_values = df.isnull().sum()
print("Missing Values Summary:")
print(missing_values)

# Filling missing ratings with the mean rating
df['rating'] = df['rating'].fillna(df['rating'].mean())

# Descriptive Statistics
print("\nDescriptive Statistics for Numeric Columns:")
print(df[['discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count']].describe())

# Sentiment Analysis on the review_content column
df['review_sentiment'] = df['review_content'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Sentiment Statistics
print("\nReview Sentiment Statistics:")
print(df['review_sentiment'].describe())

# Correlation Analysis
correlation_matrix = df[['discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'review_sentiment']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Category Analysis: Distribution of products across categories
category_counts = df['category'].value_counts()
print("\nProduct Distribution Across Categories:")
print(category_counts)

# Save the cleaned and processed data to a new CSV file
df.to_csv('cleaned_amazon_data.csv', index=False)
print("\nCleaned data has been saved to 'cleaned_amazon_data.csv'.")


Missing Values Summary:
product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 1
rating_count           0
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64

Descriptive Statistics for Numeric Columns:
       discounted_price   actual_price  discount_percentage       rating  \
count       1465.000000    1465.000000          1465.000000  1465.000000   
mean        3125.310874    5444.990635            47.691468     4.096585   
std         6944.304394   10874.826864            21.635905     0.291574   
min           39.000000      39.000000             0.000000     2.000000   
25%          325.000000     800.000000            32.000000     4.000000   
50%          799.000000    1650.000000            50.000000   