# In [None]:
# EDA.ipynb
# Exploratory Data Analysis for Multi-Product Recommendation System

# Step 1: Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Step 2: Load processed datasets
def load_json_data(file_path):
    """Load a JSON file and flatten it into a pandas DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` on the parsed
        content; nested objects become dot-separated columns.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding (e.g. cp1252 on Windows), which can garble or reject
    # non-ASCII titles and names.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.json_normalize(data)

# Load the four processed product datasets, one DataFrame per product type.
# The paths are relative ('../data/processed/...'), so they resolve against
# the current working directory — presumably the notebook's own folder,
# sitting next to `data/`; TODO confirm.
books_df = load_json_data('../data/processed/processed_books.json')
songs_df = load_json_data('../data/processed/processed_songs.json')
movies_df = load_json_data('../data/processed/processed_movies.json')
clothes_df = load_json_data('../data/processed/processed_clothes.json')

# Step 3: Overview of the data
# Print a heading and show the first rows of each dataset, in a fixed order
for _heading, _frame in (
    ("Books Dataset:", books_df),
    ("Songs Dataset:", songs_df),
    ("Movies Dataset:", movies_df),
    ("Clothes Dataset:", clothes_df),
):
    print(_heading)
    display(_frame.head())

# Step 4: Data Summary Statistics
# Print a heading and show DataFrame.describe() for each dataset
for _heading, _frame in (
    ("Summary Statistics for Books Dataset:", books_df),
    ("Summary Statistics for Songs Dataset:", songs_df),
    ("Summary Statistics for Movies Dataset:", movies_df),
    ("Summary Statistics for Clothes Dataset:", clothes_df),
):
    print(_heading)
    display(_frame.describe())

# Step 5: Visualize the distribution of ratings for all datasets
def plot_rating_distribution(df, title, rating_column='average_rating'):
    """Show a 20-bin histogram (with KDE overlay) of one rating column.

    Args:
        df: DataFrame that contains ``rating_column``.
        title: Dataset label inserted into the plot title.
        rating_column: Name of the column to plot; also used as the
            x-axis label.
    """
    plt.figure(figsize=(10, 6))
    # seaborn draws on the figure just created and hands back its Axes,
    # so the labels can be set on that object directly.
    axes = sns.histplot(df[rating_column], bins=20, kde=True)
    axes.set_title(f'Distribution of {rating_column} in {title}')
    axes.set_xlabel(f'{rating_column}')
    axes.set_ylabel('Frequency')
    plt.show()

# Rating Distribution: one histogram per dataset, using the default rating column
for _frame, _label in (
    (books_df, 'Books'),
    (songs_df, 'Songs'),
    (movies_df, 'Movies'),
    (clothes_df, 'Clothes'),
):
    plot_rating_distribution(_frame, _label)

# Step 6: Top-Rated Products (sorted by rating, ties broken by ratings count)
def top_rated_products(df, title, rating_column='average_rating', count_column='ratings_count', top_n=10, name_column='title'):
    """Bar-plot the ``top_n`` highest-rated rows of ``df``.

    Rows are sorted by ``rating_column`` in descending order, with
    ``count_column`` (also descending) breaking ties; the first ``top_n``
    rows are then plotted.

    Args:
        df: DataFrame containing ``rating_column``, ``count_column`` and
            ``name_column``.
        title: Dataset label inserted into the plot title.
        rating_column: Column holding the rating shown on the y-axis.
        count_column: Column used as the secondary sort key.
        top_n: Number of rows to keep after sorting.
        name_column: Column whose values label the bars on the x-axis.
            Defaults to ``'title'``, the name that was previously
            hard-coded, so existing calls behave as before.
    """
    top_products = df.sort_values(by=[rating_column, count_column], ascending=False).head(top_n)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_products[name_column], y=top_products[rating_column])
    # Product names tend to be long; vertical tick labels keep them legible.
    plt.xticks(rotation=90)
    plt.title(f'Top {top_n} {title} by {rating_column}')
    plt.ylabel(f'{rating_column}')
    plt.show()

# Top Rated Products: one bar chart per dataset, using the default columns
for _frame, _label in (
    (books_df, 'Books'),
    (songs_df, 'Songs'),
    (movies_df, 'Movies'),
    (clothes_df, 'Clothes'),
):
    top_rated_products(_frame, _label)

# Step 7: Correlation Analysis
# Correlation between ratings and other numerical features (e.g., duration, release_year)
def plot_correlation_heatmap(df, title):
    """Show an annotated heatmap of correlations between numeric columns.

    Only numeric columns take part in the correlation; text or other
    non-numeric columns are ignored. If ``df`` has no numeric columns, a
    short message is printed and nothing is drawn.

    Args:
        df: DataFrame to analyse.
        title: Dataset label inserted into the plot title.
    """
    # Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
    # by default and raises on them, so select the numeric ones explicitly.
    # select_dtypes works the same on older pandas versions too.
    corr_matrix = df.select_dtypes(include='number').corr()
    if corr_matrix.empty:
        # An empty matrix cannot be drawn as a heatmap; skip the plot
        # rather than fail part-way through the analysis.
        print(f'No numeric columns to correlate for {title}')
        return
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Matrix for {title}')
    plt.show()

# Correlation Heatmaps: one matrix per dataset
for _frame, _label in (
    (books_df, 'Books'),
    (songs_df, 'Songs'),
    (movies_df, 'Movies'),
    (clothes_df, 'Clothes'),
):
    plot_correlation_heatmap(_frame, _label)

# Step 8: Genre Analysis
# Visualize the number of products per genre/category
def plot_genre_distribution(df, genre_column, title):
    """Show a count plot of ``genre_column``, most frequent value first.

    Args:
        df: DataFrame that contains ``genre_column``.
        genre_column: Name of the categorical column to count.
        title: Dataset label inserted into the plot title.
    """
    plt.figure(figsize=(12, 6))
    # value_counts() sorts by frequency (descending), which fixes bar order.
    bars_by_frequency = df[genre_column].value_counts().index
    sns.countplot(data=df, x=genre_column, order=bars_by_frequency)
    plt.xticks(rotation=90)
    plt.title(f'Genre Distribution in {title}')
    plt.show()

# Plot Genre/Category Distribution (clothes are grouped by 'category' instead of 'genre')
for _frame, _column, _label in (
    (books_df, 'genre', 'Books'),
    (songs_df, 'genre', 'Songs'),
    (movies_df, 'genre', 'Movies'),
    (clothes_df, 'category', 'Clothes'),
):
    plot_genre_distribution(_frame, _column, _label)

# Step 9: Yearly Trends (e.g., number of products released each year)
def plot_release_year_trend(df, year_column, title):
    """Show a 30-bin histogram (with KDE overlay) of a release-year column.

    Args:
        df: DataFrame that contains ``year_column``.
        year_column: Name of the column holding the release year; also
            used as the x-axis label.
        title: Dataset label inserted into the plot title.
    """
    plt.figure(figsize=(10, 6))
    # Configure the Axes returned by seaborn rather than going through
    # pyplot's current-axes state.
    axes = sns.histplot(df[year_column], bins=30, kde=True)
    axes.set_title(f'Trend of {title} Releases Over Time')
    axes.set_xlabel(f'{year_column}')
    axes.set_ylabel('Number of Products Released')
    plt.show()

# Plot Trends for Release Year (books, songs and movies only)
for _frame, _label in (
    (books_df, 'Books'),
    (songs_df, 'Songs'),
    (movies_df, 'Movies'),
):
    plot_release_year_trend(_frame, 'release_year', _label)

# Step 10: Conclusion and Findings
# After completing the analysis, you can document and share findings such as:
# - Most popular genres
# - Trends over time
# - Correlations between ratings and other features

