# Exploratory Data Analysis

Notebook-driven EDA for the sampled 50k electronics reviews dataset.

## Cell 1 — Imports

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import numpy as np
import re
import os

# Apply the "ggplot" matplotlib style to the figures drawn in this notebook.
plt.style.use("ggplot")
# Select seaborn's "talk" plotting context.
sns.set_context("talk")

ModuleNotFoundError: No module named 'seaborn'

## Cell 2 — Load the Sampled 50k Reviews

In [None]:
# Location of the sampled reviews, relative to the notebook's working directory.
data_path = "../artifacts/raw_data/electronics_sample_50k.json"

# Fail fast with an actionable message rather than a bare traceback from open().
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Could not find dataset at {data_path}. Please ensure the sampled file exists."
    )

# Each line of the file is parsed as one JSON object. Decode explicitly as
# UTF-8 (the platform default encoding may differ) and skip blank lines,
# which json.loads would reject.
with open(data_path, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Build the frame once from the complete list of records.
df = pd.DataFrame(records)
df.head()

👉 This loads the sampled 50k JSON Lines dataset into a DataFrame.

## Cell 3 — Basic Dataset Information

In [None]:
# Print a summary of df: columns, non-null counts, dtypes and memory usage.
df.info()

## Cell 4 — Check Missing Values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

## Cell 5 — Review Length Analysis

In [None]:
# Character length of every review. Missing reviews are replaced by an empty
# string first so they count as 0 characters; astype(str) on its own would
# turn NaN into the 3-character string "nan" and skew the distribution.
df['review_length'] = df['reviewText'].fillna("").astype(str).str.len()

plt.figure(figsize=(10, 5))
sns.histplot(df['review_length'], bins=50)
plt.title("Review Length Distribution")
plt.xlabel("Length of Review (characters)")
plt.ylabel("Frequency")
plt.show()

## Cell 6 — WordCloud of All Review Text

In [None]:
# Join every non-missing review into one string. Missing entries are dropped
# because astype(str) would otherwise inject the literal word "nan" into the
# cloud once per missing review.
text = " ".join(df['reviewText'].dropna().astype(str))

# random_state pins the otherwise random word placement so the figure is
# identical on every Restart & Run All.
wc = WordCloud(
    width=1200,
    height=600,
    background_color="white",
    random_state=42,
).generate(text)

plt.figure(figsize=(14, 7))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud of Electronics Reviews")
plt.show()

## Cell 7 — Ratings Distribution (if available)

In [None]:
# Plot how many reviews fall on each rating value, provided the dataset
# carries an 'overall' column; otherwise report that it is absent.
if 'overall' not in df.columns:
    print("Ratings column 'overall' not found in dataset.")
else:
    plt.figure(figsize=(7, 4))
    # NOTE(review): newer seaborn releases may warn about `palette` without
    # `hue` — confirm against the installed version.
    sns.countplot(x=df['overall'], palette="viridis")
    plt.title("Ratings Distribution")
    plt.xlabel("Rating (overall)")
    plt.ylabel("Count")
    plt.show()

## Cell 8 — Most Common Words

In [None]:
def clean_text(text):
    """Lowercase ``text`` and blank out everything except a-z and spaces.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted. Every character outside ``a-z`` and the space character is
    replaced by a single space; runs of spaces are not collapsed.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r"[^a-z ]", " ", lowered)
    return letters_only

# Normalise every non-missing review and join the results into one corpus.
# Missing entries are dropped first: astype(str) would otherwise turn each of
# them into the word "nan" and inflate its frequency.
corpus = " ".join(df['reviewText'].dropna().astype(str).apply(clean_text))

# Whitespace tokenisation; clean_text has already replaced punctuation and
# digits with spaces.
words = corpus.split()
word_counts = Counter(words)

# The 20 most frequent tokens, shown as the cell output.
word_counts.most_common(20)

## Cell 9 — Word Count Distribution

In [None]:
# Whitespace-delimited word count per review. Missing reviews are treated as
# empty text so they count as 0 words; astype(str) alone would turn NaN into
# "nan" and report one word for every missing review.
df['word_count'] = df['reviewText'].fillna("").astype(str).str.split().str.len()

plt.figure(figsize=(10, 5))
sns.histplot(df['word_count'], bins=50)
plt.title("Word Count Distribution")
plt.xlabel("Words in Review")
plt.ylabel("Count")
plt.show()

## Cell 10 — Summary of EDA Insights

In [None]:
# Headline statistics, printed one per line as "<label> <value>".
# Reads the 'review_length' and 'word_count' columns of df.
summary_rows = [
    ("Total number of reviews:", len(df)),
    ("Average review length (chars):", df['review_length'].mean()),
    ("Average word count:", df['word_count'].mean()),
    ("Median review length:", df['review_length'].median()),
]
for label, value in summary_rows:
    print(label, value)

# Per-rating counts are only reported when the 'overall' column is present.
if 'overall' in df.columns:
    print("\nRating distribution:")
    print(df['overall'].value_counts())