# Exploratory Data Analysis: Before vs. After Cleaning

In [ ]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pymongo import MongoClient

# --- Configuration ---
# Connection settings for the MongoDB instance running on the local machine.
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DB = 'DataMiningProject'

client = MongoClient(MONGO_URI)
db = client[MONGO_DB]

# Raw side: every document in the dirty collection, unfiltered.
raw_cursor = db['raw_dirty'].find()
raw_df = pd.DataFrame(list(raw_cursor))

# Cleaned side: only events that carry a non-null `value`.
has_value = {'value': {'$ne': None}}
cleaned_cursor = db['cleaned.events'].find(has_value)
cleaned_df = pd.DataFrame(list(cleaned_cursor))

print(f"Loaded {len(raw_df)} raw records and {len(cleaned_df)} cleaned records.")

## 1. Missing Data Comparison

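Let's compare the count of missing values for key fields before and after cleaning. Note that the cleaned set was loaded with a filter that keeps only events with a non-null `value`, so null `value` entries are already excluded on the cleaned side.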

In [ ]:
# Count missing values per key field in the raw and cleaned DataFrames and
# print the two counts side by side.
key_fields = ['timestamp', 'value']

# MongoDB is schema-less: a field that is absent from every loaded document
# (or an empty collection) yields no DataFrame column at all, and plain
# `df[key_fields]` would then raise KeyError. `reindex` adds any absent
# column filled with NaN, so such a field is counted as missing in every row.
missing_raw = (
    raw_df.reindex(columns=key_fields).isnull().sum().rename('Raw Missing')
)
missing_cleaned = (
    cleaned_df.reindex(columns=key_fields).isnull().sum().rename('Cleaned Missing')
)

# One row per field, one column per dataset.
comparison_df = pd.concat([missing_raw, missing_cleaned], axis=1)
print("Missing Data Before vs. After:")
print(comparison_df)

## 2. Distribution of 'value' Field

Here we visualize the distribution of the numeric `value` field. The raw data contains non-numeric entries, so we coerce the column to numeric for plotting (entries that cannot be parsed become NaN and are left out of the histogram), which simulates what our cleaning script does.

In [ ]:
# Plot the distribution of `value` before and after cleaning, side by side,
# and save the figure under ../reports/visuals.

# Coerce raw 'value' to numeric; anything that cannot be parsed becomes NaN.
raw_df['value_numeric'] = pd.to_numeric(raw_df['value'], errors='coerce')

# The seaborn style has to be set BEFORE the axes are created: it works by
# updating matplotlib rcParams, which are read when an Axes is built. Setting
# it after plt.subplots() would leave this figure without the grid style.
sns.set_style('whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(15, 6), sharey=True)

# Plot dirty data distribution (NaNs produced by the coercion are dropped).
sns.histplot(raw_df['value_numeric'].dropna(), bins=30, kde=True, ax=axes[0], color='salmon')
axes[0].set_title('Distribution of Dirty `value` (Numeric Only)')
axes[0].set_xlabel('Value')

# Plot cleaned data distribution.
sns.histplot(cleaned_df['value'], bins=30, kde=True, ax=axes[1], color='skyblue')
axes[1].set_title('Distribution of Cleaned `value`')
axes[1].set_xlabel('Value')

fig.suptitle('Comparison of Data Distributions Before and After Cleaning')
# Leave the top 5% of the figure free so the suptitle does not overlap the axes.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# Save the plot to the reports/visuals folder. savefig does not create
# missing directories, so make sure the target folder exists first.
output_path = Path('../reports/visuals/distribution_comparison.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path)
plt.show()

## 3. Categorical Field Standardization

Let's check the `platform` field to see how inconsistent casing and nulls were handled.

In [ ]:
# Bar charts of `platform` frequencies (NaN kept as its own bar) for the raw
# and cleaned data, side by side, saved under ../reports/visuals.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (DataFrame, bar colour, panel title) for the left and right panel.
panels = [
    (raw_df, 'coral', 'Raw Platform Counts (with inconsistent casing & nulls)'),
    (cleaned_df, 'teal', 'Cleaned Platform Counts (standardized)'),
]
for ax, (frame, bar_color, title) in zip(axes, panels):
    # dropna=False keeps missing platforms visible as a separate category.
    frame['platform'].value_counts(dropna=False).plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

fig.suptitle('Platform Field Before and After Cleaning')
# Leave the top 5% of the figure free so the suptitle does not overlap the axes.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
output_path = Path('../reports/visuals/platform_standardization.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path)
plt.show()