# Product Analyst Project: Advanced EDA and Feature Analysis

## 1. Import Required Libraries and Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

# Read the raw feature-usage export into the working DataFrame and preview
# its first rows (the trailing expression is what the notebook displays).
raw_csv_path = "../data/Product_Feature_Usage_Complex.csv"
df = pd.read_csv(raw_csv_path)
df.head()

## 2. Basic Data Exploration

In [None]:
# Structural overview of the frame: dimensions, per-column dtypes, and the
# number of nulls in each column, printed as "label value" pairs.
overview = (
    ("Data Shape:", df.shape),
    ("Data Types:\n", df.dtypes),
    ("Missing Values:\n", df.isna().sum()),
)
for label, value in overview:
    print(label, value)

# Summary statistics across every column, numeric and non-numeric alike.
df.describe(include='all')


## 3. Data Cleaning & Feature Engineering

In [None]:
# Parse the session timestamp so the .dt accessor can be used below.
df['session_date'] = pd.to_datetime(df['session_date'])

# Derive calendar features from the parsed timestamp.
df['month'] = df['session_date'].dt.month
df['day'] = df['session_date'].dt.day
df['hour'] = df['session_date'].dt.hour
df['weekday'] = df['session_date'].dt.day_name()

# Drop rows that are exact duplicates across all columns.
df = df.drop_duplicates()

# Trim outliers: remove only sessions strictly longer than the 99th percentile.
# - `<=` keeps rows tied with the cutoff. With a strict `<`, heavily tied data
#   (e.g. capped or constant durations) would lose far more than 1% of rows,
#   up to the entire frame.
# - Rows with a missing duration are kept explicitly: NaN compares False, so a
#   bare comparison would silently discard them even though they are not
#   outliers.
# - .copy() detaches the result from the pre-filter frame so later column
#   assignments act on an independent DataFrame.
q_hi = df['session_duration_min'].quantile(0.99)
duration = df['session_duration_min']
df = df[duration.isna() | (duration <= q_hi)].copy()

# Renumber rows 0..n-1 after the removals, then preview the cleaned frame.
df = df.reset_index(drop=True)
df.head()


## 4. Session Duration Analysis

In [None]:
# Overall distribution of session length: 40-bin histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['session_duration_min'], bins=40, kde=True, ax=ax)
ax.set_title("Session Duration Distribution")
ax.set_xlabel("Minutes")
ax.set_ylabel("Frequency")
plt.show()

# Session length broken down by the feature used, one box per feature.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x='feature_used', y='session_duration_min', ax=ax)
plt.xticks(rotation=45)  # tilt the feature labels so they stay readable
ax.set_title("Session Duration by Feature Used")
plt.show()


## 5. Conversion Analysis

In [None]:
# Mean of conversion_flag per feature, ordered from highest to lowest.
conversion_rate = (
    df.groupby('feature_used')['conversion_flag']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the per-feature rates.
ax = conversion_rate.plot(kind='bar', figsize=(12, 6), color='green')
ax.set_title("Conversion Rate by Feature")
ax.set_ylabel("Conversion Rate")
ax.set_xlabel("Feature")
plt.xticks(rotation=45)
plt.show()


## 6. A/B Testing Impact on Conversions

In [None]:
# Mean of conversion_flag and number of rows for each A/B test group.
ab_group_stats = df.groupby('ab_test_group')['conversion_flag'].agg(['mean', 'count'])
print(ab_group_stats)

# A gap between raw rates is not evidence on its own, so test whether the
# conversion outcome is independent of the test group.
# NOTE(review): this treats conversion_flag as a categorical outcome
# (presumably 0/1, since its mean is reported as a rate) - confirm.
ab_contingency = pd.crosstab(df['ab_test_group'], df['conversion_flag'])
if ab_contingency.shape[0] >= 2 and ab_contingency.shape[1] >= 2:
    chi2_stat, p_value, dof, _ = stats.chi2_contingency(ab_contingency)
    print(
        f"Chi-square test of independence: "
        f"chi2={chi2_stat:.3f}, dof={dof}, p-value={p_value:.4f}"
    )
else:
    # With a single group or a single outcome value there is nothing to compare.
    print("Chi-square test skipped: need at least two groups and two conversion outcomes.")

# Bar height is the mean of conversion_flag within each group.
sns.barplot(x='ab_test_group', y='conversion_flag', data=df)
plt.title("A/B Test Group Conversion Rate")
plt.ylabel("Conversion Rate")
plt.show()


## 7. Platform and Device Analysis

In [None]:
# Row counts per platform, with one bar per device type inside each platform.
ax = sns.countplot(x='platform', hue='device_type', data=df)
ax.set_title("Platform vs Device Usage")
plt.show()

# Mean session length for each platform.
avg_duration_by_platform = df.groupby('platform')['session_duration_min'].mean()
ax = avg_duration_by_platform.plot(kind='bar', title="Avg. Session Duration by Platform")
ax.set_ylabel("Minutes")
plt.show()


## 8. User Behavior: Active Days, Repeats, Retention

In [None]:
# Count distinct calendar days on which each user had a session.
# The notebook derives an `hour` feature from session_date, so the column
# presumably carries a time-of-day component; calling nunique() on the raw
# values would then count distinct timestamps, not days. Normalizing to
# midnight first makes the result match the "Active Days" labels below.
active_day = pd.to_datetime(df['session_date']).dt.normalize()
user_activity = active_day.groupby(df['user_id']).nunique()

# Histogram of how many users fall into each active-day count.
plt.hist(user_activity, bins=30)
plt.title("User Active Days Distribution")
plt.xlabel("Number of Active Days")
plt.ylabel("Number of Users")
plt.show()


## 9. Loyalty Tier vs Satisfaction & Conversion

In [None]:
# Spread of satisfaction scores within each loyalty tier.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='loyalty_tier', y='satisfaction_score', data=df, ax=ax)
ax.set_title("Satisfaction Score by Loyalty Tier")
plt.show()

# Mean of conversion_flag for each loyalty tier, drawn as a bar chart.
conversion_by_tier = df.groupby('loyalty_tier')['conversion_flag'].mean()
ax = conversion_by_tier.plot(kind='bar', title="Conversion Rate by Loyalty Tier")
ax.set_ylabel("Conversion Rate")
plt.show()


## 10. Save Cleaned Dataset

In [None]:
# Write the working DataFrame to disk without the row index, then confirm.
cleaned_csv_path = "../data/Cleaned_Product_Feature_Usage.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved.")
