# Exploratory Data Analysis

## Load and Inspect the Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw sales export into the notebook-level DataFrame.
raw_csv_path = "data/TechCorner_Sales_update.csv"
dataset = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first 10 rows to sanity-check the load.
dataset.head(10)

In [None]:
# Print the column summary (dtypes and non-null counts) of the raw data.
dataset.info()

In [None]:
# Summary statistics for the columns describe() covers by default.
dataset.describe()

In [None]:
# Clean column names: trim surrounding whitespace, turn spaces into
# underscores, drop question marks and lower-case everything.
# regex=False forces literal matching on every replacement; '?' is a regex
# quantifier, so it must never be interpreted as a pattern, whatever the
# installed pandas version uses as its default.
dataset.columns = (
    dataset.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('?', '', regex=False)
    .str.lower()
)

# Check cleaned column names
dataset.columns

In [None]:
# Re-print the column summary to confirm the renamed columns.
dataset.info()

In [None]:
# Preview the first rows under the cleaned column names.
dataset.head()

## Geographic distribution of customers (local vs. non-local)

Note that:
- Local = "Rangamati Sadar" or "Inside Rangamati"
- Non-Local = "Outside Rangamati"

In [None]:
# List the distinct values found in the cus._location column.
dataset['cus._location'].unique()

In [None]:
# Count how many distinct values cus._location holds.
dataset['cus._location'].nunique()

In [None]:
# Number of rows per distinct cus._location value.
dataset['cus._location'].value_counts()

In [None]:
# Define a new column for local vs. non-local.
# A row is 'local' when its location mentions 'Rangamati' without
# 'Outside' (case-sensitive literal match); everything else is 'non-local'.
# na=False makes a missing location evaluate to "no match" instead of
# raising TypeError as a plain `in` test on NaN would, so such rows fall
# into 'non-local'.
location = dataset['cus._location']
mentions_rangamati = location.str.contains('Rangamati', regex=False, na=False)
mentions_outside = location.str.contains('Outside', regex=False, na=False)
dataset['is_local'] = (mentions_rangamati & ~mentions_outside).map(
    {True: 'local', False: 'non-local'}
)


In [None]:
# Tally the rows in each is_local category and show the result.
is_local_column = dataset['is_local']
location_dist = is_local_column.value_counts()
print(location_dist)

In [None]:
# Bar chart of local vs. non-local customers.
# Colours are looked up by category label rather than by bar position, so
# 'local' is always green and 'non-local' always red regardless of which
# category is more frequent; any unexpected label falls back to gray.
category_colors = {'local': 'green', 'non-local': 'red'}
bar_colors = [category_colors.get(label, 'gray') for label in location_dist.index]
location_dist.plot(
    kind='bar',
    title='Customer Geographic Distribution (Local vs. Non-Local)',
    color=bar_colors,
)
plt.ylabel('Number of Customers')
plt.xlabel('Customer Type')
plt.tight_layout()
plt.show()

## New vs. Returning Customers

In [None]:
# Count each answer to "did he/she buy any mobile before".
bought_before_counts = dataset['did_he/she_buy_any_mobile_before'].value_counts()

# Move the answers out of the index so the result is a two-column table,
# then give both columns readable headers.
value_new_or_returning_df = bought_before_counts.reset_index()
value_new_or_returning_df.columns = ['New / Returning', 'Count']

value_new_or_returning_df

## Facebook-origin vs. walk-in

In [None]:
# Two-column table: answer to "does he/she come from Facebook page" and
# how many rows gave that answer.
facebook_origin_counts = dataset['does_he/she_come_from_facebook_page'].value_counts()
value_facebook_origin_df = facebook_origin_counts.reset_index()

value_facebook_origin_df.columns = ['Facebook-origin', 'Count']

value_facebook_origin_df

## Referrals

In [None]:
# Two-column table: answer to "did he/she hear of our shop before" and
# how many rows gave that answer.
heard_of_shop_counts = dataset['did_he/she_hear_of_our_shop_before'].value_counts()
value_heard_of_shop_df = heard_of_shop_counts.reset_index()

value_heard_of_shop_df.columns = ['Referrals', 'Count']

value_heard_of_shop_df

## Facebook Followers

In [None]:
# Two-column table: answer to "does he/she followed our page" and how many
# rows gave that answer.
value_facebook_followers_df = dataset['does_he/she_followed_our_page'].value_counts().reset_index()

# Header names the follower question; the previous 'Facebook-origin' label
# was copied from the Facebook-origin table and mislabelled this one.
value_facebook_followers_df.columns = ['Facebook Follower', 'Count']

value_facebook_followers_df

### Visualization

In [None]:
# Plot function
def plot_behavior(col, title):
    """Show a count plot of one column of the notebook-level ``dataset``.

    Args:
        col: Name of the ``dataset`` column whose values are counted along
            the x axis.
        title: Text used as the chart title.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # hue=col with legend=False colours each category from the palette
    # without drawing a legend that would only repeat the x labels.
    sns.countplot(data=dataset, x=col, hue=col, palette='Set2', legend=False, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Number of Customers')
    plt.setp(ax.get_xticklabels(), rotation=30)
    fig.tight_layout()
    plt.show()


In [None]:
# One count plot per customer-behaviour column, drawn in this order.
behavior_charts = [
    ('did_he/she_buy_any_mobile_before', 'New vs. Returning Customers'),
    ('does_he/she_come_from_facebook_page', 'Facebook-origin vs. walk-in'),
    ('did_he/she_hear_of_our_shop_before', 'Referrals'),
    ('does_he/she_followed_our_page', 'Followed Our Facebook Page'),
]
for behavior_col, chart_title in behavior_charts:
    plot_behavior(behavior_col, chart_title)

## Analyze Mobile Brand Preferences and Pricing Trends

### Top Selling Mobile Brands

In [None]:
# Ten most frequent mobile_name values, most frequent first.
mobile_sales_counts = dataset['mobile_name'].value_counts()
top_mobiles = mobile_sales_counts.head(10)

# Bar chart of those counts.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_mobiles.index, y=top_mobiles.values, color='steelblue', ax=ax)
ax.set_title('Top 10 Most Sold Mobile Brands')
ax.set_xlabel('Mobile Brand')
ax.set_ylabel('Number of Sales')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Same top-10 counts as a marker-and-line chart with a grid.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(top_mobiles.index, top_mobiles.values, marker='o', linestyle='-', color='teal')
ax.set_title('Top 10 Most Sold Mobile Brands - Line Chart')
ax.set_xlabel('Mobile Brand')
ax.set_ylabel('Number of Sales')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Pie chart: each slice is a model's share of the top-10 sales total.
slice_colors = sns.color_palette('viridis', len(top_mobiles))
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    top_mobiles.values,
    labels=top_mobiles.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
ax.set_title('Top 10 Most Sold Mobile Brands - Pie Chart')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular
fig.tight_layout()
plt.show()

### Average Sell Price by Mobile Brand

In [None]:
# Mean sell_price per mobile_name, highest average first.
price_by_mobile = dataset.groupby('mobile_name')['sell_price']
avg_price_per_mobile = price_by_mobile.mean().sort_values(ascending=False)

# Bar chart of the ten highest averages.
priciest_ten = avg_price_per_mobile.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=priciest_ten.index, y=priciest_ten.values, color='steelblue', ax=ax)
ax.set_title('Top 10 Most Expensive Mobile Brands (on Average)')
ax.set_xlabel('Mobile Brand')
ax.set_ylabel('Average Sell Price')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Ten highest per-model average prices (the series is already sorted
# in descending order).
top_10_avg_price = avg_price_per_mobile.head(10)

# Marker-and-line chart of those averages with a grid.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    top_10_avg_price.index,
    top_10_avg_price.values,
    marker='o',
    linestyle='-',
    color='darkorange',
)
ax.set_title('Top 10 Most Expensive Mobile Brands (Average Price) - Line Chart')
ax.set_xlabel('Mobile Brand')
ax.set_ylabel('Average Sell Price')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Pie chart of the ten highest average prices.
# NOTE(review): each percentage is a model's share of the SUM of the ten
# averages, not a share of revenue or units sold.
slice_colors = sns.color_palette('magma', len(top_10_avg_price))
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    top_10_avg_price.values,
    labels=top_10_avg_price.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
ax.set_title('Top 10 Most Expensive Mobile Brands (Average Price) - Pie Chart')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular
fig.tight_layout()
plt.show()


### Price Distribution Overall

In [None]:
# 30-bin histogram of sell_price with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(dataset['sell_price'], kde=True, bins=30, color='skyblue', ax=ax)
ax.set_title('Distribution of Mobile Sell Prices')
ax.set_xlabel('Sell Price')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()


In [None]:
# Horizontal violin plot of sell_price.
fig, ax = plt.subplots(figsize=(10, 4))
sns.violinplot(x=dataset['sell_price'], color='plum', ax=ax)
ax.set_title('Violin Plot of Mobile Sell Prices')
ax.set_xlabel('Sell Price')
fig.tight_layout()
plt.show()

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

# Empirical cumulative distribution of sell_price.
ecdf = ECDF(dataset['sell_price'])

# Draw the ECDF as unconnected points (markers only, no line).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ecdf.x, ecdf.y, marker='.', linestyle='none', color='seagreen')
ax.set_title('ECDF of Mobile Sell Prices')
ax.set_xlabel('Sell Price')
ax.set_ylabel('Cumulative Probability')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Filled kernel-density estimate of sell_price, without histogram bars.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(dataset['sell_price'], fill=True, color='skyblue', ax=ax)
ax.set_title('KDE of Mobile Sell Prices')
ax.set_xlabel('Sell Price')
ax.set_ylabel('Density')
fig.tight_layout()
plt.show()

### Price Trends Over Time

In [None]:
# Parse the date column as day-month-year datetimes.
dataset['date'] = pd.to_datetime(dataset['date'], format='%d-%m-%Y')

# Mean sell_price per calendar month ('ME' = month-end resampling bins).
prices_by_date = dataset.set_index('date')['sell_price']
monthly_trend = prices_by_date.resample('ME').mean()

# Line chart of the monthly averages.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_trend.plot(ax=ax)
ax.set_title('Average Sell Price Over Time (Monthly)')
ax.set_ylabel('Average Sell Price')
ax.set_xlabel('Date')
fig.tight_layout()
ax.grid(True)
plt.show()


### Brand Popularity by Location

In [None]:
# Five most frequent mobile_name values.
top_brands = dataset['mobile_name'].value_counts().head(5).index.tolist()

# Keep only the rows for those five models.
top_brand_rows = dataset[dataset['mobile_name'].isin(top_brands)]

# x categories ordered by overall frequency in the full dataset;
# the [:10] slice caps the number of x categories at ten.
location_order = dataset['is_local'].value_counts().index[:10]

# Grouped count plot: one bar per model within each is_local category.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=top_brand_rows,
    x='is_local',
    hue='mobile_name',
    order=location_order,
    ax=ax,
)
ax.set_title('Top 5 Mobile Brands by Customer Location')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


## Handle Missing Values

In [None]:
# Per-column count of null cells.
null_mask = dataset.isnull()
missing_values = null_mask.sum()
print("🔍 Missing Values:\n", missing_values)


In [None]:
# Print the distinct values of every object-dtype column so placeholder
# strings such as "NA" or "Unknown" can be spotted by eye.
object_cols = [col for col in dataset.columns if dataset[col].dtype == "object"]
for col in object_cols:
    print(f"\n🔎 Unique values in {col}:")
    print(dataset[col].unique())

## Encode Categorical Variables

In [None]:
# Treat every object-dtype column as categorical and collect the names.
object_frame = dataset.select_dtypes(include='object')
cat_cols = object_frame.columns.tolist()
print("🧩 Categorical columns:", cat_cols)

### Label Encoding or One-Hot Encoding
- For binary columns: Label Encoding
- For multi-class columns: One-Hot Encoding

In [None]:
# Binary columns
binary_cols = [
    'does_he/she_come_from_facebook_page',
    'does_he/she_followed_our_page',
    'did_he/she_buy_any_mobile_before',
    'did_he/she_hear_of_our_shop_before',
    'gender',
    'is_local'
]

# Label encode binary columns.
# LabelEncoder numbers the sorted distinct values of a column 0..n-1, so
# the integer codes only mean something together with the encoder that
# produced them. Refitting one shared encoder would discard every mapping
# but the last, so a fresh encoder is fitted per column and stored in
# label_encoders (keyed by column name); its classes_ attribute or
# inverse_transform() can recover the original labels later.
label_encoders = {}
for col in binary_cols:
    label_encoder = LabelEncoder()
    dataset[col] = label_encoder.fit_transform(dataset[col])
    label_encoders[col] = label_encoder

In [None]:
# One-hot encode the multi-category mobile_name column; drop_first=True
# omits the indicator for the first category.
multi_class_cols = ['mobile_name']
dataset = pd.get_dummies(dataset, columns=multi_class_cols, drop_first=True)

In [None]:
# Show the dtype of every column after encoding.
final_dtypes = dataset.dtypes
print("✅ Final Data Types:\n", final_dtypes)

In [None]:
# Print a heading, then display the first rows of the encoded DataFrame
# as the cell's output.
print("\n✅ Encoded Data Sample:\n")
dataset.head()

In [None]:
# Save cleaned dataset to CSV

In [None]:
# Write the encoded DataFrame to disk, leaving out the row index.
cleaned_csv_path = "data/cleaned_dataset.csv"
dataset.to_csv(cleaned_csv_path, index=False)