In [None]:
# EDA on Customer Sales Data

## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style: configure seaborn's plotting defaults with the "whitegrid"
# style. This runs before any figure is created so the plots further down
# pick it up.
sns.set(style="whitegrid")

In [None]:
## Step 1: Load the Data
def _read_csv_with_fallback(csv_path):
    """Read ``csv_path`` as UTF-8, retrying as ISO-8859-1 on a decode error."""
    try:
        return pd.read_csv(csv_path, encoding="utf-8")
    except UnicodeDecodeError:
        # The file is not valid UTF-8, so retry with the ISO-8859-1 codec.
        return pd.read_csv(csv_path, encoding="ISO-8859-1")


df = _read_csv_with_fallback("customer_sales.csv")

# Show the first few rows. A single print call with a newline separator emits
# the heading and the preview on consecutive lines.
print("\nDataset Preview:", df.head(), sep="\n")

In [None]:
# Summarise the frame: column names, dtypes, and non-null counts.
# `DataFrame.info()` writes its report itself, so it is not wrapped in print.
print("\nDataset Info:")
df.info()

# Count the missing entries in every column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

In [None]:
## Step 2: Handle Missing Values
# Impute missing "Age" entries with the column median. The median is computed
# first, from the values that are present, and then used as the fill value.
age_median = df["Age"].median()
df["Age"] = df["Age"].fillna(age_median)

# Re-count missing entries per column to confirm the imputation worked.
remaining_missing = df.isnull().sum()
print("\nMissing Values After Handling:", remaining_missing, sep="\n")

In [None]:
## Step 3: Handle Data Types
# Convert "PurchaseAmount" to numeric, removing thousands-separator commas if
# present. The column may already be numeric: either the CSV held no commas and
# was parsed as numbers on load, or this cell has already been run once. The
# `.str` accessor raises AttributeError on numeric data, so the commas are only
# stripped when the column is not numeric. `regex=False` treats "," as a
# literal character regardless of the installed pandas version's default.
purchase_amount = df["PurchaseAmount"]
if not pd.api.types.is_numeric_dtype(purchase_amount):
    purchase_amount = purchase_amount.str.replace(",", "", regex=False)
df["PurchaseAmount"] = pd.to_numeric(purchase_amount)

# Convert "PurchaseDate" to datetime format
df["PurchaseDate"] = pd.to_datetime(df["PurchaseDate"])

# Display updated data types
print("\nData Types After Conversion:")
print(df.dtypes)

In [None]:
## Step 4: Identify and Handle Outliers
# Define a function to detect outliers using the IQR method
def detect_outliers(column: pd.Series, multiplier: float = 1.5) -> pd.Series:
    """Return the entries of ``column`` that lie outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where ``Q1`` and ``Q3`` are the 25th and 75th percentiles of ``column``
    and ``IQR = Q3 - Q1``. Values exactly on a fence are not flagged.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to screen.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        previous hard-coded behaviour; a larger value flags fewer points.

    Returns
    -------
    pandas.Series
        The flagged values, keeping their original index labels. Empty when
        nothing falls outside the fences.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Strict comparisons: a value equal to a fence is kept as an inlier.
    return column[(column < lower_bound) | (column > upper_bound)]

# Run the IQR screen on the two numeric columns of interest.
outliers_age = detect_outliers(df["Age"])
outliers_purchase = detect_outliers(df["PurchaseAmount"])

# Report each result under its own heading. A newline separator puts the
# heading and the flagged values on consecutive lines.
print("\nOutliers in Age:", outliers_age, sep="\n")

print("\nOutliers in PurchaseAmount:", outliers_purchase, sep="\n")

In [None]:
## Step 5: Plot Distributions
# Age Distribution: 10-bin histogram drawn on an explicitly created Axes.
fig_age, ax_age = plt.subplots(figsize=(8, 6))
df["Age"].hist(ax=ax_age, bins=10, color="skyblue", edgecolor="black")
ax_age.set_title("Age Distribution", fontsize=16)
ax_age.set_xlabel("Age", fontsize=14)
ax_age.set_ylabel("Frequency", fontsize=14)
plt.show()

# PurchaseAmount Distribution: 10-bin histogram drawn on an explicitly created Axes.
fig_purchase, ax_purchase = plt.subplots(figsize=(8, 6))
df["PurchaseAmount"].hist(
    ax=ax_purchase, bins=10, color="lightgreen", edgecolor="black"
)
ax_purchase.set_title("Purchase Amount Distribution", fontsize=16)
ax_purchase.set_xlabel("Purchase Amount", fontsize=14)
ax_purchase.set_ylabel("Frequency", fontsize=14)
plt.show()

# Customer Distribution by Country: one bar per distinct "Country" value,
# with bar height equal to the number of rows holding that value.
country_counts = df["Country"].value_counts()
fig_country, ax_country = plt.subplots(figsize=(10, 6))
country_counts.plot(kind="bar", ax=ax_country, color="coral", edgecolor="black")
ax_country.set_title("Customer Distribution by Country", fontsize=16)
ax_country.set_xlabel("Country", fontsize=14)
ax_country.set_ylabel("Number of Customers", fontsize=14)
# Tilt the existing x tick labels by 45 degrees.
plt.setp(ax_country.get_xticklabels(), rotation=45)
plt.show()

In [None]:
## Conclusion
# Final marker: reaching this line means every cell above ran without raising.
completion_message = "Exploratory Data Analysis (EDA) Completed Successfully!"
print(completion_message)