In [2]:
# 📦 Step 1: Import necessary libraries
import pandas as pd

# 📁 Step 2: Read the raw purchase records from disk into a DataFrame
file_path = 'purchase_data_exe.csv'  # Update if your filename is different
df = pd.read_csv(file_path)

# 🔍 Step 3: Show the top of the table as a quick sanity check
print("🔹 First 5 Rows:")
print(df.head(5))

# 📊 Step 4: Report dimensions, column labels and per-column dtypes
raw_shape = df.shape
column_names = df.columns.tolist()
print("\n🔹 Dataset Shape:", raw_shape)
print("🔹 Column Names:", column_names)
print("\n🔹 Data Types:")
print(df.dtypes)

# 🔎 Step 5: Count missing (NaN/None) cells in every column
missing_per_column = df.isnull().sum()
print("\n🔹 Missing Values in Each Column:")
print(missing_per_column)

# 🔁 Step 6: Count rows that are exact repeats of an earlier row
# (all columns compared; the first occurrence is not counted).
duplicates = df.duplicated().sum()
print(f"\n🔹 Number of duplicate rows: {duplicates}")

# 🧼 Step 7: Build the cleaned frame without the duplicate rows.
# The explicit .copy() makes df_cleaned an independent DataFrame, so the
# column assignments performed on it later are not flagged by pandas as
# writes to a slice of `df` (SettingWithCopyWarning). `df` itself is left
# unchanged.
df_cleaned = df.drop_duplicates().copy()

# 🗓 Step 8: Convert date/time columns (if present) to pandas datetime
# Candidate column names; any that the dataset does not contain are skipped.
# 'date' is included because it is the date column this purchase dataset
# actually has.
date_columns = ['date', 'order_date', 'event_time', 'timestamp']
# Columns whose values are written day-first (e.g. 20/11/2018). Without
# dayfirst=True, ambiguous values such as 03/04/2018 could be read month-first.
day_first_columns = {'date'}
for col in date_columns:
    if col not in df_cleaned.columns:
        continue
    try:
        converted = pd.to_datetime(df_cleaned[col], dayfirst=col in day_first_columns)
    except (ValueError, TypeError, OverflowError) as err:
        # Best-effort: leave the column untouched, but report why parsing
        # failed instead of silently swallowing every exception.
        print(f"⚠️ Couldn't convert '{col}' to datetime: {err}")
    else:
        df_cleaned[col] = converted
        print(f"✅ Converted '{col}' to datetime.")

# 🎯 Step 9: Report which of the expected analysis columns exist
important_fields = ['order_id', 'user_id', 'product_id', 'price', 'quantity', 'city', 'category']
print("\n🔹 Important Columns Present:")
available_columns = df_cleaned.columns
for col in important_fields:
    # One line per expected field: a tick when present, a cross otherwise.
    status_line = f"✔ {col}" if col in available_columns else f"✘ {col} (not found)"
    print(status_line)

# 🌍 Step 10: List whichever geographical columns the dataset contains
region_columns = ['city', 'state', 'country']
print("\n🔹 Regional Columns Found:")
# Keep only the candidates that exist; nothing is printed for absent ones.
regions_present = [name for name in region_columns if name in df_cleaned.columns]
for col in regions_present:
    print(f"✔ {col}")

# 📝 Step 11: Print a closing summary of the cleaned dataset
row_count, column_count = df_cleaned.shape
# Null counts per column, computed once and filtered to columns with gaps.
null_counts = df_cleaned.isnull().sum()
columns_with_gaps = null_counts[null_counts > 0]
print("\n📋 Final Summary of Dataset:")
print(f"- Total Rows: {row_count}")
print(f"- Total Columns: {column_count}")
print(f"- Duplicate rows removed: {duplicates}")
print("- Missing value count by column:\n", columns_with_gaps)

# 💾 Step 12: Write the cleaned frame to CSV (row index omitted)
df_cleaned.to_csv(
    "ecommerce_data_cleaned.csv",
    index=False,
)
print("\n✅ Cleaned dataset saved as 'ecommerce_data_cleaned.csv'")


🔹 First 5 Rows:
         date  customer_id  product_category payment_method  value [USD]  \
0  20/11/2018        37077               505         credit        49.53   
1  20/11/2018        59173               509         paypal        50.61   
2  20/11/2018        41066               507         credit        85.99   
3  20/11/2018        50741               506         credit        34.60   
4  20/11/2018        53639               515         paypal       266.27   

   time_on_site [Minutes]  clicks_in_site  Unnamed: 7  
0                    12.0               8         NaN  
1                    25.9               8         NaN  
2                    34.9              11         NaN  
3                    16.5               9         NaN  
4                    43.1              30         NaN  

🔹 Dataset Shape: (24999, 8)
🔹 Column Names: ['date', 'customer_id', 'product_category', 'payment_method', 'value [USD]', 'time_on_site [Minutes]', 'clicks_in_site', 'Unnamed: 7']

🔹 Data Typ