In [8]:
"""
Lead Source Data Exploration
Initial exploration to understand the dataset before building visualizations
"""

import pandas as pd
import numpy as np

# Load the cleaned export; every later section works off this frame.
df = pd.read_csv('CleanSalesforceData.csv')

section_rule = "=" * 60
print("LEAD SOURCE ANALYSIS")
print(section_rule)

# Tally leads per source. dropna=False keeps missing sources as their own
# NaN row in the printed distribution instead of silently dropping them.
lead_source_column = df['lead_source']
lead_source_counts = lead_source_column.value_counts(dropna=False)
distinct_sources = lead_source_column.nunique()  # nunique() ignores NaN
missing_sources = lead_source_column.isna().sum()

print("\n Distribution:")
print(lead_source_counts)
print(f"\n Unique lead sources: {distinct_sources}")
print(f"Records with missing lead source: {missing_sources}")

print("\n" + "="*60)
print("OPPORTUNITY ANALYSIS")
print("="*60)

# A record "has an opportunity" when its opportunity_amount is not NaN.
# NOTE(review): a $0 amount is non-null and is therefore counted as an
# opportunity here -- confirm that is the intended definition.
opportunity_amount = df['opportunity_amount']
has_amount_mask = opportunity_amount.notna()
has_opportunity = has_amount_mask.sum()
no_opportunity = (~has_amount_mask).sum()

# Guard the denominator so an empty frame reports nan instead of dividing
# by zero.
total_records = len(df)
if total_records:
    opportunity_rate = has_opportunity / total_records * 100
else:
    opportunity_rate = float("nan")

print(f"Records with opportunity amount: {has_opportunity}")
print(f"Records without opportunity amount: {no_opportunity}")
print(f"Overall opportunity rate: {opportunity_rate:.1f}%")

# Opportunity value statistics (pandas skips NaN amounts by default).
# Values are rendered as currency -- thousands separators and two decimals --
# rather than as raw floats such as 22344.110854503466.
print("\nOpportunity Value Statistics:")
amount_stats = (
    ("Min", opportunity_amount.min()),
    ("Max", opportunity_amount.max()),
    ("Mean", opportunity_amount.mean()),
    ("Median", opportunity_amount.median()),
)
for stat_label, stat_value in amount_stats:
    print(f"{stat_label}: ${stat_value:,.2f}")

print("\n" + "="*60)
print("LEAD SOURCE OPPORTUNITY RATES")
print("="*60)

# Opportunity rate by lead source: the share of a source's leads whose
# opportunity_amount is not NaN.
#
# A single groupby pass replaces re-filtering the whole frame once per source.
# groupby drops rows whose lead_source is NaN (the same rows the earlier
# notna() filter excluded), and sort=False keeps sources in order of first
# appearance, so the printed order is unchanged.
opportunity_by_source = (
    df['opportunity_amount'].notna().groupby(df['lead_source'], sort=False)
)
source_rates = opportunity_by_source.mean() * 100
source_totals = opportunity_by_source.size()

for source, opp_rate, total_leads in zip(
    source_rates.index, source_rates, source_totals
):
    # !s converts the label to str first, so the width spec cannot raise on
    # non-string source labels.
    print(f"{source!s:20s}: {opp_rate:5.1f}% ({total_leads} total leads)")

# Closing summary section.
# NOTE(review): these findings are fixed text, not derived from df -- they
# need to be re-checked by hand whenever the underlying CSV changes.
findings_rule = "=" * 60
key_findings = (
    "- Email Campaign is the largest lead source",
    "- Social Media and Trade Show are close seconds",
    "- Opportunity rates vary significantly by source",
    "- Average deal values need deeper analysis",
)

print("\n" + findings_rule)
print("KEY FINDINGS")
print(findings_rule)
for finding in key_findings:
    print(finding)

LEAD SOURCE ANALYSIS

 Distribution:
lead_source
Email Campaign    117
Social Media      100
Trade Show         98
Referral           84
NaN                40
Phone Inquiry      22
Web                21
Partner            18
Name: count, dtype: int64

 Unique lead sources: 7
Records with missing lead source: 40

OPPORTUNITY ANALYSIS
Records with opportunity amount: 433
Records without opportunity amount: 67
Overall opportunity rate: 86.6%

Opportunity Value Statistics:
Min: $0.0
Max: $50000.0
Mean: $22344.110854503466
Median: $10000.0

LEAD SOURCE OPPORTUNITY RATES
Trade Show          :  90.8% (98 total leads)
Social Media        :  83.0% (100 total leads)
Email Campaign      :  84.6% (117 total leads)
Web                 :  90.5% (21 total leads)
Referral            :  90.5% (84 total leads)
Phone Inquiry       :  81.8% (22 total leads)
Partner             :  83.3% (18 total leads)

KEY FINDINGS
- Email Campaign is the largest lead source
- Social Media and Trade Show are close second