<a href="https://colab.research.google.com/github/Deland78/KP_Lead_Scoring_Colab/blob/main/KP_Lead_Scoring_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KP Lead Scoring Model
Custom lead scoring model built for education leads requiring sales follow-up.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import joblib

# Show every column when a DataFrame is displayed (None removes the column cap)
pd.options.display.max_columns = None

SyntaxError: unexpected character after line continuation character (1934094127.py, line 1)

In [None]:
# Read the cleaned lead export into a DataFrame
df = pd.read_csv('KP_Lead_Scoring_Data_Cleaned.csv')

# Sum the 'Converted' column once and reuse it for both summary lines
total_converted = df['Converted'].sum()

print(f"Dataset shape: {df.shape}")
print(f"Conversion rate: {(total_converted / len(df) * 100):.2f}%")
print(f"Total conversions: {total_converted:,}")

In [None]:
def create_lead_scoring_features(df):
    """Return a copy of ``df`` extended with the numeric lead-scoring columns.

    Columns are appended in this order: ``Intent_Score``, ``Timing_Score``,
    ``Channel_Score``, ``Is_Returning_Contact`` and ``Education_Score``.
    Every ``*_Score`` column is a fixed lookup on one categorical source
    column; labels that are not listed in the lookup receive that column's
    fallback score. The frame passed in is not modified.
    """
    scored = df.copy()

    def lookup(column, table, fallback, missing_label=None):
        """Map ``scored[column]`` through ``table``; unlisted labels get ``fallback``."""
        labels = scored[column]
        if missing_label is not None:
            # Give missing entries an explicit label before the lookup
            labels = labels.fillna(missing_label)
        return labels.map(lambda label: table.get(label, fallback))

    # 1. Intent urgency (several spellings of "ready to enrol" share the top score)
    scored['Intent_Score'] = lookup(
        'Intent To Enroll',
        {
            'Ready to enrol': 10,
            'Ready to enroll': 10,
            'Readytoenrol': 10,
            'Looking for more information': 6,
            'Researching options': 3,
            'Unknown': 2,
            'unsure': 2,
        },
        fallback=2,
        missing_label='Unknown',
    )

    # 2. Timing urgency
    scored['Timing_Score'] = lookup(
        'When Like To Begin Studying',
        {
            'within 3 months': 8,
            'within 6 months': 5,
            'within 12 months': 2,
            '12 months plus': 1,
            'unsure': 3,
        },
        fallback=3,
        missing_label='unsure',
    )

    # 3. Channel performance (missing channels are not relabelled; they take the fallback)
    scored['Channel_Score'] = lookup(
        'Channel',
        {
            'Unknown': 8,
            'Referral': 7,
            'Traditional': 6,
            'Corporate': 5,
            'SEO': 3,
            'Affinity': 2,
            'Email List': 1,
            'PPI': 1,
            'PPC': 3,
        },
        fallback=3,
    )

    # 4. Returning contact: 1 when more than one opportunity is recorded, else 0
    scored['Is_Returning_Contact'] = scored['Opportunity Count'].gt(1).astype(int)

    # 5. Education level
    scored['Education_Score'] = lookup(
        'Highest Level Of Education',
        {
            'Graduate / Masters Degree': 6,
            'Postgraduate Diploma': 5,
            'Bachelors Degree': 4,
            'Diploma': 3,
            'Year 12': 2,
            'Unknown': 3,
        },
        fallback=3,
        missing_label='Unknown',
    )

    return scored

# Derive the model inputs; create_lead_scoring_features works on a copy, so `df` stays raw
df_features = df.pipe(create_lead_scoring_features)
print("Feature engineering complete!")

In [None]:
# Model inputs, in the column order the scaler and classifier are fitted on
model_features = [
    'Intent_Score',
    'Timing_Score',
    'Channel_Score',
    'Is_Returning_Contact',
    'Education_Score',
]

X = df_features[model_features]
y = df_features['Converted']

# Hold out 20% of the rows for evaluation, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with balanced class weights
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

print("Model training complete!")

In [None]:
# Predicted labels, plus the predicted probability in column 1 of predict_proba
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("=== MODEL PERFORMANCE ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Rank features by the magnitude of their coefficient (inputs were standardised before fitting)
print("\n=== FEATURE IMPORTANCE ===")
importance_df = (
    pd.DataFrame({'Feature': model_features, 'Coefficient': model.coef_[0]})
    .assign(Abs_Importance=lambda table: np.abs(table['Coefficient']))
    .sort_values('Abs_Importance', ascending=False)
)

print(importance_df)

In [None]:
def score_lead(intent_to_enroll, when_like_to_begin, channel, highest_education, opportunity_count):
    """Score a new lead (0-100 scale).

    Encodes the raw lead attributes with the same lookup tables as
    ``create_lead_scoring_features``, scales them with the fitted ``scaler``
    and returns the ``model`` probability from column 1 of ``predict_proba``
    expressed as a percentage.

    Relies on the notebook-level ``scaler``, ``model`` and ``model_features``
    objects, so the training cell must have been run first.

    Args:
        intent_to_enroll: 'Intent To Enroll' label; unlisted or missing -> 2.
        when_like_to_begin: 'When Like To Begin Studying' label; unlisted or
            missing -> 3.
        channel: 'Channel' label; unlisted or missing -> 3.
        highest_education: 'Highest Level Of Education' label; unlisted or
            missing -> 3.
        opportunity_count: Number of opportunities for the contact. Values
            above 1 mark a returning contact; ``None``/NaN count as not
            returning.

    Returns:
        The predicted probability multiplied by 100, rounded to two decimals.
    """
    # NOTE: these tables must stay identical to the ones in
    # create_lead_scoring_features; any difference means a lead is encoded
    # differently at scoring time than it was when the model was trained.
    intent_scores = {
        'Ready to enrol': 10,
        'Ready to enroll': 10,
        'Readytoenrol': 10,  # present in the training table; was missing here
        'Looking for more information': 6,
        'Researching options': 3,
        'Unknown': 2,
        'unsure': 2,
    }
    timing_scores = {'within 3 months': 8, 'within 6 months': 5, 'within 12 months': 2, '12 months plus': 1, 'unsure': 3}
    channel_scores = {'Unknown': 8, 'Referral': 7, 'Traditional': 6, 'Corporate': 5, 'SEO': 3, 'Affinity': 2, 'Email List': 1, 'PPI': 1, 'PPC': 3}
    education_scores = {'Graduate / Masters Degree': 6, 'Postgraduate Diploma': 5, 'Bachelors Degree': 4, 'Diploma': 3, 'Year 12': 2, 'Unknown': 3}

    # A missing count is treated as "not returning", the same result the
    # training-time comparison (NaN > 1 is False) produces.
    is_returning = 1 if pd.notna(opportunity_count) and opportunity_count > 1 else 0

    feature_row = {
        'Intent_Score': intent_scores.get(intent_to_enroll, 2),
        'Timing_Score': timing_scores.get(when_like_to_begin, 3),
        'Channel_Score': channel_scores.get(channel, 3),
        'Is_Returning_Contact': is_returning,
        'Education_Score': education_scores.get(highest_education, 3),
    }

    # The scaler was fitted on a DataFrame, so pass a named single-row frame
    # ordered by model_features: this avoids scikit-learn's feature-name
    # warning and makes the column order explicit rather than positional.
    features = pd.DataFrame([feature_row], columns=model_features)

    # Scale and predict
    features_scaled = scaler.transform(features)
    probability = model.predict_proba(features_scaled)[0, 1]

    return (probability * 100).round(2)

print("Lead scoring function ready!")

# Smoke-test the scorer on three hand-picked leads
print("\n=== SCORING EXAMPLES ===")
example_leads = [
    ("High priority", ('Ready to enrol', 'within 3 months', 'Referral', 'Bachelors Degree', 1)),
    ("Medium priority", ('Researching options', 'within 6 months', 'SEO', 'Diploma', 1)),
    ("Low priority", ('Researching options', 'within 12 months', 'Email List', 'Year 12', 1)),
]
for label, lead in example_leads:
    print(f"{label}: {score_lead(*lead)}")

In [None]:
import pandas as pd
import numpy as np

# Re-read the cleaned export so this overview works from the file contents
df = pd.read_csv('KP_Lead_Scoring_Data_Cleaned.csv')

print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Conversion rate: {df['Converted'].mean():.4f}")

# Per-column profile: dtype, missing count (with percentage) and cardinality
print("\n=== COLUMN SUMMARY ===")
for col, values in df.items():
    n_missing = values.isnull().sum()
    print(f"\n--- {col} ---")
    print(f"Data type: {values.dtype}")
    print(f"Missing values: {n_missing} ({n_missing/len(df)*100:.1f}%)")
    print(f"Unique values: {values.nunique()}")