<a href="https://colab.research.google.com/github/Deland78/KP_Lead_Scoring_Colab/blob/main/KP_Lead_Scoring_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KP Lead Scoring Model

Custom lead scoring model built for education leads requiring sales follow-up.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import joblib

# Set display options: never truncate the column list when a DataFrame is printed
pd.set_option('display.max_columns', None)

# Install & import the Comet client.
# NOTE: %pip is an IPython/Colab magic, so this cell only runs inside a notebook kernel.
%pip install -q comet_ml

import comet_ml

# 1) Log in once to store your API key securely in the Colab VM
#    You'll be prompted for the key (you can find it in Comet -> top-right avatar -> Settings -> API key).
comet_ml.login()

# 2) Start an experiment and point it at your project
#    Replace with your Comet workspace and desired project name (the project will be auto-created if it doesn't exist).
#    `exp` is used by the later cells for all parameter/metric/asset logging.
exp = comet_ml.start(
    workspace="david-eland",      # your Comet workspace name
    project_name="kp-lead-scoring-model" # any new or existing Comet project name
)

# (optional but nice) give the run a human-friendly name and tags
exp.set_name("baseline-regression-colab")
exp.add_tags(["colab", "lead-scoring", "baseline"])


COMET INFO: An experiment with the same configuration options is already running and will be reused.


In [None]:
# Mount Google Drive at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# IMPORTANT: Replace the file name below with the actual file in the Colab
# content folder (see the folder icon on the left).
selected_file = '/content/DE_Lead_Scoring_Cleaned (2).csv' # Replace with your filename


# Load the cleaned data using the selected file path
df = pd.read_csv(selected_file)

# Quick sanity check of the loaded data.
# The rate/total below sum the 'Converted' column, so they assume it is a 0/1 flag - TODO confirm.
print(f"Dataset shape: {df.shape}")
print(f"Conversion rate: {(df['Converted'].sum() / len(df) * 100):.2f}%")
print(f"Total conversions: {df['Converted'].sum():,}")

# Apply feature engineering
# NOTE(review): create_lead_scoring_features is defined in a later cell of this notebook;
# that cell must have been executed first, otherwise this call raises NameError.
df_features = create_lead_scoring_features(df)
print("Feature engineering complete!")

In [None]:
# Select features and train model
# The order of this list fixes the column order seen by the scaler and the
# model, so anything that scores new leads must build features in this order.
model_features = [
    'Intent_Score',
    'Timing_Score',
    'Channel_Score',
    'Is_Returning_Contact',
    'Education_Score',
]
X = df_features[model_features]
y = df_features['Converted']

# Split data (stratified on the target; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit the scaler on the training split only, then reuse it
# unchanged for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
model.fit(X_train_scaled, y_train)

print("Model training complete!")

In [None]:
# Feature engineering: defines create_lead_scoring_features (run this cell before any cell that calls it).
def create_lead_scoring_features(df):
    """Build the engineered lead-scoring columns on a copy of ``df``.

    The input frame is left untouched. The returned copy gains, in this
    order: ``Intent_Score``, ``Timing_Score``, ``Channel_Score``,
    ``Is_Returning_Contact`` and ``Education_Score``.

    Args:
        df: DataFrame containing the columns ``Intent To Enroll``,
            ``When Like To Begin Studying``, ``Channel``,
            ``Opportunity Count`` and ``Highest Level Of Education``.

    Returns:
        A copy of ``df`` with the five engineered columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Lookup tables: raw category -> points.
    intent_points = {
        'Ready to enrol': 10,
        'Ready to enroll': 10,
        'Readytoenrol': 10,
        'Looking for more information': 6,
        'Researching options': 3,
        'Unknown': 2,
        'unsure': 2,
    }
    timing_points = {
        'within 3 months': 8,
        'within 6 months': 5,
        'within 12 months': 2,
        '12 months plus': 1,
        'unsure': 3,
    }
    channel_points = {
        'Unknown': 8,
        'Referral': 7,
        'Traditional': 6,
        'Corporate': 5,
        'SEO': 3,
        'Affinity': 2,
        'Email List': 1,
        'PPI': 1,
        'PPC': 3,
    }
    education_points = {
        'Graduate / Masters Degree': 6,
        'Postgraduate Diploma': 5,
        'Bachelors Degree': 4,
        'Diploma': 3,
        'Year 12': 2,
        'Unknown': 3,
    }

    scored = df.copy()

    def apply_points(column, points, fallback, missing_label=None):
        """Map one raw column to points; unlisted values get ``fallback``."""
        values = scored[column]
        if missing_label is not None:
            # Missing entries are relabelled before the lookup.
            values = values.fillna(missing_label)
        return values.map(lambda raw: points.get(raw, fallback))

    # 1. Intent urgency
    scored['Intent_Score'] = apply_points(
        'Intent To Enroll', intent_points, 2, missing_label='Unknown'
    )

    # 2. Timing urgency
    scored['Timing_Score'] = apply_points(
        'When Like To Begin Studying', timing_points, 3, missing_label='unsure'
    )

    # 3. Channel performance (no relabelling of missing values here, so a
    #    missing channel falls through to the fallback of 3)
    scored['Channel_Score'] = apply_points('Channel', channel_points, 3)

    # 4. Returning contact: 1 when there is more than one opportunity
    scored['Is_Returning_Contact'] = scored['Opportunity Count'].gt(1).astype(int)

    # 5. Education level
    scored['Education_Score'] = apply_points(
        'Highest Level Of Education', education_points, 3, missing_label='Unknown'
    )

    return scored

# Apply feature engineering to the loaded DataFrame (requires `df` from the data-loading cell)
df_features = create_lead_scoring_features(df)

print("Feature engineering complete!") # df_features now holds the engineered score columns


In [None]:
# Evaluate model on the held-out test split
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba is the second class in model.classes_
# (presumably Converted == 1 - confirm against model.classes_).
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
print("=== MODEL PERFORMANCE ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance: coefficients of the fitted model, ranked by magnitude.
# Features were standardised before fitting, so magnitudes are comparable.
print("\n=== FEATURE IMPORTANCE ===")
importance_df = pd.DataFrame({
    'Feature': model_features,
    'Coefficient': model.coef_[0],
    'Abs_Importance': np.abs(model.coef_[0])
}).sort_values('Abs_Importance', ascending=False)

print(importance_df)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import joblib

# Load the cleaned data
# The path is held in one variable so the value logged to Comet is always the
# file that was actually read.
dataset_path = 'KP_Lead_Scoring_Data_Cleaned.csv'
df = pd.read_csv(dataset_path)

# FOR COMET: Log dataset provenance and raw size

exp.log_parameter("dataset_path", dataset_path)
exp.log_parameter("raw_rows", len(df))
exp.log_parameter("raw_cols", df.shape[1])

# FOR COMET: If 'Converted' exists now, log basic class balance (helps spot drift)
if "Converted" in df.columns:
    class_counts = df["Converted"].value_counts().to_dict()
    # .get(..., 0) keeps the metric present even when a class is absent from the data
    exp.log_metrics({
        "class_0_count_raw": class_counts.get(0, 0),
        "class_1_count_raw": class_counts.get(1, 0),
    })

def create_lead_scoring_features(df):
    """Return a copy of ``df`` extended with the engineered scoring columns.

    Adds ``Intent_Score``, ``Timing_Score``, ``Channel_Score``,
    ``Is_Returning_Contact`` and ``Education_Score`` (in that order) and
    leaves the input frame unmodified.

    Side effect: logs the parameter ``feature_engineering_fn`` to the
    module-level Comet experiment ``exp``, which must exist when this runs.

    Args:
        df: DataFrame with the columns ``Intent To Enroll``,
            ``When Like To Begin Studying``, ``Channel``,
            ``Opportunity Count`` and ``Highest Level Of Education``.

    Returns:
        The extended copy of ``df``.

    Raises:
        KeyError: If a required source column is missing.
    """
    result = df.copy()

    def score(column, table, default, fill=None):
        """Look each value of ``column`` up in ``table``; unlisted -> ``default``."""
        source = result[column] if fill is None else result[column].fillna(fill)
        return source.map(lambda key: table.get(key, default))

    # 1. INTENT URGENCY SCORE
    result['Intent_Score'] = score(
        'Intent To Enroll',
        {
            'Ready to enrol': 10,
            'Ready to enroll': 10,
            'Readytoenrol': 10,
            'Looking for more information': 6,
            'Researching options': 3,
            'Unknown': 2,
            'unsure': 2,
        },
        2,
        fill='Unknown',
    )

    # FOR COMET: record which feature-engineering function produced the features.
    exp.log_parameter("feature_engineering_fn", "create_lead_scoring_features")

    # 2. TIMING URGENCY SCORE
    result['Timing_Score'] = score(
        'When Like To Begin Studying',
        {
            'within 3 months': 8,
            'within 6 months': 5,
            'within 12 months': 2,
            '12 months plus': 1,
            'unsure': 3,
        },
        3,
        fill='unsure',
    )

    # 3. CHANNEL PERFORMANCE SCORE (missing channels are not relabelled and
    #    therefore receive the default of 3)
    result['Channel_Score'] = score(
        'Channel',
        {
            'Unknown': 8,
            'Referral': 7,
            'Traditional': 6,
            'Corporate': 5,
            'SEO': 3,
            'Affinity': 2,
            'Email List': 1,
            'PPI': 1,
            'PPC': 3,
        },
        3,
    )

    # 4. RETURNING CONTACT FEATURES: 1 when more than one opportunity exists
    has_multiple_opportunities = result['Opportunity Count'] > 1
    result['Is_Returning_Contact'] = has_multiple_opportunities.astype(int)

    # 5. EDUCATION LEVEL SCORE
    result['Education_Score'] = score(
        'Highest Level Of Education',
        {
            'Graduate / Masters Degree': 6,
            'Postgraduate Diploma': 5,
            'Bachelors Degree': 4,
            'Diploma': 3,
            'Year 12': 2,
            'Unknown': 3,
        },
        3,
        fill='Unknown',
    )

    return result

# Apply feature engineering
df_features = create_lead_scoring_features(df)

# Select features and train model
# The list order defines the column order passed to the scaler and the model.
model_features = ['Intent_Score', 'Timing_Score', 'Channel_Score', 'Is_Returning_Contact', 'Education_Score']

# FOR COMET: Pin the feature list and order (critical for consistent scaling & coefficients).
# Logged as one comma-separated string.

exp.log_parameter("model_features", ",".join(model_features))


# Feature matrix and target ('Converted' must be present in the loaded data)
X = df_features[model_features]
y = df_features['Converted']

# Split data
# The split settings live in one place so the values logged to Comet are
# exactly the ones passed to train_test_split.
split_test_size = 0.2
split_random_state = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split_test_size, random_state=split_random_state, stratify=y
)

# FOR COMET logging
exp.log_parameters({
    "test_size": split_test_size,
    "random_state": split_random_state,
    "stratify": True
})

exp.log_metrics({
    "n_train": len(X_train),
    "n_test": len(X_test)
})

# FOR COMET Optional : confirm class balance in the split
# (positives are counted with sum(), which assumes a 0/1 target - TODO confirm)
exp.log_metrics({
    "y_train_pos": int(y_train.sum()),
    "y_train_neg": int((y_train==0).sum()),
    "y_test_pos": int(y_test.sum()),
    "y_test_neg": int((y_test==0).sum()),
})

# Scale features: fit on the training split only, then apply the same
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# FOR COMET logging: Not strictly required, but useful to know preprocessing happened
exp.log_parameter("scaler", type(scaler).__name__)


# Train model
# Hyperparameters are declared once and reused for both construction and
# logging, so the Comet record cannot drift from what was actually trained.
model_params = {"random_state": 42, "class_weight": "balanced", "max_iter": 1000}
model = LogisticRegression(**model_params)

# FOR COMET Lock in model hyperparameters actually used in this run

exp.log_parameters({
    "model_type": type(model).__name__,
    "class_weight": model_params["class_weight"],
    "max_iter": model_params["max_iter"],
    "model_random_state": model_params["random_state"]
})

model.fit(X_train_scaled, y_train)

# FOR COMET EVALUATION & LOGGING (runs after model.fit)
from sklearn.metrics import confusion_matrix

# Hard predictions plus the probability column used for AUC
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics, per-class report (as a dict) and 2x2 confusion matrix
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
report = classification_report(y_test, y_pred, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Log scalar metrics to Comet: accuracy/AUC first, then precision/recall/f1
# for class "0" (suffix _neg) and class "1" (suffix _pos).
scalar_metrics = {"accuracy": accuracy, "auc": auc}
for class_key, suffix in (("0", "neg"), ("1", "pos")):
    class_report = report[class_key]
    scalar_metrics[f"precision_{suffix}"] = class_report["precision"]
    scalar_metrics[f"recall_{suffix}"] = class_report["recall"]
    scalar_metrics[f"f1_{suffix}"] = class_report["f1-score"]
exp.log_metrics(scalar_metrics)

# Option A: log the confusion matrix as raw numbers (cm_<true><predicted>)
exp.log_metrics({
    f"cm_{row}{col}": int(cm[row, col])
    for row in range(2)
    for col in range(2)
})

# Option B (nice to have): log a plot of the confusion matrix
# (gives a figure in the Comet UI)
import matplotlib.pyplot as plt
fig = plt.figure()
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix')
class_ticks = ['Neg (0)', 'Pos (1)']
plt.xticks([0, 1], class_ticks)
plt.yticks([0, 1], class_ticks)
# Annotate every cell with its count (x = predicted column, y = true row)
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        plt.text(col, row, cm[row, col], ha='center', va='center')
plt.xlabel('Predicted')
plt.ylabel('True')
exp.log_figure(figure_name="confusion_matrix", figure=fig)
plt.close(fig)

# FOR COMET: Persist the fitted model and scaler, then upload both files
import joblib

artifacts = (
    (model, "lead_scoring_model.pkl"),
    (scaler, "lead_scoring_scaler.pkl"),
)

# Write both files first ...
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

# ... then attach them to the experiment as assets.
for _, filename in artifacts:
    exp.log_asset(filename)


def score_lead(intent_to_enroll, when_like_to_begin, channel, highest_education, opportunity_count):
    """Score a new lead (0-100 scale).

    Rebuilds the five training features for a single lead, scales them with
    the module-level ``scaler`` and returns the module-level ``model``'s
    probability for column 1 of ``predict_proba``, as a percentage.

    Relies on ``model``, ``scaler`` and ``model_features`` already being
    defined at module level (they are created by the training code).

    Args:
        intent_to_enroll: Raw 'Intent To Enroll' value; unlisted/None -> 2.
        when_like_to_begin: Raw 'When Like To Begin Studying' value;
            unlisted/None -> 3.
        channel: Raw 'Channel' value; unlisted/None -> 3.
        highest_education: Raw 'Highest Level Of Education' value;
            unlisted/None -> 3.
        opportunity_count: Number of opportunities for the contact. ``None``
            is treated as not returning.

    Returns:
        The predicted probability multiplied by 100 and rounded to 2 decimals.
    """
    # These tables must stay identical to the ones in
    # create_lead_scoring_features, otherwise scoring diverges from training.
    # 'Readytoenrol' was missing here and fell through to the default of 2,
    # although training scores it as 10.
    intent_scores = {'Ready to enrol': 10, 'Ready to enroll': 10, 'Readytoenrol': 10, 'Looking for more information': 6, 'Researching options': 3, 'Unknown': 2, 'unsure': 2}
    timing_scores = {'within 3 months': 8, 'within 6 months': 5, 'within 12 months': 2, '12 months plus': 1, 'unsure': 3}
    channel_scores = {'Unknown': 8, 'Referral': 7, 'Traditional': 6, 'Corporate': 5, 'SEO': 3, 'Affinity': 2, 'Email List': 1, 'PPI': 1, 'PPC': 3}
    education_scores = {'Graduate / Masters Degree': 6, 'Postgraduate Diploma': 5, 'Bachelors Degree': 4, 'Diploma': 3, 'Year 12': 2, 'Unknown': 3}

    # Calculate scores
    intent_score = intent_scores.get(intent_to_enroll, 2)
    timing_score = timing_scores.get(when_like_to_begin, 3)
    channel_score = channel_scores.get(channel, 3)
    education_score = education_scores.get(highest_education, 3)
    # A missing count means "not returning", matching training where a missing
    # 'Opportunity Count' compares False against > 1. Without the None guard
    # the comparison raises TypeError.
    is_returning = 1 if opportunity_count is not None and opportunity_count > 1 else 0

    # Build a one-row frame with the training column names and order so the
    # scaler sees the same layout it was fitted on.
    features = np.array([[intent_score, timing_score, channel_score, is_returning, education_score]])
    features_df = pd.DataFrame(features, columns=model_features)

    # Scale and predict
    features_scaled = scaler.transform(features_df)
    probability = model.predict_proba(features_scaled)[:, 1][0]

    return (probability * 100).round(2)

print("Lead scoring function ready!")

# Test examples: three hand-picked leads labelled high / medium / low priority.
# All three pass opportunity_count=1, i.e. none is a returning contact.
print("\n=== SCORING EXAMPLES ===")
print(f"High priority: {score_lead('Ready to enrol', 'within 3 months', 'Referral', 'Bachelors Degree', 1)}")
print(f"Medium priority: {score_lead('Researching options', 'within 6 months', 'SEO', 'Diploma', 1)}")
print(f"Low priority: {score_lead('Researching options', 'within 12 months', 'Email List', 'Year 12', 1)}")

# FOR COMET: End of notebook - end the experiment started in the setup cell
exp.end()