In [47]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()
random.seed(42)

def get_lead_priority(quote_amount, quote_age_days, customer_type, source_channel, product_type, past_interactions, prior_orders):
    if quote_amount < 50000:
        quote_amount_score = 1
    elif quote_amount <= 150000:
        quote_amount_score = 2
    else:
        quote_amount_score = 3

    if quote_age_days <= 5:
        quote_age_score = 3
    elif quote_age_days <= 15:
        quote_age_score = 2
    else:
        quote_age_score = 1

    if customer_type in ['Contractor', 'Corporate']:
        customer_type_score = 3
    elif customer_type in ['Reseller', 'Government']:
        customer_type_score = 2
    else:
        customer_type_score = 1

    if source_channel in ['Website', 'Referral']:
        source_channel_score = 3
    elif source_channel in ['Walk-in', 'Email']:
        source_channel_score = 2
    else:
        source_channel_score = 1

    if product_type == 'High-margin':
        product_type_score = 3
    elif product_type == 'Low-margin':
        product_type_score = 1
    else:
        product_type_score = 2

    if past_interactions in ['5 (calls, visits)', '6 (calls, visits, emails)']:
        past_interactions_score = 3
    elif past_interactions in ['3 (calls, visits)', '2 (calls)']:
        past_interactions_score = 2
    else:
        past_interactions_score = 1

    if prior_orders >= 10:
        prior_orders_score = 3
    elif prior_orders >= 5:
        prior_orders_score = 2
    else:
        prior_orders_score = 1

    total_score = (
        quote_amount_score + quote_age_score + customer_type_score +
        source_channel_score + product_type_score + past_interactions_score +
        prior_orders_score
    )

    if total_score >= 18:
        lead_priority = 'High Priority'
    elif total_score >= 12:
        lead_priority = 'Medium Priority'
    else:
        lead_priority = 'Low Priority'

    return total_score, lead_priority


In [48]:
def generate_interactions():
    interactions = [
        '1 (visit)', '2 (calls)', '3 (calls, visits)',
        '4 (calls, visits)', '5 (calls, visits)',
        '6 (calls, visits, emails)'
    ]
    return random.choice(interactions)

descriptions = ['Service call', 'Installation request', 'Consultation']
work_descriptions = ['Install product', 'Repair machinery', 'Inspect equipment', 'Upgrade system']
customer_types = ['Contractor', 'Individual', 'NGO', 'Reseller', 'Government', 'Corporate']
source_channels = ['Website', 'Walk-in', 'FloApp', 'Shopify', 'Referral', 'Email']
product_types = ['High-margin', 'Low-margin', 'Mixed']
salespersons = ['PMAMAI', 'PMNYABUTO', 'PMMULWA', 'PMKILONZO', 'PMOMONDI']
salesperson_codes = ['514', '369', '445', '858', '106']

data = []
for i in range(3000):
    entry_no = i + 1
    date = fake.date_between(start_date='-1y', end_date='today')
    description = random.choice(descriptions)
    work_description = random.choice(work_descriptions)
    contact_no = fake.msisdn()[5:15]
    contact_name = fake.name()
    contact_company = fake.company()
    duration = random.randint(10, 180)
    salesperson_code = random.choice(salesperson_codes)
    user_id = random.randint(300, 400)

    quote_id = f"Q{100 + i}"
    quote_amount = random.randint(25000, 500000)
    quote_age_days = random.randint(0, 30)
    customer_type = random.choice(customer_types)
    source_channel = random.choice(source_channels)
    product_type = random.choices(product_types, weights=[0.5, 0.3, 0.2])[0]
    past_interactions = generate_interactions()
    prior_orders = random.randint(0, 15)
    salesperson = random.choice(salespersons)

    _, lead_priority = get_lead_priority(quote_amount, quote_age_days, customer_type, source_channel, product_type, past_interactions, prior_orders)

    data.append([
        entry_no, date, description, work_description, contact_no,
        contact_name, contact_company, duration, salesperson_code, user_id,
        quote_id, quote_amount, quote_age_days, customer_type, source_channel,
        product_type, past_interactions, prior_orders, salesperson, lead_priority
    ])
columns = ['Entry_No', 'Date', 'Description', 'WorkDescription', 'Contact_No',
           'Contact_Name', 'Contact_Company_Name', 'Duration_Min',
           'Salesperson_Code', 'User_ID', 'Quote_ID', 'Quote_Amount',
           'Quote_Age_Days', 'Customer_Type', 'Source_Channel',
           'Product_Type', 'Past_Interactions', 'Prior_Orders',
           'Salesperson', 'Lead_Priority']

df = pd.DataFrame(data, columns=columns)

In [None]:
 df.to_csv('Raw_data_Gen.csv', index=False)

In [68]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Use OneHotEncoder with capital O
label_encoder = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # Use OneHotEncoder with capital O

categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

df['Customer_Type'] = label_encoder.fit_transform(df['Customer_Type'])
df['Source_Channel'] = label_encoder.fit_transform(df['Source_Channel'])
df['Product_Type'] = label_encoder.fit_transform(df['Product_Type'])
df['Salesperson'] = label_encoder.fit_transform(df['Salesperson'])
df['Lead_Priority'] = label_encoder.fit_transform(df['Lead_Priority'])

In [69]:
X = df.drop('Lead_Priority', axis=1)
y = df['Lead_Priority']

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [80]:
def evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "SVM": SVC(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "K-Nearest Neighbors": KNeighborsClassifier()
    }

    reference_date = pd.to_datetime('2022-01-01')
    X_train['Date'] = (pd.to_datetime(X_train['Date']) - reference_date).dt.days
    X_test['Date'] = (pd.to_datetime(X_test['Date']) - reference_date).dt.days

    trained_models = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        trained_models[model_name] = model

        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)

        print(f"--- {model_name} ---")
        print(f"Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(class_report)
        print("-" * 50)
    return trained_models

In [81]:
trained_models = evaluate_models(X_train, X_test, y_train, y_test)

--- Random Forest ---
Accuracy: 0.9050
Confusion Matrix:
[[ 14   0  25]
 [  0  10  31]
 [  1   0 519]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.36      0.52        39
           1       1.00      0.24      0.39        41
           2       0.90      1.00      0.95       520

    accuracy                           0.91       600
   macro avg       0.95      0.53      0.62       600
weighted avg       0.91      0.91      0.88       600

--------------------------------------------------


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


--- Logistic Regression ---
Accuracy: 0.8683
Confusion Matrix:
[[  1   0  38]
 [  0   0  41]
 [  0   0 520]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.03      0.05        39
           1       0.00      0.00      0.00        41
           2       0.87      1.00      0.93       520

    accuracy                           0.87       600
   macro avg       0.62      0.34      0.33       600
weighted avg       0.82      0.87      0.81       600

--------------------------------------------------
--- SVM ---
Accuracy: 0.8667
Confusion Matrix:
[[  0   0  39]
 [  0   0  41]
 [  0   0 520]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.00      0.00      0.00        41
           2       0.87      1.00      0.93       520

    accuracy                           0.87       600
   macro avg       0.29      0.33      0.31     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


--- Gradient Boosting ---
Accuracy: 0.9250
Confusion Matrix:
[[ 23   0  16]
 [  0  16  25]
 [  1   3 516]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.59      0.73        39
           1       0.84      0.39      0.53        41
           2       0.93      0.99      0.96       520

    accuracy                           0.93       600
   macro avg       0.91      0.66      0.74       600
weighted avg       0.92      0.93      0.91       600

--------------------------------------------------
--- K-Nearest Neighbors ---
Accuracy: 0.8450
Confusion Matrix:
[[  0   0  39]
 [  0   0  41]
 [  9   4 507]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.00      0.00      0.00        41
           2       0.86      0.97      0.92       520

    accuracy                           0.84       600
   macro avg       0.29      0.33 

The models we tested predict which leads are most likely to turn into sales. The **Gradient Boosting** model performed the best, accurately predicting sales most of the time, while **Random Forest** also did well. However, all models struggled with certain types of leads. In general, the models were great at predicting common leads but could improve in handling less common ones. We’ll continue to fine-tune these models to make them even more accurate in the future.



In [83]:
import joblib
joblib.dump(trained_models['Gradient Boosting'], 'Gradient_boosting.pkl')

['Gradient_boosting.pkl']