In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os
from IPython.display import display, HTML


In [13]:
# Function to load and clean data
def load_and_clean_data(file_path):
    """Read a CSV file and return both the raw frame and a cleaned copy of it.

    Cleaning removes every row that contains a missing value and every row in
    which any cell equals the string 'Select' (presumably an unfilled
    form-dropdown placeholder -- confirm against the data source).

    Args:
        file_path: Path to the CSV file. Leading/trailing whitespace and the
            invisible characters U+202A and U+FEFF are removed before use.

    Returns:
        A ``(data, cleaned_data)`` tuple: the frame exactly as read from disk,
        and an independent copy containing only the rows that survived
        cleaning (original row index preserved).

    Raises:
        FileNotFoundError: If the normalised path does not point to a file.
    """
    # Strip until nothing changes: the invisible marks may be stacked in any
    # order (e.g. '\ufeff\u202aC:\\...'), and a single fixed-order chain of
    # strip() calls would leave the inner one attached to the path.
    previous = None
    while previous != file_path:
        previous = file_path
        file_path = file_path.strip().strip('\u202a\ufeff')

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")

    data = pd.read_csv(file_path)

    cleaned_data = data.dropna()
    has_no_placeholder = (cleaned_data != 'Select').all(axis=1)
    # .copy() detaches the result from `data`, so callers can assign columns
    # on it without triggering pandas' SettingWithCopyWarning.
    cleaned_data = cleaned_data[has_no_placeholder].copy()
    return data, cleaned_data

# Function to train and save the model
def train_and_save_model(cleaned_data, model_filename='lead_conversion_model.pkl'):
    """Train a random-forest lead-conversion classifier and persist it to disk.

    The 'Converted' column is the target; 'Prospect ID' and 'Lead Number' are
    dropped as identifiers; every remaining column is one-hot encoded with
    ``pd.get_dummies``. 30% of the rows are held out for evaluation, and the
    classification report and accuracy on that hold-out are printed.

    Args:
        cleaned_data: DataFrame containing 'Prospect ID', 'Lead Number',
            'Converted' and the feature columns. It is not modified.
        model_filename: Path the fitted model is written to with
            ``joblib.dump``. Defaults to the name the notebook has always used.

    Returns:
        A ``(model, model_filename, train_columns, accuracy)`` tuple, where
        ``train_columns`` is the column index of the encoded training matrix
        and ``accuracy`` is the hold-out accuracy as a fraction in [0, 1].

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    # Work on a private copy: casting 'Converted' directly on the argument
    # would silently change the caller's frame (and emits
    # SettingWithCopyWarning when that frame is a filtered slice).
    prepared = cleaned_data.copy()
    prepared['Converted'] = prepared['Converted'].astype(int)

    y = prepared['Converted']
    X = prepared.drop(columns=['Prospect ID', 'Lead Number', 'Converted'])

    # Convert categorical variables to dummy variables
    X = pd.get_dummies(X)

    # Split data (fixed seed so the hold-out set is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate model on the hold-out rows only.
    # NOTE(review): a perfect score on a small hold-out should be checked for
    # target leakage among the feature columns before it is trusted.
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print(report)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Save model
    joblib.dump(model, model_filename)

    return model, model_filename, X_train.columns, accuracy


In [14]:
# Location of the training data. The LEADS_CSV_PATH environment variable,
# when set to a non-empty value, overrides the original local default so the
# notebook can run on another machine without editing this cell.
file_path = os.environ.get('LEADS_CSV_PATH') or r'D:\GUVI\Project\Capstone Project\Power bi\Leads\Leads.csv'

# Print the file path for debugging
print(f"Using file path: {file_path}")

# Load and clean data from the given file path
original_data, cleaned_data = load_and_clean_data(file_path)

# Train and save model; the returned names are used by the later cells
model, model_filename, train_columns, accuracy = train_and_save_model(cleaned_data)


Using file path: D:\GUVI\Project\Capstone Project\Power bi\Leads\Leads.csv
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00        66

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

Accuracy: 100.00%


In [15]:
# Report whether the hold-out accuracy (a fraction in [0, 1]) clears the
# 80% bar that the prediction cell further down requires.
print(
    "The model accuracy is below 80%. Please improve the model or data quality."
    if accuracy < 0.80
    else "The model accuracy is 80% or above. Proceeding with predictions."
)


The model accuracy is 80% or above. Proceeding with predictions.


In [16]:
# Function to make predictions and display the results
def predict_and_display(file_path, model_filename='lead_conversion_model.pkl', train_columns=None):
    """Score the leads in a CSV file with a saved model and display the result.

    The file is cleaned the same way as the training data (rows with missing
    values or any 'Select' cell are excluded from scoring), one-hot encoded,
    and aligned to the training columns. Three columns are appended to the
    full table, which is then rendered as HTML:

    * ``prediction``  -- predicted class for the row,
    * ``probability`` -- the model's probability for that predicted class
      (the row-wise maximum of ``predict_proba``),
    * ``action``      -- a human-readable label derived from ``prediction``.

    Rows that were excluded from scoring stay in the table with empty
    ``prediction``/``probability`` values and an explanatory ``action``.

    Args:
        file_path: Path to the CSV file to score. Leading/trailing whitespace
            and the invisible characters U+202A and U+FEFF are removed.
        model_filename: Path of the model file written by ``joblib.dump``.
        train_columns: Column labels of the encoded training matrix. When
            omitted, the names recorded on the fitted model
            (``feature_names_in_``) are used.

    Raises:
        FileNotFoundError: If the normalised path does not point to a file.
        ValueError: If ``train_columns`` is omitted and the model does not
            record its training feature names.
    """
    # Strip until nothing changes: the invisible marks may be stacked in any
    # order, which a single fixed-order chain of strip() calls cannot handle.
    previous = None
    while previous != file_path:
        previous = file_path
        file_path = file_path.strip().strip('\u202a\ufeff')

    # Check if the file exists
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")

    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load model files that come from a trusted source.
    model = joblib.load(model_filename)

    if train_columns is None:
        # Without the training columns the reindex below would be a no-op and
        # the model would receive misaligned features.
        train_columns = getattr(model, 'feature_names_in_', None)
        if train_columns is None:
            raise ValueError(
                "train_columns was not provided and the loaded model does not "
                "record its training feature names."
            )

    original_data = pd.read_csv(file_path)

    # Clean data
    cleaned_data = original_data.dropna()
    cleaned_data = cleaned_data[(cleaned_data != 'Select').all(axis=1)]

    # Prepare data
    X = cleaned_data.drop(columns=['Prospect ID', 'Lead Number'])
    X = pd.get_dummies(X)

    # Align input data with training data: unseen dummy columns are dropped,
    # training columns absent from this file are added as all-zero.
    X = X.reindex(columns=train_columns, fill_value=0)

    # Make predictions, keyed by the surviving rows' index so they can be
    # attached to the right rows of the unfiltered table.
    if len(X) > 0:
        predictions = pd.Series(model.predict(X), index=X.index)
        probabilities = pd.Series(model.predict_proba(X).max(axis=1), index=X.index)
    else:
        # The model cannot be called on zero rows; leave every row unscored.
        predictions = pd.Series(dtype='float64')
        probabilities = pd.Series(dtype='float64')

    skipped = len(original_data) - len(X)
    if skipped:
        print(f"{skipped} row(s) were not scored because they contain missing or 'Select' values.")

    # Add predictions to original data; index alignment leaves unscored rows
    # empty instead of failing on a length mismatch.
    original_data['prediction'] = predictions
    original_data['probability'] = probabilities

    def _describe(prediction):
        if pd.isna(prediction):
            return 'Not scored (missing or placeholder values)'
        return 'High chance of converting' if prediction == 1 else 'Low chance of converting'

    original_data['action'] = original_data['prediction'].apply(_describe)

    # Display the result
    display(HTML(original_data.to_html()))


In [50]:
# Only offer predictions when the trained model cleared the 80% accuracy bar.
if accuracy >= 0.80:
    # Prompt user for another file path to test the model
    test_file_path = input("Please enter the file path for the test data (e.g., D:\\GUVI\\Project\\Capstone Project\\Power bi\\dummy_leads.csv): ")

    # Remove surrounding whitespace and the invisible U+202A / U+FEFF marks
    # that copy-pasted paths can carry. Repeat until stable, because the marks
    # may be stacked in any order and a fixed-order strip chain misses that.
    previous_path = None
    while previous_path != test_file_path:
        previous_path = test_file_path
        test_file_path = test_file_path.strip().strip('\u202a\ufeff')

    # Check if the file exists before loading the model
    if not os.path.isfile(test_file_path):
        raise FileNotFoundError(f"The file at {test_file_path} does not exist.")

    # Run the prediction and display the results
    predict_and_display(test_file_path, model_filename, train_columns)
else:
    print("Model accuracy is below 80%. Exiting without making predictions.")


Please enter the file path for the test data (e.g., D:\GUVI\Project\Capstone Project\Power bi\dummy_leads.csv):  C:\Users\Arivazhagan\Downloads\test_leads.csv


Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Last Activity,Country,Specialization,How did you hear about X Education,What is your current occupation,What matters most to you in choosing a course,Search,Magazine,Newspaper Article,X Education Forums,Newspaper,X Education Chat,Mobile,Social Media,Digital Advertisement,Through Recommendations,Receive More Updates About Our Courses,Tags,Lead Quality,Update me on Supply Chain Content,Get updates on DM Content,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,prediction,probability,action
0,1,1,Landing Page Submission,Google,No,No,0,0,0,0,Email Opened,India,Finance Management,Through Online Search,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Will revert after reading the email,High,No,No,Mumbai,17.0,15.0,15,15,No,No,1,0.65,High chance of converting
1,2,2,API,Direct Traffic,No,No,1,5,300,3,Converted to Lead,India,HR Management,Through Online Search,Student,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Ringing,Low,No,No,Mumbai,15.0,18.0,10,10,No,No,0,0.51,Low chance of converting
2,3,3,Landing Page Submission,Google,No,No,0,4,250,4,Page Visited on Website,India,Marketing Management,Through Online Search,Student,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Will revert after reading the email,High,No,No,Mumbai,18.0,20.0,20,20,No,No,1,0.6,High chance of converting
3,4,4,API,Direct Traffic,No,No,1,3,200,3,Converted to Lead,India,Finance Management,Through Online Search,Working Professional,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Ringing,Low,No,No,Mumbai,14.0,15.0,10,10,No,No,0,0.5,Low chance of converting
4,5,5,Landing Page Submission,Google,No,No,0,5,280,5,Email Opened,India,Finance Management,Through Online Search,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Will revert after reading the email,High,No,No,Mumbai,17.0,15.0,15,15,No,No,1,0.64,High chance of converting
5,6,6,API,Direct Traffic,No,No,1,4,240,4,Converted to Lead,India,HR Management,Through Online Search,Student,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Ringing,Low,No,No,Mumbai,15.0,18.0,10,10,No,No,0,0.53,Low chance of converting
6,7,7,Landing Page Submission,Google,No,No,0,3,220,3,Page Visited on Website,India,Marketing Management,Through Online Search,Student,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Will revert after reading the email,High,No,No,Mumbai,18.0,20.0,20,20,No,No,1,0.61,High chance of converting
7,8,8,API,Direct Traffic,No,No,1,5,300,5,Converted to Lead,India,Finance Management,Through Online Search,Working Professional,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Ringing,Low,No,No,Mumbai,14.0,15.0,10,10,No,No,1,0.52,High chance of converting
8,9,9,Landing Page Submission,Google,No,No,0,4,260,4,Email Opened,India,Finance Management,Through Online Search,Unemployed,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Will revert after reading the email,High,No,No,Mumbai,17.0,15.0,15,15,No,No,1,0.64,High chance of converting
9,10,10,API,Direct Traffic,No,No,1,3,200,3,Converted to Lead,India,HR Management,Through Online Search,Student,Better Career Prospects,No,No,No,No,No,No,No,No,No,No,No,Ringing,Low,No,No,Mumbai,15.0,18.0,10,10,No,No,0,0.52,Low chance of converting
