In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there
# (Colab-only: the google.colab package is not available in other environments).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Read the raw CSV export from the mounted Drive folder into a DataFrame.
data = pd.read_csv(r'/content/drive/MyDrive/Raw_data.csv')
# Preview the first five rows.
data.head()

In [None]:
# First batch of columns removed from the raw export (dropped in place).
unused_export_columns = [
    '@odata.etag',
    'Canceled',
    'Attempt_Failed',
    'Document_Type',
    'Document_No',
    'Time_of_Interaction',
    'Interaction_Template_Code',
    'Attachment',
    'Contact_Company_No',
    'Campaign_Response',
    'Campaign_Entry_No',
    'Campaign_Target',
]
data.drop(columns=unused_export_columns, inplace=True)

In [None]:
# Total number of missing cells across the whole frame.
data.isnull().sum().sum()

In [None]:
# Remove, in place, every column that contains no values at all.
data.dropna(axis='columns', how='all', inplace=True)

In [None]:
# Preview the first three rows after the empty columns were removed.
data.head(3)

In [None]:
# Print the remaining column names as a plain list.
print(data.columns.to_list())

In [None]:
# Second batch of columns removed before modelling (dropped in place).
# The names and their order are unchanged; only the layout is reflowed.
columns_not_needed = [
    'Delivery_Status', 'Correspondence_Type', 'Information_Flow',
    'Initiated_By', 'Evaluation', 'Cost_LCY', 'Campaign_No', 'Comment',
    'Line_No', 'Type', 'FilteredTypeField', 'No', 'IC_Partner_Ref_Type',
    'Nonstock', 'VAT_Percent', 'Location_Code', 'Bin_Code', 'Quantity',
    'Unit_of_Measure_Code', 'Unit_of_Measure', 'Unit_Cost_LCY',
    'PriceExists', 'Unit_Price', 'Tax_Liable', 'Line_Discount_Percent',
    'VAT_Prod_Posting_Group', 'Line_Amount', 'LineDiscExists',
    'Line_Discount_Amount', 'Allow_Invoice_Disc', 'Inv_Discount_Amount',
    'Allow_Item_Charge_Assignment', 'Qty_to_Assign', 'Qty_Assigned',
    'Job_No', 'Job_Task_No', 'Job_Contract_Entry_No',
    'Blanket_Order_Line_No', 'FA_Posting_Date',
    'Depr_until_FA_Posting_Date', 'Depreciation_Book_Code',
    'Use_Duplication_List', 'Appl_from_Item_Entry', 'Appl_to_Item_Entry',
    'TIMS_Item_Code', 'Shortcut_Dimension_1_Code',
    'Shortcut_Dimension_2_Code', 'ShortcutDimCode3', 'ShortcutDimCode4',
    'ShortcutDimCode5', 'ShortcutDimCode6', 'TotalSalesLine_Line_Amount',
    'Invoice_Discount_Amount', 'Invoice_Disc_Pct',
    'Invoice_Discount_Percent', 'Total_Amount_Excl_VAT',
    'Total_VAT_Amount', 'Total_Amount_Incl_VAT', 'Sell_to_Customer_No',
    'Sell_to_Customer_Name', 'Posting_Description', 'Retail_Type',
    'Sell_to_Address', 'Sell_to_Address_2', 'Sell_to_City',
    'Sell_to_County', 'Sell_to_Post_Code', 'Sell_to_Country_Region_Code',
    'Sell_to_Contact_No', 'Sell_to_Contact', 'Your_Reference',
    'Document_Date', 'Posting_Date', 'Due_Date',
    'Incoming_Document_Entry_No', 'External_Document_No',
    'ShortCutDim1Name', 'ShortCutDim2Name', 'Responsibility_Center',
    'Assigned_User_ID', 'Status', 'Credit_Status_TMN', 'Exit_Point',
    'Job_Queue_Status', 'Sell_to_Phone_No', 'Amount_To_Pay', 'Amount_Paid',
    'Reviewer_Status', 'Reviewer_ID', 'Reviewed_Date',
    'Enable_LPO_Authorization', 'Enable_Customer_Ticket', 'Currency_Code',
    'Shipment_Date', 'Quote_No', 'Prices_Including_VAT',
    'VAT_Bus_Posting_Group', 'Payment_Terms_Code', 'Payment_Method_Code',
    'EU_3_Party_Trade',
    'SelectedPayments', 'Payment_Discount_Percent', 'Pmt_Discount_Date',
    'ShippingOptions', 'Ship_to_Name', 'Ship_to_Address',
    'Ship_to_Address_2', 'Ship_to_City', 'Ship_to_County',
    'Ship_to_Post_Code', 'Ship_to_Country_Region_Code', 'Ship_to_Contact',
    'BillToOptions', 'Bill_to_Name', 'Bill_to_Address',
    'Bill_to_Address_2', 'Bill_to_City', 'Bill_to_County',
    'Bill_to_Post_Code', 'Bill_to_Country_Region_Code',
    'Bill_to_Contact_No', 'Bill_to_Contact', 'Applies_to_Doc_Type',
    'Applies_to_Doc_No',
]
data.drop(columns=columns_not_needed, inplace=True)

In [None]:
# Show the columns that are left after the second drop.
data.columns


In [None]:
# Remove the Duration_Min column in place.
data.drop('Duration_Min', axis='columns', inplace=True)

In [None]:
# Preview the first five rows of the reduced frame.
data.head()

In [None]:
# Count of missing values in each remaining column.
data.isnull().sum()

In [None]:
# Preview the first three rows before missing values are filled.
data.head(3)

In [None]:
# Placeholder written into each text column wherever the value is missing.
# lead_scoring later compares against these exact strings.
placeholder_by_column = {
    'WorkDescription': 'No Description',
    'Description': 'No Description',
    'Salesperson_Code': 'No Salesperson',
}
for column_name, placeholder in placeholder_by_column.items():
  data[column_name] = data[column_name].fillna(placeholder)

In [None]:
# Re-check per-column missing counts after filling the placeholder values.
data.isnull().sum()

In [None]:
# Keep only rows that have both an Entry_No and a Contact_Company_Name;
# a row missing either one is discarded.
required_columns = ['Entry_No', 'Contact_Company_Name']
data_cleaned = data.dropna(subset=required_columns)


In [None]:
# Per-column missing counts in the row-filtered frame.
data_cleaned.isnull().sum()

# LEAD SCORING LOGIC



The score counts how many of `Salesperson_Code`, `WorkDescription` and `Description` hold a real value rather than the placeholder used to fill missing entries.

* If all three fields are filled, the lead is considered **High** priority, indicating strong qualification and readiness for follow-up.
* If two fields are filled, the lead is marked as **Medium**, suggesting partial qualification.
* Leads with only one filled field are labeled **Low**, showing minimal engagement, while those with none are assigned **Very Low**, indicating insufficient data to act upon.


In [None]:
def lead_scoring(row):
  """Return the priority label for one lead.

  A field counts as filled when its value differs from the placeholder that
  was written in place of missing data. The label depends only on how many
  of the three fields are filled:

    3 -> 'High', 2 -> 'Medium', 1 -> 'Low', 0 -> 'Very Low'

  Parameters:
    row: mapping-like object (e.g. a DataFrame row) indexable by the keys
      'Salesperson_Code', 'WorkDescription' and 'Description'.

  Returns:
    One of 'High', 'Medium', 'Low' or 'Very Low'.
  """
  # (column, placeholder meaning "not filled") pairs, checked in this order.
  placeholder_checks = (
      ('Salesperson_Code', 'No Salesperson'),
      ('WorkDescription', 'No Description'),
      ('Description', 'No Description'),
  )
  populated = sum(
      1 for column, placeholder in placeholder_checks
      if row[column] != placeholder
  )

  # Index position equals the number of filled fields (0..3).
  labels_by_count = ('Very Low', 'Low', 'Medium', 'High')
  return labels_by_count[populated]

# Score the rows of data_cleaned itself. Applying the function to the
# unfiltered `data` frame only produced the right result through index
# alignment and also scored rows that had already been dropped.
data_cleaned['Lead_Score'] = data_cleaned.apply(lead_scoring, axis=1)

In [None]:
# Number of leads in each Lead_Score category.
print(data_cleaned['Lead_Score'].value_counts())

In [None]:
# Split the columns by dtype for the encoding / scaling step.
# NOTE(review): Lead_Score holds string labels, so it is expected to fall into
# categorical_columns together with the feature columns; confirm this is intended.
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
numerical_columns = data_cleaned.select_dtypes(include=['int64', 'float64']).columns

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()

# A fresh transformer is fitted for every column and kept in a dict keyed by
# column name. Refitting one shared instance discarded every mapping except
# the last, so predictions could not be decoded (inverse_transform) and new
# data could not be encoded or scaled consistently with the training data.
# NOTE(review): the transformers are fitted on the full frame, i.e. before the
# train/test split performed later in the notebook.
label_encoders = {}
min_max_scalers = {}

for col in categorical_columns:
  label_encoder = LabelEncoder()
  data_cleaned[col] = label_encoder.fit_transform(data_cleaned[col])
  label_encoders[col] = label_encoder

for col in numerical_columns:
  min_max_scaler = MinMaxScaler()
  # Double brackets: the scaler expects a 2-D input (one column here).
  data_cleaned[col] = min_max_scaler.fit_transform(data_cleaned[[col]])
  min_max_scalers[col] = min_max_scaler

In [None]:
# Preview the encoded and scaled frame.
data_cleaned.head()

In [None]:
# Separate the target from the features.
# NOTE(review): X still contains Salesperson_Code, WorkDescription and
# Description, the columns Lead_Score is computed from, so the target is a
# deterministic function of the features. Confirm this is intended before
# reading the model scores as evidence of predictive power.
target_column = 'Lead_Score'
y = data_cleaned[target_column]
X = data_cleaned.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a random forest with default hyper-parameters (fixed seed) and predict
# on the held-out rows. fit() returns the estimator itself.
RFC = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rfc = RFC.predict(X_test)

# Overall accuracy, then per-class precision / recall / F1.
rfc_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Random Forest Accuracy:", rfc_accuracy)
rfc_report = classification_report(y_test, y_pred_rfc)
print("Random Forest Classification Report:\n", rfc_report)


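Accuracy: The model classifies almost all leads correctly, with 99.95% of predictions being accurate. This is expected rather than surprising: `Lead_Score` is computed directly from `Salesperson_Code`, `WorkDescription` and `Description`, and those same columns are among the model's input features, so the model is largely re-learning the scoring rule.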

Precision: Both classes (0 and 1, the label-encoded `Lead_Score` values) have near-perfect precision, meaning that when the model predicts a class, it is almost always correct.

Recall: Both classes also have perfect recall, meaning the model correctly identifies nearly all instances of both classes.

F1-Score: The F1-score is 1 for both classes, indicating a perfect balance between precision and recall.

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Gradient-boosted trees with default hyper-parameters (fixed seed), trained
# and evaluated on the same split as the random forest.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Overall accuracy, then per-class precision / recall / F1.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", xgb_accuracy)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Classification Report:\n", xgb_report)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the random forest on the full feature matrix.
cross_val_scores = cross_val_score(estimator=RFC, X=X, y=y, cv=5)
mean_cv_score = cross_val_scores.mean()

print("Cross-validation scores:", cross_val_scores)
print("Average cross-validation score:", mean_cv_score)


This is an indication that the Random Forest Classifier generalizes well across the data, with no sign of overfitting.

In [None]:
import joblib
# Persist the fitted random forest to the current working directory.
# NOTE(review): only the model is saved; the encoders and scaler used to
# prepare its inputs are not, so the file alone cannot score raw data.
joblib.dump(RFC, 'lead_scoring_model.pkl')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Base estimator; the fixed seed makes every candidate reproducible.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 5 * 5 * 3 * 3 * 2 * 2 = 900 candidates.
# 'auto' is deliberately not listed under max_features: for classifiers it was
# an alias of 'sqrt' and has been removed from scikit-learn (1.3+), where every
# fit using it fails. Listing it either duplicated 'sqrt' or wasted a third of
# the search on failing fits.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Exhaustive search with 5-fold cross-validation on all available cores;
# verbose=2 logs each fit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

# Search on the training split only, leaving the held-out rows untouched.
grid_search.fit(X_train, y_train)

# Best hyper-parameter combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 5 folds for each of 1350 candidates, totalling 6750 fits
