In [1]:
###### Lead Scoring Model ######
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the dataset
# the dataset is stored in a CSV file named 'lead.csv' in the same directory as the Python script
df = pd.read_csv('Lead.csv')


# Replace missing or NaN values with a common value
df = df.fillna('9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0')



# Drop rows where STATUS is not WON or LOST
# This will remove any rows from the DataFrame where the STATUS column has a value other than 'WON' or 'LOST'.
# The resulting DataFrame will only contain leads that have been either won or lost.

df = df[df['status'].isin(['WON', 'LOST'])]



# Perform feature selection - keep columns 2-5, 7, and 9-13
# which would involve identifying which features are likely to be important based on your understanding of the business problem and the domain
df = df.iloc[:, [2, 3, 4, 5, 7, 9, 10, 11, 12, 13]]


# Convert all columns to categorical variables
df = df.astype('category')


# Encode categorical variables as numerical values
label_encoders = {}
for col in df.columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])
    
    

# Split the dataset into training and testing sets
# After running this code, you will have four variables: X_train, X_test, y_train, and y_test, which contain the training and testing input features and target variable, respectively.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size=0.2, random_state=42)



# Train a machine learning model - a random forest regressor
# This code first imports the RandomForestRegressor class from Scikit-learn's ensemble module.
# It then creates a new instance of the class and sets the n_estimators parameter to 100 and the random_state parameter to 42 (for reproducibility).
# Finally, it fits the model to the training data using the fit method of the RandomForestRegressor class.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



# Predict the lead scores for the testing set
y_pred = model.predict(X_test)



# Evaluate the performance of the model
# Evaluate the performance of the model using regression evaluation metrics (mean squared error, mean absolute error, and R-squared) on the testing set.
print('Accuracy:', accuracy_score(y_test, y_pred.round()))
print('Precision:', precision_score(y_test, y_pred.round(), average='weighted'))
print('Recall:', recall_score(y_test, y_pred.round(), average='weighted'))
print('F1-score:', f1_score(y_test, y_pred.round(), average='weighted'))


Accuracy: 0.9869386873920553
Precision: 0.9890095330433372
Recall: 0.9869386873920553
F1-score: 0.9872438488756627


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
