In [62]:
import requests
import boto3
import mlflow
from mlflow import pyfunc as ml_pyfunc
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics

## Import dataset
leads_dataset = pd.read_csv('Datasets/synthetic_leads.csv')
leads_dataset.columns = map(str.lower, leads_dataset.columns)

In [63]:
# Create data pre-processing steps before plugging into model
leads_categorical_columns = ['lead_source',
                             'country',
                             'gender',
                             'education_level',
                             'occupation',
                             'industry',
                             'initial_response',
                             'general_knowledge',
                             'business_knowledge',
                             'company_size',
                             'lead_quality']

leads_numeric_columns = ['age',
                         'income',
                         'total_calls_attended',
                         'total_meetings_attended',
                         'company_estimated_revenue',]

leads_response_columns = ['lead_score']

In [64]:
#split data for training, remove extras

leads_x = leads_dataset.drop(leads_response_columns, axis=1)
leads_y = pd.DataFrame(leads_dataset[leads_response_columns])

data_train, data_test, label_train, label_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.9,
                                                                            test_size=0.1,
                                                                            random_state=100)

In [65]:
data_train.info()
label_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 351 to 5640
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   lead_source                9000 non-null   object
 1   country                    9000 non-null   object
 2   age                        9000 non-null   int64 
 3   gender                     9000 non-null   object
 4   education_level            9000 non-null   object
 5   occupation                 9000 non-null   object
 6   industry                   9000 non-null   object
 7   income                     9000 non-null   int64 
 8   initial_response           9000 non-null   object
 9   do_not_contact             9000 non-null   object
 10  total_calls_attended       9000 non-null   int64 
 11  total_meetings_attended    9000 non-null   int64 
 12  general_knowledge          9000 non-null   object
 13  business_knowledge         9000 non-null   object
 14  company_siz

In [66]:
scaler = StandardScaler()
scaler = scaler.fit(data_train[leads_numeric_columns])

In [67]:
def pre_process_leads_data(df,
                           numeric_columns,
                           categorical_columns,
                           fitted_scaler,
                           train_df_columns = None):
    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[list(numeric_columns + categorical_columns)].copy()

    ## scale the numeric columns with the pre-built scaler
    _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])

    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
    _df = pd.concat([_df, _df_dummies], axis=1)
    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)

    return _df

In [68]:
data_train_clean = pre_process_leads_data(df = data_train,
                                            numeric_columns = leads_numeric_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

data_test_clean = pre_process_leads_data(df = data_test,
                                           numeric_columns = leads_numeric_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = data_train_clean.columns.tolist())

In [69]:
data_train_clean.info()
data_test_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 351 to 5640
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              9000 non-null   float64
 1   income                           9000 non-null   float64
 2   total_calls_attended             9000 non-null   float64
 3   total_meetings_attended          9000 non-null   float64
 4   company_estimated_revenue        9000 non-null   float64
 5   lead_source_organic              9000 non-null   bool   
 6   lead_source_other                9000 non-null   bool   
 7   lead_source_paid                 9000 non-null   bool   
 8   lead_source_referral             9000 non-null   bool   
 9   country_brazil                   9000 non-null   bool   
 10  country_canada                   9000 non-null   bool   
 11  country_china                    9000 non-null   bool   
 12  country_france         

In [70]:
logreg = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=logreg, n_features_to_select= 15)
rfe.fit(data_train_clean,label_train.values.ravel())

In [71]:
rfe_col = list(data_train_clean.columns[rfe.support_])
rfe_col

['age',
 'income',
 'total_calls_attended',
 'total_meetings_attended',
 'company_estimated_revenue',
 'lead_source_other',
 'lead_source_referral',
 'education_level_master',
 'occupation_employee',
 'occupation_unemployed',
 'initial_response_neutral',
 'initial_response_positive',
 'company_size_medium',
 'lead_quality_hot',
 'lead_quality_warm']

In [72]:
data_train_clean_rfe = data_train_clean[rfe_col]
data_test_clean_rfe = data_test_clean[rfe_col]

In [73]:
## Train the random forest model
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(data_train_clean, label_train.values.ravel())

In [74]:
rf_pred = rf.predict(data_test_clean)
rf_pred = pd.DataFrame(rf_pred)
rf_acc = accuracy_score(label_test, rf_pred)
rf_cnf = confusion_matrix(label_test, rf_pred)

print('Accuracy:', rf_acc)
print('Confusion Matrix:')
print(rf_cnf)

Accuracy: 0.046
Confusion Matrix:
[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 2 2 0]
 [0 0 0 ... 0 1 0]]


In [75]:
label_test.head()

Unnamed: 0,lead_score
8018,32
9225,22
3854,82
2029,79
3539,99


In [76]:
rf_pred.head()

Unnamed: 0,0
0,20
1,39
2,98
3,84
4,93


In [77]:
LR_clf = LogisticRegression()
LR_clf.fit(data_train_clean, label_train.values.ravel())
LR_pred = LR_clf.predict(data_test_clean)

LR_acc = accuracy_score(label_test, LR_pred)
LR_cnf = confusion_matrix(label_test, LR_pred)
print('Accuracy:', LR_acc)
print('Confusion Matrix:')
print(LR_cnf)

Accuracy: 0.046
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 2 1 0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
