In [20]:
import requests
import boto3
import mlflow
from mlflow import pyfunc as ml_pyfunc
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics

## Import dataset
leads_dataset = pd.read_csv('synthetic_leads.csv')
leads_dataset.columns = map(str.lower, leads_dataset.columns)

In [21]:
# Create data pre-processing steps before plugging into model
leads_categorical_columns = ['lead_source',
                             'country',
                             'gender',
                             'education_level',
                             'occupation',
                             'industry',
                             'initial_response',
                             'general_knowledge',
                             'business_knowledge',
                             'company_size',
                             'lead_quality']

leads_numeric_columns = ['age',
                         'income',
                         'total_calls_attended',
                         'total_meetings_attended',
                         'company_estimated_revenue',]

leads_response_columns = ['lead_score']

In [22]:
#split data for training, remove extras

leads_x = leads_dataset.drop(leads_response_columns, axis=1)
leads_y = leads_dataset[leads_response_columns]

leads_x_train, leads_x_test, leads_y_train, leads_y_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.7,
                                                                            test_size=0.3,
                                                                            random_state=5050)
leads_x_train.head()

Unnamed: 0,lead_source,country,age,gender,education_level,occupation,industry,income,initial_response,do_not_contact,total_calls_attended,total_meetings_attended,general_knowledge,business_knowledge,company_size,company_estimated_revenue,lead_quality
7072,OTHER,Canada,39,OTHER,PhD,EMPLOYEE,SERVICES,99000,POSITIVE,No,4,2,ADVANCED,ADVANCED,MEDIUM,2654000,HOT
4141,PAID,USA,51,OTHER,BACHELOR,BUSINESSMAN,RETAIL,153000,POSITIVE,No,3,3,ADVANCED,INTERMEDIATE,MEDIUM,2101000,HOT
6227,OTHER,Australia,38,FEMALE,COLLEGE,EMPLOYEE,RETAIL,93000,NEUTRAL,No,3,2,ADVANCED,INTERMEDIATE,MEDIUM,1605000,WARM
9804,OTHER,France,37,OTHER,BACHELOR,UNEMPLOYED,FINANCE,73000,NEGATIVE,Yes,1,0,EXPERT,BASIC,SMALL,744000,COLD
7930,OTHER,Pakistan,28,OTHER,MASTER,OTHER,FINANCE,87000,NEGATIVE,Yes,2,0,EXPERT,NOVICE,SMALL,619000,COLD


In [23]:
scaler = StandardScaler()
scaler = scaler.fit(leads_x_train[leads_numeric_columns])

In [24]:
def pre_process_leads_data(df,
                           numeric_columns,
                           categorical_columns,
                           fitted_scaler,
                           train_df_columns = None):
    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[list(numeric_columns + categorical_columns)].copy()

    ## scale the numeric columns with the pre-built scaler
    _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])

    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)
    _df = pd.concat([_df, _df_dummies], axis=1)
    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)

    return _df

In [25]:
leads_x_train_clean = pre_process_leads_data(df = leads_x_train,
                                            numeric_columns = leads_numeric_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

leads_x_test_clean = pre_process_leads_data(df = leads_x_test,
                                           numeric_columns = leads_numeric_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = leads_x_train_clean.columns.tolist())

In [28]:
leads_x_train_clean.head()

Unnamed: 0,age,income,total_calls_attended,total_meetings_attended,company_estimated_revenue,lead_source_organic,lead_source_other,lead_source_paid,lead_source_referral,country_brazil,...,general_knowledge_intermediate,general_knowledge_novice,business_knowledge_basic,business_knowledge_expert,business_knowledge_intermediate,business_knowledge_novice,company_size_medium,company_size_small,lead_quality_hot,lead_quality_warm
7072,-0.143605,-0.28034,0.135095,0.227912,-0.515709,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
4141,1.093861,0.902358,-0.423809,1.145328,-0.693591,False,False,True,False,False,...,False,False,False,False,True,False,True,False,True,False
6227,-0.246727,-0.411751,-0.423809,0.227912,-0.853138,False,True,False,False,False,...,False,False,False,False,True,False,True,False,False,True
9804,-0.349849,-0.849787,-1.541618,-1.606919,-1.130094,False,True,False,False,False,...,False,False,True,False,False,False,False,True,False,False
7930,-1.277948,-0.543162,-0.982714,-1.606919,-1.170302,False,True,False,False,False,...,False,False,False,False,False,True,False,True,False,False


In [31]:
logm1 = sm.GLM(leads_y_train.astype(float),(sm.add_constant(leads_x_test_clean)), family = sm.families.Binomial())
logm1.fit().summary()

ValueError: Pandas data cast to numpy dtype of object. Check apiInput data with np.asarray(data). The types seen werelead_score    float64
dtype: object and const                              float64
age                                float64
income                             float64
total_calls_attended               float64
total_meetings_attended            float64
company_estimated_revenue          float64
lead_source_organic                   bool
lead_source_other                     bool
lead_source_paid                      bool
lead_source_referral                  bool
country_brazil                        bool
country_canada                        bool
country_china                         bool
country_france                        bool
country_germany                       bool
country_india                         bool
country_japan                         bool
country_pakistan                      bool
country_uk                            bool
country_usa                           bool
gender_male                           bool
gender_other                          bool
education_level_college               bool
education_level_high school           bool
education_level_master                bool
education_level_phd                   bool
occupation_employee                   bool
occupation_other                      bool
occupation_retired                    bool
occupation_self-employed              bool
occupation_unemployed                 bool
industry_healthcare                   bool
industry_manufacturing                bool
industry_retail                       bool
industry_services                     bool
industry_technology                   bool
initial_response_neutral              bool
initial_response_positive             bool
general_knowledge_basic               bool
general_knowledge_expert              bool
general_knowledge_intermediate        bool
general_knowledge_novice              bool
business_knowledge_basic              bool
business_knowledge_expert             bool
business_knowledge_intermediate       bool
business_knowledge_novice             bool
company_size_medium                   bool
company_size_small                    bool
lead_quality_hot                      bool
lead_quality_warm                     bool
dtype: object. The data was
      lead_score
7072        52.0
4141        93.0
6227        74.0
9804        42.0
7930        18.0
...          ...
3153        87.0
5491        65.0
5748        55.0
5271        63.0
6497        74.0

[7000 rows x 1 columns]
and
       const       age    income  total_calls_attended   
6139    1.0 -0.246727 -0.433653              0.135095  \
1012    1.0  1.815716  1.822234              1.811809   
1784    1.0  1.609471  1.077572              0.135095   
9452    1.0 -1.174826 -0.280340             -1.541618   
1329    1.0  1.815716  0.442420             -0.423809   
...     ...       ...       ...                   ...   
2597    1.0 -0.040483  1.756528              0.135095   
8487    1.0 -2.206047 -0.981198             -0.982714   
5036    1.0  0.268884 -0.433653             -0.423809   
5687    1.0 -0.556093 -0.630769              0.135095   
9026    1.0 -1.690437 -0.740278             -1.541618   

      total_meetings_attended  company_estimated_revenue  lead_source_organic   
6139                 0.227912                  -0.227495                False  \
1012                 0.227912                   1.606008                False   
1784                 0.227912                   0.749087                False   
9452                -1.606919                  -1.173841                False   
1329                 0.227912                   1.406896                False   
...                       ...                        ...                  ...   
2597                 0.227912                   0.897375                False   
8487                -1.606919                  -1.224342                 True   
5036                 0.227912                  -0.610280                False   
5687                 0.227912                   0.714347                False   
9026                -1.606919                  -1.205686                False   

      lead_source_other  lead_source_paid  lead_source_referral  ...   
6139               True             False                 False  ...  \
1012              False              True                 False  ...   
1784              False              True                 False  ...   
9452              False             False                  True  ...   
1329              False             False                  True  ...   
...                 ...               ...                   ...  ...   
2597              False              True                 False  ...   
8487              False             False                 False  ...   
5036              False              True                 False  ...   
5687              False              True                 False  ...   
9026              False              True                 False  ...   

      general_knowledge_intermediate  general_knowledge_novice   
6139                            True                     False  \
1012                           False                     False   
1784                           False                     False   
9452                           False                      True   
1329                           False                     False   
...                              ...                       ...   
2597                           False                      True   
8487                           False                     False   
5036                            True                     False   
5687                           False                      True   
9026                           False                     False   

      business_knowledge_basic  business_knowledge_expert   
6139                     False                      False  \
1012                     False                      False   
1784                     False                      False   
9452                     False                      False   
1329                     False                      False   
...                        ...                        ...   
2597                     False                      False   
8487                     False                      False   
5036                     False                      False   
5687                     False                      False   
9026                     False                       True   

      business_knowledge_intermediate  business_knowledge_novice   
6139                            False                      False  \
1012                            False                      False   
1784                             True                      False   
9452                            False                       True   
1329                             True                      False   
...                               ...                        ...   
2597                             True                      False   
8487                            False                       True   
5036                             True                      False   
5687                             True                      False   
9026                            False                      False   

      company_size_medium  company_size_small  lead_quality_hot   
6139                 True               False             False  \
1012                 True               False              True   
1784                 True               False              True   
9452                False                True             False   
1329                 True               False             False   
...                   ...                 ...               ...   
2597                 True               False              True   
8487                False                True             False   
5036                 True               False              True   
5687                 True               False              True   
9026                False                True             False   

      lead_quality_warm  
6139               True  
1012              False  
1784              False  
9452              False  
1329               True  
...                 ...  
2597              False  
8487              False  
5036              False  
5687              False  
9026              False  

[3000 rows x 50 columns]
before. After,
[[52.]
 [93.]
 [74.]
 ...
 [55.]
 [63.]
 [74.]]
[[1.0 -0.246727056978703 -0.43365269561013614 ... False False True]
 [1.0 1.8157155019548061 1.822233663850669 ... False True False]
 [1.0 1.6094712460614553 1.0775721471354518 ... False True False]
 ...
 [1.0 0.2688835827546743 -0.43365269561013614 ... False True False]
 [1.0 -0.5560934408187294 -0.6307689794465172 ... False True False]
 [1.0 -1.6904368482321595 -0.7402780260222844 ... True False False]].

In [32]:
logreg = LogisticRegression()

In [33]:
rfe = RFE(estimator=logreg, n_features_to_select= 15)
rfe.fit(leads_x_train_clean,leads_y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

In [34]:
rfe_col = list(leads_x_train_clean.columns[rfe.support_])
rfe_col

['age',
 'income',
 'total_calls_attended',
 'total_meetings_attended',
 'company_estimated_revenue',
 'lead_source_other',
 'lead_source_referral',
 'education_level_master',
 'occupation_employee',
 'occupation_unemployed',
 'initial_response_neutral',
 'initial_response_positive',
 'company_size_medium',
 'lead_quality_hot',
 'lead_quality_warm']

In [35]:
leads_x_train_clean_rfe = leads_x_train_clean[rfe_col]
leads_x_train_clean_rfe.head()

Unnamed: 0,age,income,total_calls_attended,total_meetings_attended,company_estimated_revenue,lead_source_other,lead_source_referral,education_level_master,occupation_employee,occupation_unemployed,initial_response_neutral,initial_response_positive,company_size_medium,lead_quality_hot,lead_quality_warm
7072,-0.143605,-0.28034,0.135095,0.227912,-0.515709,True,False,False,True,False,False,True,True,True,False
4141,1.093861,0.902358,-0.423809,1.145328,-0.693591,False,False,False,False,False,False,True,True,True,False
6227,-0.246727,-0.411751,-0.423809,0.227912,-0.853138,True,False,False,True,False,True,False,True,False,True
9804,-0.349849,-0.849787,-1.541618,-1.606919,-1.130094,True,False,False,False,True,False,False,False,False,False
7930,-1.277948,-0.543162,-0.982714,-1.606919,-1.170302,True,False,True,False,False,False,False,False,False,False


In [36]:
leads_x_test_clean_rfe = leads_x_test_clean[rfe_col]
leads_x_test_clean_rfe.head()

Unnamed: 0,age,income,total_calls_attended,total_meetings_attended,company_estimated_revenue,lead_source_other,lead_source_referral,education_level_master,occupation_employee,occupation_unemployed,initial_response_neutral,initial_response_positive,company_size_medium,lead_quality_hot,lead_quality_warm
6139,-0.246727,-0.433653,0.135095,0.227912,-0.227495,True,False,False,True,False,False,True,True,False,True
1012,1.815716,1.822234,1.811809,0.227912,1.606008,False,False,False,False,False,False,True,True,True,False
1784,1.609471,1.077572,0.135095,0.227912,0.749087,False,False,False,False,False,False,True,True,True,False
9452,-1.174826,-0.28034,-1.541618,-1.606919,-1.173841,False,True,True,False,False,False,False,False,False,False
1329,1.815716,0.44242,-0.423809,0.227912,1.406896,False,True,False,False,False,False,True,True,False,True


In [37]:
## Train the random forest model
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(leads_x_train_clean, leads_y_train.values.ravel())

In [38]:
leads_y_test_predicted = rf.predict(leads_x_test_clean)
leads_y_test_predicted = pd.DataFrame(leads_y_test_predicted)

accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)
#auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted, multi_class='ovr')

print(accuracy)
#print(auc_score)

0.050666666666666665


In [39]:
leads_y_test_predicted = pd.DataFrame(leads_y_test_predicted)
leads_y_test_predicted.head()

Unnamed: 0,0
0,60
1,96
2,94
3,19
4,79


In [40]:
leads_y_test.head()

Unnamed: 0,lead_score
6139,64
1012,89
1784,77
9452,13
1329,85


In [41]:
lr = sm.GLM(leads_y_train,leads_x_train_clean_rfe, family = sm.families.Binomial()).fit()

ValueError: Pandas data cast to numpy dtype of object. Check apiInput data with np.asarray(data). The types seen werelead_score    int64
dtype: object and age                          float64
income                       float64
total_calls_attended         float64
total_meetings_attended      float64
company_estimated_revenue    float64
lead_source_other               bool
lead_source_referral            bool
education_level_master          bool
occupation_employee             bool
occupation_unemployed           bool
initial_response_neutral        bool
initial_response_positive       bool
company_size_medium             bool
lead_quality_hot                bool
lead_quality_warm               bool
dtype: object. The data was
      lead_score
7072          52
4141          93
6227          74
9804          42
7930          18
...          ...
3153          87
5491          65
5748          55
5271          63
6497          74

[7000 rows x 1 columns]
and
            age    income  total_calls_attended  total_meetings_attended   
7072 -0.143605 -0.280340              0.135095                 0.227912  \
4141  1.093861  0.902358             -0.423809                 1.145328   
6227 -0.246727 -0.411751             -0.423809                 0.227912   
9804 -0.349849 -0.849787             -1.541618                -1.606919   
7930 -1.277948 -0.543162             -0.982714                -1.606919   
...        ...       ...                   ...                      ...   
3153 -0.246727  0.048187             -0.423809                 0.227912   
5491  0.372006 -0.608867             -0.423809                 0.227912   
5748  0.268884 -0.543162             -0.423809                 0.227912   
5271 -0.040483 -0.389849              0.135095                 0.227912   
6497 -0.452971 -0.652671             -0.423809                 0.227912   

      company_estimated_revenue  lead_source_other  lead_source_referral   
7072                  -0.515709               True                 False  \
4141                  -0.693591              False                 False   
6227                  -0.853138               True                 False   
9804                  -1.130094               True                 False   
7930                  -1.170302               True                 False   
...                         ...                ...                   ...   
3153                   0.258866              False                 False   
5491                   1.361220              False                 False   
5748                   0.394931               True                 False   
5271                   0.860384              False                 False   
6497                   0.927934              False                 False   

      education_level_master  occupation_employee  occupation_unemployed   
7072                   False                 True                  False  \
4141                   False                False                  False   
6227                   False                 True                  False   
9804                   False                False                   True   
7930                    True                False                  False   
...                      ...                  ...                    ...   
3153                   False                False                  False   
5491                    True                 True                  False   
5748                    True                False                   True   
5271                    True                 True                  False   
6497                    True                False                   True   

      initial_response_neutral  initial_response_positive   
7072                     False                       True  \
4141                     False                       True   
6227                      True                      False   
9804                     False                      False   
7930                     False                      False   
...                        ...                        ...   
3153                     False                       True   
5491                      True                      False   
5748                      True                      False   
5271                      True                      False   
6497                      True                      False   

      company_size_medium  lead_quality_hot  lead_quality_warm  
7072                 True              True              False  
4141                 True              True              False  
6227                 True             False               True  
9804                False             False              False  
7930                False             False              False  
...                   ...               ...                ...  
3153                 True              True              False  
5491                 True             False               True  
5748                 True              True              False  
5271                 True             False               True  
6497                 True              True              False  

[7000 rows x 15 columns]
before. After,
[[52]
 [93]
 [74]
 ...
 [55]
 [63]
 [74]]
[[-0.14360492903202754 -0.280340030404062 0.1350951979872874 ... True
  True False]
 [1.093860606328078 0.9023576726142242 -0.4238092854116561 ... True True
  False]
 [-0.246727056978703 -0.4117508862949827 -0.4238092854116561 ... True
  False True]
 ...
 [0.2688835827546743 -0.5431617421859034 -0.4238092854116561 ... True
  True False]
 [-0.04048280108535209 -0.3898490769798293 0.1350951979872874 ... True
  False True]
 [-0.45297131287205394 -0.6526707887616706 -0.4238092854116561 ... True
  True False]].

In [42]:
leads_y_test_predicted = rf.predict(leads_x_test_clean)
leads_y_test_predicted = pd.DataFrame(leads_y_test_predicted)

accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)
#auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted, multi_class='ovr')

print(accuracy)

0.050666666666666665


In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix

LR_clf = LogisticRegression(max_iter=1000)
LR_clf.fit(leads_x_train_clean_rfe, leads_y_train)
LR_pred = LR_clf.predict(leads_x_test_clean_rfe)

LR_acc = accuracy_score(leads_y_test, LR_pred)
LR_cnf = confusion_matrix(leads_y_test, LR_pred)
print('Accuracy:', LR_acc)
print('Confusion Matrix:')
print(LR_cnf)

  y = column_or_1d(y, warn=True)


Accuracy: 0.052
Confusion Matrix:
[[0 5 3 ... 0 0 0]
 [2 4 0 ... 0 0 0]
 [1 8 1 ... 0 0 0]
 ...
 [0 0 0 ... 3 8 0]
 [0 0 0 ... 3 8 0]
 [0 0 0 ... 5 7 0]]


In [44]:
leads_y_test.head()

Unnamed: 0,lead_score
6139,64
1012,89
1784,77
9452,13
1329,85


In [48]:
LR_pred[2]

87