In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import pickle
import bz2file as bz2

## Import dataset
leads_dataset = pd.read_csv('../Datasets/synthetic_leads.csv')
leads_dataset.columns = map(str.lower, leads_dataset.columns)

In [3]:
# Create data pre-processing steps before plugging into model
leads_categorical_columns = ['lead_source',
                             'country',
                             'gender',
                             'education_level',
                             'occupation',
                             'industry',
                             'initial_response',
                             'do_not_contact',
                             'general_knowledge',
                             'business_knowledge',
                             'company_size',
                             'lead_quality']

leads_numeric_columns = ['age',
                         'income',
                         'total_calls_attended',
                         'total_meetings_attended',
                         'company_estimated_revenue',]

leads_response_columns = ['lead_score']

In [4]:
#split data for training

leads_x = leads_dataset.drop(leads_response_columns, axis=1)
leads_y = pd.DataFrame(leads_dataset[leads_response_columns])

data_train, data_test, label_train, label_test = train_test_split(leads_x,
                                                                            leads_y,
                                                                            train_size=0.9,
                                                                            test_size=0.1,
                                                                            random_state=42)

In [5]:
scaler = StandardScaler()
scaler = scaler.fit(data_train[leads_numeric_columns])

In [6]:
def pre_process_leads_data(df,
                           numeric_columns,
                           categorical_columns,
                           fitted_scaler,
                           train_df_columns = None):
    ## create new df with selected columns
    df.columns = map(str.lower, df.columns)
    _df = df[list(numeric_columns + categorical_columns)].copy()

    ## scale the numeric columns with the pre-built scaler
    _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])

    # First, make categorical text lowercase
    _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())
    # Next, create one-hot-encoded variables, add to dataframe, drop old columns
    _df_dummies = pd.get_dummies(_df[categorical_columns])
    print(_df_dummies.head())
    _df = pd.concat([_df, _df_dummies], axis=1)

    _df.drop(categorical_columns, axis=1, inplace = True)

    if train_df_columns:
        _df = _df.reindex(columns=train_df_columns, fill_value=0)

    return _df

In [7]:
data_train_clean = pre_process_leads_data(df = data_train,
                                            numeric_columns = leads_numeric_columns,
                                            categorical_columns = leads_categorical_columns,
                                            fitted_scaler = scaler)

data_test_clean = pre_process_leads_data(df = data_test,
                                           numeric_columns = leads_numeric_columns,
                                           categorical_columns = leads_categorical_columns,
                                           fitted_scaler = scaler,
                                           train_df_columns = data_train_clean.columns.tolist())

      lead_source_inbound  lead_source_organic  lead_source_other   
8353                False                False              False  \
7649                False                 True              False   
2993                 True                False              False   
3369                False                False              False   
9128                False                 True              False   

      lead_source_paid  lead_source_referral  country_australia   
8353              True                 False              False  \
7649             False                 False              False   
2993             False                 False              False   
3369             False                  True              False   
9128             False                 False              False   

      country_brazil  country_canada  country_china  country_france  ...   
8353           False           False          False           False  ...  \
7649           False          

In [8]:
## Train the random forest model
num_estimators = 100
min_samples = 4

rf = RandomForestClassifier(n_estimators=num_estimators,
                            min_samples_split=min_samples)
rf.fit(data_train_clean, label_train.values.ravel())

In [9]:
## Test Model Accuracy
rf_pred = rf.predict(data_test_clean)
rf_pred = pd.DataFrame(rf_pred)
rf_acc = accuracy_score(label_test, rf_pred)
rf_cnf = confusion_matrix(label_test, rf_pred)

print('Accuracy:', rf_acc)
print('Confusion Matrix:')
print(rf_cnf)

Accuracy: 0.06260869565217392
Confusion Matrix:
[[0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [1 3 0 ... 0 0 0]
 ...
 [0 0 0 ... 2 1 1]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 4 0 1]]


In [10]:
## Train Logistic Regression Model and Check Performance
LR_clf = LogisticRegression(max_iter=1000)
LR_clf.fit(data_train_clean, label_train.values.ravel())
LR_pred = LR_clf.predict(data_test_clean)

LR_acc = accuracy_score(label_test, LR_pred)
LR_cnf = confusion_matrix(label_test, LR_pred)
print('Accuracy:', LR_acc)
print('Confusion Matrix:')
print(LR_cnf)

Accuracy: 0.05478260869565217
Confusion Matrix:
[[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 2]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 3]]


In [15]:
#Dump the Model into a PKL file to add to the API
#Since Logistic Regressor performed better we pick that
#import bz2file as bz2
with bz2.BZ2File('model2' + '.pbz2', 'w') as f:
#    joblib.dump(rf, f)
    joblib.dump(rf,f)

In [16]:
#pickle.dump(scaler, open('scaler.pkl', 'wb'))
joblib.dump(scaler,'scaler2.joblib')

['scaler2.joblib']

In [12]:
leads_columns = ['lead_source',
                             'country',
                             'age',
                             'gender',
                             'education_level',
                             'occupation',
                             'industry',
                             'income',
                             'initial_response',
                             'do_not_contact',
                             'total_calls_attended',
                             'total_meetings_attended',
                             'general_knowledge',
                             'business_knowledge',
                             'company_size',
                             'company_estimated_revenue',
                             'lead_quality']
lead = [
    'REFERRAL',
    'Japan',
    24,
    'OTHER',
    'HIGH SCHOOL',
    'RETIRED',
    'MANUFACTURING',
    47000,
    'NEUTRAL',
    'Yes',
    1,
    0,
    'NOVICE',
    'INTERMEDIATE',
    'SMALL',
    72000,
    'COLD']

lead = pd.DataFrame([lead], columns= leads_columns)
lead[leads_categorical_columns] = lead[leads_categorical_columns].apply(lambda x: x.str.lower())
lead = pd.get_dummies(lead)
#lead.head()
#lead.head()
new_df = pd.DataFrame(columns=data_train_clean.columns)
new_df = pd.concat([new_df,lead], axis=0, ignore_index = True, sort = False)
empty_cols = new_df.columns[new_df.isnull().any()]
new_df[empty_cols] = new_df[empty_cols].fillna(False)
new_df[leads_numeric_columns] = scaler.transform(new_df[leads_numeric_columns])
rf.predict(new_df)
#lead_dummy = pd.get_dummies(lead,columns=leads_categorical_columns)
#lead_dummy.head()

array([39], dtype=int64)