In [None]:
#import required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import datetime
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifact

# Set plot aesthetics: apply seaborn's "white" style to subsequent figures.
sns.set(style="white")
# Pandas display options.
# Floats are rendered with thousands separators. A '%.5f' formatter that used to be
# set immediately before this one was dead code (it was overwritten straight away),
# so only the formatter that actually took effect is kept.
pd.set_option('display.float_format', '{:,}'.format)
# Show up to 200 columns / rows before pandas truncates the output.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [None]:
# Load the marketing-qualified-leads CSV into the `leads` DataFrame and preview it.
leads_csv_path = 'datasets_78342_179673_olist_marketing_qualified_leads_dataset.csv'
leads = pd.read_csv(leads_csv_path)
leads.head()

In [None]:
# Print a structural summary of `leads` (columns, dtypes, non-null counts).
leads.info()

In [None]:
# Descriptive statistics for every column; include="all" also covers non-numeric columns.
leads.describe(include="all")

In [None]:
# Frequency of each lead origin (missing values are not counted by value_counts' default).
leads['origin'].value_counts()

In [None]:
# Replace missing origins with the explicit category 'unknown'.
origin_filled = leads['origin'].fillna('unknown')
leads['origin'] = origin_filled

In [None]:
# Count the remaining missing values in each column.
leads.isna().sum()

In [None]:
# Re-run the all-column summary now that missing `origin` values have been filled.
leads.describe(include="all")

In [None]:
# Parse the contact-date column into pandas datetimes.
# pd.to_datetime is used instead of .astype("datetime64"): casting to the unit-less
# "datetime64" dtype is rejected by pandas 2.x, whereas to_datetime works on old and
# new versions alike.
leads["first_contact_date"] = pd.to_datetime(leads["first_contact_date"])

In [None]:
# Collapse each contact date to a monthly Period so leads can be grouped by month.
contact_month = leads['first_contact_date'].dt.to_period('M')
leads['first_contact_date'] = contact_month
leads.head()

In [None]:
# Load the closed-deals CSV into the `closed` DataFrame and preview it.
closed_csv_path = 'datasets_78342_179673_olist_closed_deals_dataset.csv'
closed = pd.read_csv(closed_csv_path)
closed.head()

In [None]:
# Descriptive statistics for every column of the closed-deals frame.
closed.describe(include="all")

In [None]:
# Keep only the join key (mql_id) and the seller id; the other deal columns are not used.
closed_columns = ['mql_id', 'seller_id']
closed = closed[closed_columns]
closed.head()

In [None]:
# Left-join closed deals onto leads so every lead is kept;
# leads without a closed deal get a missing seller_id.
data = leads.merge(closed, how='left', on='mql_id')
data.head()

In [None]:
# Turn seller_id into a 0/1 conversion flag:
# missing values become 0, any other truthy value becomes 1.
seller_filled = data['seller_id'].fillna(0)
data['seller_id'] = seller_filled.map(lambda value: int(bool(value)))
data.head()

In [None]:
# Per contact month: number of leads ('count') and number of converted leads ('sum').
flag_by_month = data.groupby('first_contact_date')['seller_id']
monthly_conversion = flag_by_month.agg(['count', 'sum'])

# Conversion rate as a percentage, rounded to one decimal place.
conversion_pct = (monthly_conversion['sum'] / monthly_conversion['count']) * 100
monthly_conversion['conversion_rate(%)'] = conversion_pct.round(1)
monthly_conversion

In [None]:
# Line chart of the monthly conversion rate.
conversion_rate = monthly_conversion['conversion_rate(%)']
ax = conversion_rate.plot(kind='line', figsize=(12, 6))
ax.set_title('Conversion Rate (Jun 2017 - May 2018)', fontsize=14)

### There is no seasonality in conversion. The company is improving lead generation and conversion over time. Hence the contact date is not useful for predicting whether a lead will be converted.

In [None]:
# Remove the contact-month column; it is not used as a model feature.
data = data.drop(columns=['first_contact_date'])
data.head()

### Label-encode the categorical features

In [None]:
# Label-encode the two categorical feature columns as integer codes.
# Each encoder is kept in its own variable so the mapping can be reused or inverted later.
encoder_landing_page_id = preprocessing.LabelEncoder()
data['landing_page_id'] = encoder_landing_page_id.fit_transform(data['landing_page_id'])

encoder_origin = preprocessing.LabelEncoder()
data['origin'] = encoder_origin.fit_transform(data['origin'])

data.head()

In [None]:
# Feature matrix: the two encoded categorical columns. Target: the 0/1 conversion flag.
feature_columns = ['landing_page_id', 'origin']
target_column = 'seller_id'

X = data[feature_columns]
y = data[target_column]

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

### 70% train, 30% test

In [None]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [None]:
# Class balance of the training labels before resampling.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Oversample the minority class with SMOTE, on the training split only.
# fit_resample is the current imbalanced-learn API; the older fit_sample alias has been
# removed from the library. to_numpy() replaces the deprecated Series.ravel() and still
# hands SMOTE a plain 1-D array, so y_train_res stays a NumPy array.
# NOTE(review): both features are label-encoded categories, and SMOTE interpolates between
# samples as if the codes were numeric - consider SMOTENC / SMOTEN; confirm this is intended.
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class balance after resampling.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

In [None]:
# Baseline: random forest with default hyper-parameters, trained on the resampled data.
# random_state is fixed (same seed as the train/test split) so the reports below are
# reproducible across kernel restarts; the forest is otherwise stochastic.
rfc = RandomForestClassifier(random_state=101)
rfc.fit(X_train_res, y_train_res)

In [None]:
# Report on the (resampled) training data - shows how well the default model fits it.
train_predictions = rfc.predict(X_train_res)
train_report = classification_report(y_train_res, train_predictions)
print(train_report)

# Report on the untouched test split - the default model's out-of-sample performance.
predictions = rfc.predict(X_test)
test_report = classification_report(y_test, predictions)
print(test_report)

In [None]:
# Hyper-parameter grid to search: 4 values of min_samples_split x 3 forest sizes.
param_grid = {
    'min_samples_split': range(2, 10, 2),
    'n_estimators': [10, 50, 100]
}
# Base model. The seed is fixed so that the selected hyper-parameters do not change
# from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=101)
# Grid search with 3-fold cross-validation, optimising recall, using all CPU cores.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, scoring="recall",
                          cv = 3, n_jobs = -1,verbose = 1)

In [None]:
# Fit the grid search on the SMOTE-resampled training data.
# NOTE(review): the cross-validation folds are drawn from data that was already
# oversampled, so validation folds contain synthetic samples and the reported recall
# may be optimistic - consider resampling inside an imblearn Pipeline instead.
grid_search.fit(X_train_res, y_train_res)

In [None]:
# Report the best cross-validated recall and the hyper-parameters that achieved it.
best_recall = grid_search.best_score_
best_params = grid_search.best_params_
print('We can get recall of', best_recall, 'using', best_params)

In [None]:
# Refit the forest with the hyper-parameters selected by the grid search.
# Taking them from grid_search.best_params_ (instead of hard-coding
# min_samples_split=4, n_estimators=50) keeps this model in sync with whatever the
# search actually found; random_state makes the fit reproducible.
rfc = RandomForestClassifier(random_state=101, **grid_search.best_params_)
# fit
rfc.fit(X_train_res,y_train_res)

In [None]:
# Evaluation of the tuned model on the held-out test split.
predictions = rfc.predict(X_test)

# Precision / recall / F1 per class for the tuned model.
tuned_report = classification_report(y_test, predictions)
print(tuned_report)