In [16]:
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder

In [17]:
# Read the raw lead and offer CSV files. The paths are relative ("../Data/Raw"),
# so they resolve against the notebook's working directory.
raw_leads = pd.read_csv(filepath_or_buffer="../Data/Raw/leads.csv")
raw_offers = pd.read_csv(filepath_or_buffer="../Data/Raw/offers.csv")

In [18]:
# Some "Id" cells hold the literal text "nan" instead of a real missing value.
# Turn them into NaN (in place, on the raw frames) so that the notna() filters
# below treat them as missing.
raw_leads.loc[raw_leads["Id"].eq("nan"), "Id"] = np.nan
raw_offers.loc[raw_offers["Id"].eq("nan"), "Id"] = np.nan

# Offers: keep a single row per Id (drop_duplicates keeps the first occurrence).
raw_offers_no_duplicates = raw_offers.drop_duplicates(subset=["Id"])

# Leads: discard rows without an Id, then label every remaining missing value
# with the extra category "Not_Specified".
leads_no_ID_nan = raw_leads.loc[raw_leads["Id"].notna()].fillna("Not_Specified")

# Offers: discard rows without an Id, cast every column to text and relabel the
# resulting "nan" strings as "Not_Specified".
offers_no_ID_nan = (
    raw_offers_no_duplicates.loc[raw_offers_no_duplicates["Id"].notna()]
    .astype(str)
    .replace("nan", "Not_Specified")
)


In [19]:
# Shorter working names for the cleaned frames. These are plain assignments, so
# each name refers to the same DataFrame object as its source, not to a copy.
leads_df, offers_df = leads_no_ID_nan, offers_no_ID_nan

In [20]:
# Parse the two date columns of the offers frame into pandas datetimes so that
# they can be subtracted from each other later on.
offers_df["Created Date"] = offers_df["Created Date"].pipe(pd.to_datetime)
offers_df["Close Date"] = offers_df["Close Date"].pipe(pd.to_datetime)

In [21]:
# Duration of the sales process: close date minus creation date (a timedelta column).
offers_df["Days"] = offers_df["Close Date"] - offers_df["Created Date"]

# Discount flag: True when a discount code was applied to the offer.
# Missing codes were replaced by the "Not_Specified" placeholder during cleaning,
# so "a discount was applied" means the code is anything OTHER than that
# placeholder. The comparison must therefore be `!=`; `==` would flag exactly the
# offers that have no discount.
offers_df["Discount"] = offers_df["Discount code"] != "Not_Specified"

In [22]:
# Parse the offers date columns into pandas datetimes.
# NOTE(review): this repeats the conversion performed in an earlier cell; when
# the notebook is run top to bottom the columns should already be datetimes
# here, so this cell looks redundant - confirm and consider removing it.
offers_df["Created Date"] = offers_df["Created Date"].pipe(pd.to_datetime)
offers_df["Close Date"] = offers_df["Close Date"].pipe(pd.to_datetime)

In [23]:
# Length of the sales process as a timedelta: close date minus creation date.
# Assigning the column overwrites any "Days" value computed before.
offers_df["Days"] = offers_df["Close Date"].sub(offers_df["Created Date"])

# True when the offer has a discount code, i.e. the code is something other than
# the "Not_Specified" placeholder used for missing values.
offers_df["Discount"] = offers_df["Discount code"].ne("Not_Specified")

In [24]:
# Drop the columns that are redundant or must not be fed to the model:
#   - "Created Date" / "Close Date": already summarised by the "Days" column.
#   - "Discount code": already summarised by the boolean "Discount" column; the
#     specific code is not considered relevant.
#   - "Loss Reason": carries information about the target variable, so keeping
#     it would let the model overfit on it.
offers_reduced = offers_df.drop(
    columns=["Created Date", "Close Date", "Discount code", "Loss Reason"]
)

In [25]:
# One-hot encode the "Use Case" and "Pain" columns of the offers frame; every
# other column is passed through unchanged.
offers_encoded = pd.get_dummies(
    data=offers_reduced,
    columns=["Use Case", "Pain"],
)

In [26]:
# Price: missing prices carry the "Not_Specified" placeholder. Replace it with
# "0", then convert the text to a number (through float, then truncated to int).
offers_encoded["Price"] = (
    offers_encoded["Price"]
    .replace("Not_Specified", "0")
    .astype(float)
    .astype(int)
)

# Days: keep only the whole-day component of the timedelta.
# NOTE(review): after this assignment "Days" no longer holds timedeltas, so this
# cell presumably cannot be re-run without rebuilding offers_encoded first.
offers_encoded["Days"] = offers_encoded["Days"].dt.days

In [27]:
# Cast the "Converted" column to boolean.
leads_df["Converted"] = leads_df["Converted"].astype("bool")

# Drop the columns that should not be model inputs:
#   - "First Name": not needed by the algorithm.
#   - "Discarded/Nurturing Reason": only known once the lead is already known
#     not to have converted, so it cannot be an input of this model.
leads_reduced = leads_df.drop(columns=["First Name", "Discarded/Nurturing Reason"])

In [28]:
# Binary encoder for the "Acquisition Campaign" and "City" columns.
encoder = BinaryEncoder(cols=["Acquisition Campaign", "City"])

# First one-hot encode "Use Case" and "Source", then fit the binary encoder on
# that result and apply it in the same step.
leads_encoded = encoder.fit_transform(
    pd.get_dummies(leads_reduced, columns=["Use Case", "Source"])
)

In [29]:
# Replace the "Created Date" column by two numeric columns, "month" and "day",
# which are easier to process than a raw date.
leads_encoded = (
    leads_encoded
    # Parse the text dates first so that the .dt accessor is available below.
    .assign(**{"Created Date": lambda frame: pd.to_datetime(frame["Created Date"])})
    .assign(
        month=lambda frame: frame["Created Date"].dt.month,
        day=lambda frame: frame["Created Date"].dt.day,
    )
    # The original date is no longer needed once month and day are extracted.
    .drop(columns=["Created Date"])
)

In [36]:
# Join the encoded leads and offers on their shared "Id" column. No `how` is
# given, so pandas' default inner join applies: only Ids present in both frames
# end up in the result.
merged = leads_encoded.merge(offers_encoded, on="Id")