# Predicting Customer Churn Risk

## Loading the dataset and Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

import bentoml

In [2]:
# Load the raw churn data from a path relative to the notebook's working directory.
churn_csv_path = "Data/churn.csv"
df = pd.read_csv(churn_csv_path)

In [3]:
# Columns removed before modelling:
#   - "Unnamed: 0": has to be dropped
#   - "security_no": holds unique values
#   - "joining_date": the date the data was published is not known
#   - "referral_id": dropped as well (no reason recorded in the notebook)
unused_columns = ["Unnamed: 0", "referral_id", "security_no", "joining_date"]
df = df.drop(columns=unused_columns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           36992 non-null  int64  
 1   gender                        36992 non-null  object 
 2   region_category               31564 non-null  object 
 3   membership_category           36992 non-null  object 
 4   joined_through_referral       36992 non-null  object 
 5   preferred_offer_types         36704 non-null  object 
 6   medium_of_operation           36992 non-null  object 
 7   internet_option               36992 non-null  object 
 8   last_visit_time               36992 non-null  object 
 9   days_since_last_login         36992 non-null  int64  
 10  avg_time_spent                36992 non-null  float64
 11  avg_transaction_value         36992 non-null  float64
 12  avg_frequency_login_days      36992 non-null  object 
 13  p

### Categorical and Numerical Columns

In [4]:
# Object-dtype columns are treated as categorical; every remaining column
# except the target ('churn_risk_score') is treated as numerical.
is_object_dtype = (df.dtypes == "object").to_numpy()
categorical_columns = df.columns[is_object_dtype].tolist()
numerical_columns = [
    name
    for name in df.columns
    if not (name in categorical_columns or name == 'churn_risk_score')
]

In [5]:
# Normalise the text in every categorical variable: lower-case it and
# replace spaces with underscores.
for name in categorical_columns:
    lowered = df[name].str.lower()
    df[name] = lowered.str.replace(" ", "_")

### Missing and Unknown Values

#### Cleaning up `Unknown` datapoints

In [6]:
# Categorical columns with fewer than 20 distinct values.
low_cat = [name for name in categorical_columns if df[name].nunique() < 20]

*joined_through_referral and medium_of_operation have ? in their values*

The question marks will be replaced with null values.

In [7]:
# Turn the '?' placeholder in these two columns into a real missing value.
# np.nan is used (as elsewhere in this notebook) because the np.NaN alias
# was removed in NumPy 2.0 and raises AttributeError there.
for col in ['medium_of_operation', 'joined_through_referral']:
    df[col] = df[col].replace('?', np.nan)

<b>`Gender`</b>

`unknown` will be replaced with null values.

In [8]:
# Treat the "unknown" gender label as a missing value.
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
df["gender"] = df["gender"].replace("unknown", np.nan)

<b>`days_since_last_login`</b>

-999 is not a plausible number of days since the last login, so it will be changed to a null value.

In [9]:
# -999 is a placeholder, not a real day count; convert it to a missing value.
# np.nan replaces the np.NaN alias, which no longer exists in NumPy 2.0.
df["days_since_last_login"] = df["days_since_last_login"].replace(-999, np.nan)

<b>`Average time spent` and `Points in Wallet`</b>

Both columns contain values below 0, which do not make sense in the context of the data, so they will be changed to null values.

In [12]:
# Keep only non-negative values in these columns; anything else becomes NaN.
for name in ('avg_time_spent', 'points_in_wallet'):
    values = df[name]
    df[name] = values.where(values >= 0)

<b>`Average Frequency Login days`</b>

In [13]:
# Convert the login-frequency column to floats. The text 'error' is first
# mapped to the sentinel -1 so the float cast succeeds; afterwards every
# negative value (including that sentinel) is turned into NaN.
login_freq = df['avg_frequency_login_days']
login_freq = login_freq.where(login_freq != 'error', -1)
login_freq = login_freq.astype('float')
df['avg_frequency_login_days'] = login_freq.where(login_freq >= 0)

In [14]:
# Coerce the login-frequency column to a numeric dtype; any value that
# cannot be parsed as a number becomes NaN.
login_freq_values = df["avg_frequency_login_days"]
df["avg_frequency_login_days"] = pd.to_numeric(login_freq_values, errors='coerce')



In [15]:
# Recompute the column groups from the current dtypes: object-dtype columns
# are categorical, and every other column except the target
# ('churn_risk_score') is numerical.
object_mask = (df.dtypes == "object").to_numpy()
categorical_columns = df.columns[object_mask].tolist()
numerical_columns = [
    name
    for name in df.columns
    if not (name in categorical_columns or name == 'churn_risk_score')
]

In [16]:
# Force every numerical column to a numeric dtype in one pass; entries that
# cannot be parsed become NaN.
df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric, errors='coerce')

## Separate dataset into train and test

Following best practice, we split the dataset into training, validation, and test sets.

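Some feature-engineering techniques learn parameters from the data (for example, the mode or median used for imputation). These parameters must be learned from the training set only, so we split the data before engineering features. This avoids over-fitting caused by information leaking from the validation and test sets.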

In [17]:
# Two-stage split: hold out 20% of the rows as the test set, then take 25%
# of the remainder as the validation set (60/20/20 overall). The same seed
# is used for both stages.
split_seed = 1
df_full_train, df_test = train_test_split(
    df, test_size=0.2, random_state=split_seed
)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=split_seed
)

In [18]:
# Replace the row labels of each split with a fresh 0..n-1 index, discarding
# the old labels.
df_train, df_val, df_test = [
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
]


In [19]:
# Take the target column of each split as its own Series.
target_column = "churn_risk_score"
y_train = df_train[target_column]
y_val = df_val[target_column]
y_test = df_test[target_column]

In [20]:
# Remove the target column from each feature frame (in place).
for frame in (df_train, df_val, df_test):
    del frame["churn_risk_score"]

## Feature Engineering

Feature engineering involves transforming or manipulating the data in the columns. Splitting the data before engineering features helps avoid over-fitting.

### Replacing Null Values

<b>`Categorical columns with missing data`</b>

In [21]:
# Categorical columns that contain at least one missing value in `df`.
cat_col_na = [name for name in categorical_columns if df[name].isnull().any()]

In [22]:
# Decide the imputation strategy per categorical column from its share of
# missing values in `df`. The two lists now partition cat_col_na: the
# original `> 0.1` / `< 0.1` pair left a column with exactly 10% missing in
# neither list, so it would never have been imputed.
missing_share_cutoff = 0.1
missing_share = {col: df[col].isnull().mean() for col in cat_col_na}

# variables to impute with the string missing
with_string_missing = [
    col for col in cat_col_na if missing_share[col] > missing_share_cutoff]

# variables to impute with the most frequent category
with_frequent_category = [
    col for col in cat_col_na if missing_share[col] <= missing_share_cutoff]

In [23]:
# Fill the gaps in the selected categorical columns with the explicit label
# 'missing', applying the same step to every split.
for frame in (df_full_train, df_train, df_val, df_test):
    frame[with_string_missing] = frame[with_string_missing].fillna('missing')


In [24]:
# Impute the remaining categorical columns with the most frequent category.
# The mode is learned from the training split only; if several values tie,
# the first one returned by .mode() is used.
for col in with_frequent_category:
    mode = df_train[col].mode()[0]

    # Assign the filled column back instead of calling
    # `frame[col].fillna(mode, inplace=True)`: that form fills a temporary
    # Series, triggers a chained-assignment warning in recent pandas, and
    # leaves the frame unchanged under Copy-on-Write.
    for frame in (df_full_train, df_train, df_val, df_test):
        frame[col] = frame[col].fillna(mode)


<b>`Numerical columns with missing data`</b>

In [25]:
# Numerical columns that contain at least one missing value in `df`.
num_with_na = [name for name in numerical_columns if df[name].isnull().any()]

In [26]:
# Impute numerical columns with the median learned from the training split
# only, then apply that same value to every split.
for col in numerical_columns:
    median = df_train[col].median()

    # Assign the filled column back instead of calling
    # `frame[col].fillna(median, inplace=True)`: that form fills a temporary
    # Series, triggers a chained-assignment warning in recent pandas, and
    # leaves the frame unchanged under Copy-on-Write.
    for frame in (df_full_train, df_train, df_val, df_test):
        frame[col] = frame[col].fillna(median)

## Using the best model - Decision Tree

The project_EDA file compares the different candidate models. The best-performing one, a decision tree, was picked and is used here.

### Training the model

In [30]:
# Fit a depth-5 decision tree on the training features.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn randomly permutes the features at each split, so an unseeded
# tree can differ from run to run.
# NOTE(review): X_train is not defined in the cells visible here — presumably
# it is built from df_train in an earlier cell; confirm on a fresh kernel.
dt = DecisionTreeClassifier(max_depth=5, random_state=1)
dt.fit(X_train, y_train)

## BENTOML

In [140]:
# Save the fitted classifier with BentoML under the name 'decision_tree' and
# keep the value returned by save_model in `tag`.
# `dv` is stored alongside the model as the custom object "dictVectorizer"
# (presumably the fitted DictVectorizer from a cell not shown here — confirm).
# predict_proba is registered as a batchable signature with batch_dim 0.
predict_proba_signature = {"batchable": True, "batch_dim": 0}
tag = bentoml.sklearn.save_model(
    'decision_tree',
    dt,
    custom_objects={"dictVectorizer": dv},
    signatures={"predict_proba": predict_proba_signature},
)

In [141]:
# Print the value returned by save_model; the captured output shows the
# model's tag and its path in the local BentoML model store.
print(tag)

Model(tag="decision_tree:no7cy7s5w6cpvshc", path="C:\Users\eddye\bentoml\models\decision_tree\no7cy7s5w6cpvshc\")