# Monthly Buyer Likelihood model
___
Goal: To see if we can understand the variable list better and reduce the error rate for this problem statement

Methodology for First Pass:

Create an elastic net linear regression to help pare down the variable list that we start with and to establish a good baseline error rate.
Run the field list through a logistic regression to set up baseline error rate to compare adding variables to.
Add in the variables 1 by 1, order all additive variables by how much they reduce the RMSE.


# Functions

In [1]:
import numpy as np
import pandas as pd

def _categorical_columns(df):
    """Return the names of the object- or category-typed columns of *df*."""
    return list(df.select_dtypes(include=['object', 'category']).columns)


def _column_levels(series):
    """Return the set of levels held by a category- or object-typed column."""
    # Category columns expose their declared levels; plain object columns
    # only have the values that actually occur.
    if hasattr(series, 'cat'):
        return set(series.cat.categories)
    return set(series.dropna().unique())


def cat_variable_level_check(trainx, testx):
    """Check that train and test frames agree on columns and categorical levels.

    Three checks run in order, and the first failure prints a message and
    returns ``False``:

    1. both frames have the same set of object/category columns,
    2. every such column has the same set of levels in both frames,
    3. both frames have the same set of columns overall.

    Args:
        trainx: Training feature DataFrame.
        testx: Test feature DataFrame.

    Returns:
        ``True`` when all three checks pass, ``False`` otherwise.
    """
    train_level_cols = _categorical_columns(trainx)
    # Each frame's categorical columns are derived from its own dtypes.
    test_level_cols = _categorical_columns(testx)

    if set(train_level_cols) != set(test_level_cols):
        print("The Categorical Variable set does not match, please check that the columns are the same")
        return False

    print("Categorical variable columns match, checking for missing levels within the train data set")

    # Every categorical column is inspected; only a mismatch ends the loop early.
    for val in test_level_cols:
        if _column_levels(trainx[val]) != _column_levels(testx[val]):
            print("We are missing some levels either the test or train set for the column:", val, " Please fix before continuing.")
            return False

    print("All values match for the train and test categorical columns. Move to all column matching")

    if set(trainx.columns) != set(testx.columns):
        print("The columns in the train data set and test data set do not match, please go check.")
        return False

    print("All columns and levels match in the two data sets.")
    return True

ModuleNotFoundError: No module named 'pandas'

In [2]:
def normalize_dataset(trainx, testx):
    """Z-score the int64/float64 columns of both frames using train statistics.

    Each ``int64`` or ``float64`` column of ``trainx`` is centred on the
    training mean and divided by the training standard deviation; the same
    training mean and standard deviation are applied to ``testx`` so no
    information from the test set leaks into the scaling. Columns of any
    other dtype, and columns whose training standard deviation is zero or
    undefined, are passed through unchanged. The inputs are not modified.

    Args:
        trainx: Training feature DataFrame.
        testx: Test feature DataFrame with the same set of columns.

    Returns:
        Tuple ``(norm_trainx, norm_testx)`` of new DataFrames.

    Raises:
        ValueError: If the two frames do not have the same set of columns.
    """
    if set(trainx.columns) != set(testx.columns):
        raise ValueError(
            "Columns are not the same between train and test dataset, please make sure the columns match"
        )

    # Start from copies so untouched columns carry over as-is.
    norm_trainx = trainx.copy()
    norm_testx = testx.copy()

    for col_name, datatype in trainx.dtypes.items():
        if not (datatype == 'int64' or datatype == 'float64'):
            continue

        mean_train = trainx[col_name].mean()
        std_train = trainx[col_name].std()

        # A constant (or single-row) column cannot be scaled; leave it raw
        # instead of dividing by zero / NaN.
        if pd.isna(std_train) or std_train == 0:
            continue

        norm_trainx[col_name] = (trainx[col_name] - mean_train) / std_train
        norm_testx[col_name] = (testx[col_name] - mean_train) / std_train

    return norm_trainx, norm_testx

In [3]:
import pickle

def logreg_create_model_image(trainx, trainy, save, model_name='None', l1_ratio=0.5,
                              model_dir='/Users/agoyal/Documents/Projects/model_images/'):
    """Fit an elastic-net logistic regression and optionally pickle it to disk.

    The features are one-hot encoded with ``pd.get_dummies`` before fitting.
    ``LogisticRegression`` must be in scope when this function is called.

    Args:
        trainx: Training feature DataFrame.
        trainy: Training target (Series, array, or single-column DataFrame).
        save: When truthy, the fitted model is pickled to
            ``model_dir`` + ``model_name``.
        model_name: File name for the pickled model.
        l1_ratio: Elastic-net mixing parameter passed to the estimator.
        model_dir: Directory the pickle is written to.

    Returns:
        None. The confirmation message is printed only when a file was written.
    """
    import os

    # scikit-learn only supports the elastic-net penalty with the 'saga'
    # solver, and it requires an explicit l1_ratio.
    logreg = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        l1_ratio=l1_ratio,
        max_iter=1000
    )
    ohe_trainx = pd.get_dummies(trainx)
    # Flatten the target so a single-column DataFrame is accepted as 1-D.
    logreg.fit(ohe_trainx, np.ravel(trainy))

    if save:
        filename = os.path.join(model_dir, model_name)
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as file:
            pickle.dump(logreg, file)
        print("file saved: model name is " + model_name)

    return None

In [4]:
from sklearn.preprocessing import normalize
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import math

def logreg_coefs_and_test_rmse(trainx, trainy, testx, testy):
    """Fit an L2 logistic regression and score it on the test set by RMSE.

    Numeric features are normalized with ``normalize_dataset`` (train
    statistics only), both frames are one-hot encoded, and the test encoding
    is aligned to the training columns so the model always sees the same
    feature layout.

    Args:
        trainx: Training feature DataFrame.
        trainy: Training target (Series, array, or single-column DataFrame).
        testx: Test feature DataFrame.
        testy: Test target; must convert to a single-column DataFrame.

    Returns:
        Tuple ``(coeffs, lm_rmse, pred_df)``:
        ``coeffs`` is a DataFrame with columns ``Feature``, ``estCoeff`` and
        ``Magnitude`` sorted by descending magnitude; ``lm_rmse`` is the root
        mean squared error between the predicted probability of the second
        class and the actual target; ``pred_df`` holds ``act_y``, ``pred_y``
        and ``error_sq`` per test row.
    """
    norm_trainx, norm_testx = normalize_dataset(trainx, testx)

    # Fit on the normalized features so the L2 penalty treats them on a
    # comparable scale.
    ohe_trainx = pd.get_dummies(norm_trainx)
    # Levels missing from the test split would otherwise drop or reorder
    # columns; fill absent dummies with 0.
    ohe_testx = pd.get_dummies(norm_testx).reindex(columns=ohe_trainx.columns, fill_value=0)

    logreg = LogisticRegression(
        penalty='l2',
        max_iter=1000
    )
    # Flatten the target so a single-column DataFrame is accepted as 1-D.
    logreg.fit(ohe_trainx, np.ravel(trainy))

    est_coeff = logreg.coef_[0]
    coeff_output = {
        "Feature": ohe_trainx.columns,
        "estCoeff": est_coeff.tolist(),
        "Magnitude": abs(est_coeff).tolist(),
    }
    coeffs = pd.DataFrame(coeff_output).sort_values("Magnitude", ascending=False)

    pred_df = pd.DataFrame(testy).reset_index(drop=True)
    pred_df.columns = ['act_y']

    # Column 1 of predict_proba is the probability of the second class.
    pred_df['pred_y'] = logreg.predict_proba(ohe_testx)[:, 1]

    error = pred_df['pred_y'] - pred_df['act_y']
    pred_df['error_sq'] = error * error

    lm_rmse = math.sqrt(pred_df['error_sq'].sum() / pred_df['error_sq'].size)

    return coeffs, lm_rmse, pred_df

ModuleNotFoundError: No module named 'sklearn'

In [5]:
from sklearn.linear_model import ElasticNet
import math

def elastic_net_coefs_and_test_rmse(trainx, trainy, testx, testy, alpha = .5, l1_ratio = .5, max_iter = 1000, tol = .0001):
    """Fit an elastic-net linear regression and score it on the test set by RMSE.

    Numeric features are normalized with ``normalize_dataset`` (train
    statistics only), both frames are one-hot encoded, and the test encoding
    is aligned to the training columns.

    Args:
        trainx: Training feature DataFrame.
        trainy: Training target (Series, array, or single-column DataFrame).
        testx: Test feature DataFrame.
        testy: Test target; must convert to a single-column DataFrame.
        alpha: Penalty strength passed to ``ElasticNet``.
        l1_ratio: L1/L2 mixing parameter passed to ``ElasticNet``.
        max_iter: Maximum number of solver iterations.
        tol: Solver tolerance.

    Returns:
        Tuple ``(coeffs, en_rmse, pred_df)``:
        ``coeffs`` is a DataFrame with columns ``Feature``, ``estCoeff`` and
        ``Magnitude`` sorted by descending magnitude; ``en_rmse`` is the root
        mean squared error on the test set; ``pred_df`` holds ``act_y``,
        ``pred_y`` and ``error_sq`` per test row. ``pred_y`` is the raw
        linear prediction, so it is not bounded to [0, 1].
    """
    norm_trainx, norm_testx = normalize_dataset(trainx, testx)

    # The `normalize` keyword is not passed: it was removed from ElasticNet
    # in scikit-learn 1.2, and scaling is handled by normalize_dataset above.
    regr = ElasticNet(
        alpha = alpha,
        random_state = 0,
        copy_X = True,
        fit_intercept = True,
        l1_ratio = l1_ratio,
        max_iter = max_iter,
        positive = False,
        precompute = False,
        selection = 'cyclic',
        tol = tol,
        warm_start = False
    )

    # Fit on the normalized features so the penalty treats them on a
    # comparable scale.
    ohe_trainx = pd.get_dummies(norm_trainx)
    # Levels missing from the test split would otherwise drop or reorder
    # columns; fill absent dummies with 0.
    ohe_testx = pd.get_dummies(norm_testx).reindex(columns=ohe_trainx.columns, fill_value=0)

    # Flatten the target so a single-column DataFrame is accepted as 1-D.
    regr.fit(ohe_trainx, np.ravel(trainy))

    est_coeff = np.ravel(regr.coef_)
    coeff_output = {
        "Feature": ohe_trainx.columns,
        "estCoeff": est_coeff.tolist(),
        "Magnitude": abs(est_coeff).tolist(),
    }
    coeffs = pd.DataFrame(coeff_output).sort_values("Magnitude", ascending=False)

    pred_df = pd.DataFrame(testy).reset_index(drop=True)
    pred_df.columns = ['act_y']

    # ElasticNet is a regressor: it exposes predict(), not predict_proba().
    pred_df['pred_y'] = np.ravel(regr.predict(ohe_testx))

    error = pred_df['pred_y'] - pred_df['act_y']
    pred_df['error_sq'] = error * error

    en_rmse = math.sqrt(pred_df['error_sq'].sum() / pred_df['error_sq'].size)

    return coeffs, en_rmse, pred_df

ModuleNotFoundError: No module named 'sklearn'

# Code to use these functions

```
# Time-based split on `current_month`: rows before the cutoff train the
# model, rows on or after it are held out for testing.
cutoff_month = '2020-06-01'
pl_df_train = pl_df.loc[pl_df['current_month'] < cutoff_month]
pl_df_test = pl_df.loc[pl_df['current_month'] >= cutoff_month]
```

## Step 1: Variable list for model

We will add all fields to the elastic net model to whittle down the field list.
Once we have the fields that seem to make the most difference, we can start there and add fields in a more controlled manner to see what actually decreases the RMSE / improves the AUC and the test-set predictions.


In [6]:
# Candidate predictors, assembled group by group; the concatenation order is
# the column order handed to the models.
# NOTE(review): the `pm_` / `py_` prefixes presumably denote prior-month and
# prior-year versions of the same measure -- confirm against the data source.
field_list = (
    # buyer attributes, tenure and recency
    ['growth_segment', 'gender_class', 'mon_cur',
     'months_since_join', 'months_since_last_purchase']
    # sellers bought from
    + ['sellers_bought_from', 'pm_sellers_bought_from', 'py_sellers_bought_from']
    # offers
    + ['offers', 'pm_offers', 'py_offers',
       'comp_offers', 'pm_comp_offers', 'py_comp_offers']
    # orders and gmv
    + ['orders', 'pm_orders', 'py_orders',
       'gmv', 'pm_gmv', 'py_gmv']
    # browsing
    + ['listings_viewed', 'pm_listings_viewed',
       'brand_browses', 'pm_brand_browses']
    # blocks and follows
    + ['blocked_by', 'pm_blocked_by', 'blocked', 'pm_blocked',
       'users_followed', 'pm_users_followed']
    # activity and engagement
    + ['days_active', 'pm_days_active', 'likes', 'pm_likes',
       'comments', 'pm_comments']
)

# Target column name, wrapped in a list.
y_var = ['y_var']

### Split data into train and test

```
# Separate predictors (field_list) from the target (y_var) for each split.
pl_df_train_x, pl_df_train_y = pl_df_train[field_list], pl_df_train[y_var]
pl_df_test_x, pl_df_test_y = pl_df_test[field_list], pl_df_test[y_var]
```


### Run first Elastic Net to get base model

```
from sklearn.linear_model import ElasticNet
import math

# One-hot encode the categorical predictors before fitting.
ohe_train_x = pd.get_dummies(pl_df_train_x)

# Base elastic-net model used to screen the variable list.
# The `normalize` keyword is not passed: it was removed from ElasticNet in
# scikit-learn 1.2 and passing it raises a TypeError on current versions.
regr = ElasticNet(
    alpha = .5,
    random_state = 0,
    copy_X = True,
    fit_intercept = True,
    l1_ratio = .5,
    max_iter = 1000,
    positive = False,
    precompute = False,
    selection = 'cyclic',
    tol = .0001,
    warm_start = False
)

regr.fit(ohe_train_x, pl_df_train_y)
```

### Sort coefficients for the model
```
# One row per one-hot-encoded feature, with its coefficient and absolute size.
coeff_output = {"Feature": ohe_train_x.columns, "estCoeff": regr.coef_.tolist(), "Magnitude": abs(regr.coef_).tolist()}

coeffs = pd.DataFrame(coeff_output).sort_values("Magnitude", ascending=False)
# Keep every feature the elastic net did not shrink to zero. Filtering on
# `estCoeff > 0` would also discard predictors with a negative coefficient,
# which are just as informative as positive ones.
coeffs = coeffs[coeffs['Magnitude'] > 0]


coeffs['Feature'].tolist()
```

## Step 2: First Iteration of Model
___
This should increase the efficacy of the base model and make it easier to understand what variables will actually reduce the error rate over time

### take the first logistic regression based on the base variables

```
# Features that survived the elastic-net screen become the baseline field list.
logreg_field_list = coeffs['Feature'].to_list()

# Predictors and target for each split, restricted to the baseline fields.
pl_df_train_x, pl_df_train_y = pl_df_train[logreg_field_list], pl_df_train[y_var]
pl_df_test_x, pl_df_test_y = pl_df_test[logreg_field_list], pl_df_test[y_var]

coefs, rmse, pred_df = logreg_coefs_and_test_rmse(
    pl_df_train_x, pl_df_train_y, pl_df_test_x, pl_df_test_y
)

# Rank test rows from highest to lowest predicted value, then cut the rank
# into ten equal-sized buckets (the first bucket holds the top predictions).
pred_df = pred_df.sort_values("pred_y", ascending=False).reset_index(drop=True)
pred_df['index'] = pred_df.index
pred_df['Quantile'] = pd.qcut(pred_df['index'], q=10)

# Actual vs. predicted totals per bucket.
pred_df.groupby('Quantile')[['act_y', 'pred_y']].sum().reset_index(drop=True)
```

## Step 3: Test addition of other variables

### Set up for running the loop
```
all_cols = pl_df_train.columns.tolist()

# Candidate columns: everything not already in the baseline model and not a
# target / date column. A set makes each membership test O(1).
excluded_cols = set(logreg_field_list) | {'y_var', 'current_month', 'y2_var'}
add_cols = [x for x in all_cols if x not in excluded_cols]

# RMSE of the baseline model; every candidate is compared against it.
base_rmse = rmse

# Results table seeded with the baseline row. It is built directly from a
# list of records because DataFrame.append was removed in pandas 2.0.
tb_results = pd.DataFrame(
    [
        {
            'feature': 'base',
            'rmse': base_rmse,
            'diff_rmse': 0.0
        }
    ],
    columns=[
        'feature',
        'rmse',
        'diff_rmse'
    ]
)
```

### Loop to test each additional variable

```
# The targets do not depend on the candidate variable, so select them once.
pl_df_train_y = pl_df_train[y_var]
pl_df_test_y = pl_df_test[y_var]

# Collect one record per candidate and add them to the table in a single
# concat afterwards: DataFrame.append was removed in pandas 2.0, and growing
# a DataFrame row by row copies the whole table on every iteration.
new_rows = []

for i in add_cols:
    print("adding the following variable:", i)
    # Baseline fields plus exactly one candidate; `+` builds a new list, so
    # logreg_field_list itself is never modified.
    model_field_list = logreg_field_list + [i]

    pl_df_train_x = pl_df_train[model_field_list]
    pl_df_test_x = pl_df_test[model_field_list]

    coefs, rmse, pred_df = logreg_coefs_and_test_rmse(pl_df_train_x, pl_df_train_y, pl_df_test_x, pl_df_test_y)

    new_rows.append(
        {
            'feature': i,
            'rmse': rmse,
            # Positive means the candidate lowered the RMSE versus the baseline.
            'diff_rmse': base_rmse - rmse
        }
    )

    print("appended result for the following variable:", i)

if new_rows:
    tb_results = pd.concat(
        [tb_results, pd.DataFrame(new_rows, columns=tb_results.columns)],
        ignore_index=True
    )
```

### See results:

```
# Largest RMSE reduction first.
tb_results = tb_results.sort_values('diff_rmse', ascending=False)

tb_results
```

## Step 4: Add in additional Variable list
___
Goal: Add in a list of variables that improve the base model RMSE

* This list of variables we can pull from the initial list
* any feature which has a positive diff RMSE should be added

### Run the model with all the variables to see if that helps:

```
# Candidates that lowered the RMSE on their own. Summary rows ('base',
# 'all_vars') and the alternate target are excluded so that re-running this
# cell never tries to select a non-existent column.
improves_rmse = (tb_results['diff_rmse'] > 0) & ~tb_results['feature'].isin(['base', 'all_vars', 'y2_var'])
new_var_list = tb_results.loc[improves_rmse, 'feature'].to_list()

# Baseline fields plus every helpful candidate; `+` builds a new list.
model_field_list = logreg_field_list + new_var_list

pl_df_train_x = pl_df_train[model_field_list]
pl_df_train_y = pl_df_train[y_var]

pl_df_test_x = pl_df_test[model_field_list]
pl_df_test_y = pl_df_test[y_var]


coefs, rmse, pred_df = logreg_coefs_and_test_rmse(pl_df_train_x, pl_df_train_y, pl_df_test_x, pl_df_test_y)

# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
all_vars_row = pd.DataFrame(
    [
        {
            'feature': "all_vars",
            'rmse': rmse,
            'diff_rmse': base_rmse - rmse
        }
    ],
    columns=tb_results.columns
)
tb_results = pd.concat([tb_results, all_vars_row], ignore_index=True)

tb_results = tb_results.sort_values(by='diff_rmse', ascending=False)
# Show the combined-model row (boolean mask on the 'feature' column).
tb_results[tb_results['feature'] == 'all_vars']

```

### Look at the quantile plot for all variables:

```

# Rank test rows from highest to lowest predicted value, then cut the rank
# into ten equal-sized buckets (the first bucket holds the top predictions).
pred_df = pred_df.sort_values("pred_y", ascending=False).reset_index(drop=True)
pred_df['index'] = pred_df.index
pred_df['Quantile'] = pd.qcut(pred_df['index'], q=10)

# Actual vs. predicted totals per bucket.
pred_df.groupby('Quantile')[['act_y', 'pred_y']].sum().reset_index(drop=True)
```