## Create Sample GLM Model with Sample Dataset

In [1]:
# Import libraries:
import pandas as pd
import numpy as np
from math import log
import pickle as pk

# Let wide frames display up to 200 columns before pandas truncates them:
pd.options.display.max_columns = 200

In [2]:
# Load the raw sample dataset from the Excel workbook; `data0` is kept as the untouched original:
data0 = pd.read_excel(io = 'sample_training_data.xlsx')

### Data transformations

In [3]:
def _log1p_positive(x):
    """Return log(x + 1) for x > 0, and 0 for zero or negative x."""
    return log(x + 1) if x > 0 else 0


# Work on a copy so the raw frame `data0` stays untouched and this cell can be re-run safely:
data = data0.copy()

# Create date-length variable: whole days elapsed between date1 and date2.
data['date1'] = pd.to_datetime(data['date1'])
data['date2'] = pd.to_datetime(data['date2'])
# Vectorized accessor instead of a per-row lambda; a missing date yields NaN here.
data['date_diff'] = (data['date2'] - data['date1']).dt.days

# Create new cat3_1D variable (simulates SIC-1D): first character of cat3's string form.
# Missing cat3 values are skipped (na_action) and stay NaN.
data['cat3_1D'] = data['cat3'].map(lambda x: str(x)[0], na_action = 'ignore')

# Handle dummy_rating: 1 when the rating contains 'A', -1 otherwise, NaN when it is missing.
# The second np.where is required because str.contains returns NaN for missing ratings
# and np.where treats NaN as truthy.
rating = data['dummy_rating']
rating_flag = np.where(rating.str.contains('A'), 1, -1)
data['dummy_rating_cat'] = np.where(pd.isnull(rating), np.nan, rating_flag)

# Handle num1 and num5:
# num1 is re-based by subtracting 1900 (presumably a year -- TODO confirm).
data['num1'] = data['num1'] - 1900
# num5 and date_diff share the same transform: log(x + 1) for positive values, 0 otherwise;
# missing values are skipped and stay NaN.
data['num5'] = data['num5'].map(_log1p_positive, na_action = 'ignore')
data['log_date_diff'] = data['date_diff'].map(_log1p_positive, na_action = 'ignore')

### Build GLM

In [4]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

  from pandas.core import datetools


In [5]:
# Narrow the transformed frame to the response plus the GLM predictors (as an independent copy):
data_glm = data.loc[:, [
    'target_var',
    'num1', 'num2', 'num5',
    'binary1', 'binary2',
    'log_date_diff',
    'cat1', 'cat3_1D', 'dummy_rating_cat',
]].copy()

In [6]:
# Model build: Poisson GLM fitted through the formula interface.
# - No sm.add_constant() call is needed: the formula adds the intercept itself
#   (it is reported as `Intercept` in the summary table).
# - No explicit link is passed: log is the default link of the Poisson family, and
#   omitting it avoids the lowercase `links.log` class alias, which newer statsmodels
#   releases deprecate in favour of link instances.
# NOTE(review): the summary recorded with this cell shows std err around 1e+05 for the
# Intercept and C(cat1) terms and 25 IRLS iterations -- possibly (quasi-)separation or a
# sparse cat1 level; confirm before relying on those coefficients.
model = smf.glm("target_var ~ num1 + num2 + num5 + binary1 + binary2 + log_date_diff + C(cat1) + C(cat3_1D) + C(dummy_rating_cat)",
                family = sm.families.Poisson(), data = data_glm).fit()
model.summary()

0,1,2,3
Dep. Variable:,target_var,No. Observations:,1950.0
Model:,GLM,Df Residuals:,1932.0
Model Family:,Poisson,Df Model:,17.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2554.8
Date:,"Tue, 10 Mar 2020",Deviance:,3832.9
Time:,20:02:48,Pearson chi2:,6920.0
No. Iterations:,25,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-13.5924,8.02e+04,-0.000,1.000,-1.57e+05,1.57e+05
C(cat1)[T.N],0.1679,1.61e+05,1.04e-06,1.000,-3.16e+05,3.16e+05
C(cat1)[T.P],23.9431,1.6e+05,0.000,1.000,-3.14e+05,3.14e+05
C(cat1)[T.R],25.7117,1.6e+05,0.000,1.000,-3.14e+05,3.14e+05
C(cat3_1D)[T.2],0.0927,0.167,0.555,0.579,-0.235,0.420
C(cat3_1D)[T.3],0.1230,0.177,0.694,0.488,-0.225,0.471
C(cat3_1D)[T.4],0.1232,0.190,0.649,0.516,-0.249,0.495
C(cat3_1D)[T.5],-0.0417,0.148,-0.282,0.778,-0.332,0.248
C(cat3_1D)[T.6],0.1761,0.147,1.196,0.232,-0.112,0.465


In [7]:
# Persist the fitted GLM results object to disk as a pickle:
model_path = 'Sample GLM Model/sample_glm.pkl'
with open(model_path, 'wb') as out_file:
    pk.dump(model, out_file)

### Try a prediction

In [8]:
# Predictor columns, in the order used for scoring:
col_order = [
    'num1', 'num2', 'num5',
    'binary1', 'binary2',
    'log_date_diff',
    'cat1', 'cat3_1D', 'dummy_rating_cat',
]

# Take the row at position 3 as a one-row frame, restricted to the predictor columns:
rec = data_glm.iloc[3:4][col_order]

In [9]:
# Score the single record with the fitted model.
# NOTE(review): the model was fitted from a formula, so the constant requested here is
# presumably not needed for prediction -- confirm before removing.
rec_glm = sm.add_constant(rec)
model.predict(exog = rec_glm)

3    0.548093
dtype: float64

In [12]:
# Show the same record as it appears in the raw, untransformed data (list with one dict):
data0.iloc[3:4].to_dict(orient = 'records')

[{'binary1': 1,
  'binary2': 1,
  'binary3': 1,
  'cat1': 'R',
  'cat2': 'PP',
  'cat3': 7389.0,
  'date1': Timestamp('2018-02-12 00:00:00'),
  'date2': Timestamp('2019-02-12 00:00:00'),
  'dummy_name': '1Tv!5Ns+',
  'dummy_rating': 'A1',
  'num1': 2011.0,
  'num2': 21.0,
  'num3': 528.0,
  'num4': 80.0,
  'num5': 1431.0,
  'pca1': 1.2,
  'pca10': 1.1,
  'pca2': 1.94,
  'pca3': 1.88,
  'pca4': 1.61,
  'pca5': 1.72,
  'pca6': 1.79,
  'pca7': 1.62,
  'pca8': 1.24,
  'pca9': 1.44,
  'row_id': 4,
  'target_var': 0}]