In [228]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [103]:
data = pd.read_excel('data/Lapse_statistics.xlsx')

In [104]:
%%bash
ls data

Freq_statistics.xlsx
Lapse_statistics.xlsx
Sev_statistics.xlsx


In [105]:
data.head()

Unnamed: 0,Age,Area,Tenure,Premium_category,Lapses
0,31,2,114,4,0
1,18,2,10,6,1
2,62,2,136,1,0
3,45,2,48,5,0
4,22,2,58,6,1


In [106]:
data.describe()

Unnamed: 0,Age,Area,Tenure,Premium_category,Lapses
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,43.6738,1.518,70.7106,5.5516,0.357
std,14.919394,0.523575,41.388228,2.896449,0.479163
min,18.0,1.0,0.0,1.0,0.0
25%,31.0,1.0,35.0,3.0,0.0
50%,44.0,2.0,70.0,6.0,0.0
75%,56.0,2.0,106.0,8.0,1.0
max,70.0,3.0,143.0,10.0,1.0


In [107]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
Age                 5000 non-null int64
Area                5000 non-null int64
Tenure              5000 non-null int64
Premium_category    5000 non-null int64
Lapses              5000 non-null int64
dtypes: int64(5)
memory usage: 195.4 KB


In [108]:
y = data.Lapses
X = data.drop(['Lapses'], axis = 1)

In [109]:
C_space = np.geomspace(1e-4, 1e4, 9)


#for C in C_space:
model = LogisticRegressionCV(Cs = C_space, cv = 5, solver = 'liblinear', random_state = 57)
model.fit(X, y)
print(np.mean(model.scores_[1], axis= 0))

[0.6454 0.6518 0.6562 0.6586 0.66   0.6606 0.6608 0.6608 0.6608]


In [110]:
model = LogisticRegression(C = 1e2, solver = 'liblinear', random_state = 57, max_iter = 1e4)
model.fit(X, y)
print(model.intercept_, model.coef_)

[-0.85036654] [[ 0.00240754  0.0032532  -0.01476999  0.19859524]]


In [111]:
from scipy import stats
dataset = y - model.predict(X)
stats.describe(dataset)

DescribeResult(nobs=5000, minmax=(-1, 1), mean=0.1286, variance=0.3213263052610522, skewness=0.013895906956225506, kurtosis=-0.05116893685410284)

In [229]:
y = data.Lapses
X = data.drop(['Lapses'], axis = 1)

In [230]:
X.head()

Unnamed: 0,Age,Area,Tenure,Premium_category
0,31,2,114,4
1,18,2,10,6
2,62,2,136,1
3,45,2,48,5
4,22,2,58,6


In [231]:
cat_f = ['Area', 'Premium_category']
for cat in cat_f:
    for val in X[cat].value_counts().index:
        X[cat+'_'+str(val)] = X[cat].map(lambda x: 1 if x==val else 0)
    X = X.drop([cat], axis = 1)

In [232]:
X.head()

Unnamed: 0,Age,Tenure,Area_1,Area_2,Area_3,Premium_category_10,Premium_category_7,Premium_category_1,Premium_category_2,Premium_category_8,Premium_category_9,Premium_category_4,Premium_category_5,Premium_category_3,Premium_category_6
0,31,114,0,1,0,0,0,0,0,0,0,1,0,0,0
1,18,10,0,1,0,0,0,0,0,0,0,0,0,0,1
2,62,136,0,1,0,0,0,1,0,0,0,0,0,0,0
3,45,48,0,1,0,0,0,0,0,0,0,0,1,0,0
4,22,58,0,1,0,0,0,0,0,0,0,0,0,0,1


In [233]:
columns = X.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=columns)
X.head()

Unnamed: 0,Age,Tenure,Area_1,Area_2,Area_3,Premium_category_10,Premium_category_7,Premium_category_1,Premium_category_2,Premium_category_8,Premium_category_9,Premium_category_4,Premium_category_5,Premium_category_3,Premium_category_6
0,0.25,0.797203,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.06993,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.846154,0.951049,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.519231,0.335664,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.076923,0.405594,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [234]:
X_pol = X.copy()
for a in X:
    for b in X:
        if not b+'*'+a in X_pol:
            X_pol[a+'*'+b] = X[a] * X[b]
            if a!=b:
                X_pol[a+'/'+b] = X[a] / (X[b] + 1e-7)

In [235]:
X_pol.head()

Unnamed: 0,Age,Tenure,Area_1,Area_2,Area_3,Premium_category_10,Premium_category_7,Premium_category_1,Premium_category_2,Premium_category_8,...,Premium_category_4/Premium_category_6,Premium_category_5*Premium_category_5,Premium_category_5*Premium_category_3,Premium_category_5/Premium_category_3,Premium_category_5*Premium_category_6,Premium_category_5/Premium_category_6,Premium_category_3*Premium_category_3,Premium_category_3*Premium_category_6,Premium_category_3/Premium_category_6,Premium_category_6*Premium_category_6
0,0.25,0.797203,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.06993,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.846154,0.951049,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.519231,0.335664,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,10000000.0,0.0,10000000.0,0.0,0.0,0.0,0.0
4,0.076923,0.405594,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [236]:
import statsmodels.api as sm
X_sm = sm.add_constant(X)

gamma_model = sm.GLM(y, X_sm, family=sm.families.Binomial())

  return ptp(axis=axis, out=out, **kwargs)


In [237]:
binom_results = gamma_model.fit()

In [238]:
binom_results.summary()

0,1,2,3
Dep. Variable:,Lapses,No. Observations:,5000
Model:,GLM,Df Residuals:,4986
Model Family:,Binomial,Df Model:,13
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2871.8
Date:,"Thu, 13 Jun 2019",Deviance:,5743.6
Time:,13:36:55,Pearson chi2:,4.71e+03
No. Iterations:,10,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2436,0.086,2.847,0.004,0.076,0.411
Age,0.1334,0.112,1.192,0.233,-0.086,0.353
Tenure,-2.1342,0.116,-18.443,0.000,-2.361,-1.907
Area_1,0.0056,0.084,0.067,0.947,-0.158,0.169
Area_2,-0.0005,0.083,-0.006,0.996,-0.162,0.162
Area_3,0.2385,0.214,1.116,0.264,-0.180,0.657
Premium_category_10,0.3464,0.089,3.908,0.000,0.173,0.520
Premium_category_7,0.5189,0.088,5.892,0.000,0.346,0.692
Premium_category_1,-1.2393,0.124,-10.023,0.000,-1.482,-0.997


In [239]:
binom_results.aic

5771.585918029552

In [240]:
binom_results.bic

-36723.13933437181

In [169]:
X_small = X_sm[['const', 'Tenure', 'Premium_category', 'Tenure/Premium_category',
                'Premium_category*Premium_category']]

In [227]:
binom_results.predict(X_sm)

0       0.181459
1       0.641512
2       0.051509
3       0.460432
4       0.468992
5       0.638738
6       0.371074
7       0.332313
8       0.286388
9       0.511185
10      0.295019
11      0.124422
12      0.362723
13      0.557458
14      0.472559
15      0.319784
16      0.110641
17      0.608205
18      0.239123
19      0.327612
20      0.646640
21      0.173691
22      0.679392
23      0.267368
24      0.575502
25      0.477419
26      0.353901
27      0.551976
28      0.189997
29      0.512761
          ...   
4970    0.453124
4971    0.354229
4972    0.335704
4973    0.661130
4974    0.670855
4975    0.245824
4976    0.065992
4977    0.575811
4978    0.562574
4979    0.693427
4980    0.415858
4981    0.208325
4982    0.244275
4983    0.264114
4984    0.152206
4985    0.398244
4986    0.649751
4987    0.156072
4988    0.114770
4989    0.506868
4990    0.718539
4991    0.060126
4992    0.095449
4993    0.644928
4994    0.545161
4995    0.123903
4996    0.537013
4997    0.3403

In [224]:
infl = gamma_model.get_influence()
sm_fr = infl.summary_frame()

AttributeError: 'GLM' object has no attribute 'get_influence'

In [225]:
gamma_model.estimate_tweedie_power()

TypeError: estimate_tweedie_power() missing 1 required positional argument: 'mu'

<h1>Test excel model</h1>

In [243]:
y = data.Lapses
X = data.drop(['Lapses'], axis = 1)

In [245]:
X.drop(['Age', 'Area'], axis = 1, inplace=True)
X['Tenure*Category'] = X.Tenure * X.Premium_category

<h2>Statsmodels</h2>

In [258]:
X_sm = sm.add_constant(X)

binom_model = sm.GLM(y, X_sm, family=sm.families.Binomial())
binom_results = binom_model.fit()
binom_results.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Lapses,No. Observations:,5000
Model:,GLM,Df Residuals:,4996
Model Family:,Binomial,Df Model:,3
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-2905.7
Date:,"Thu, 13 Jun 2019",Deviance:,5811.4
Time:,16:36:20,Pearson chi2:,4.72e+03
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1776,0.134,-1.328,0.184,-0.440,0.084
Tenure,-0.0242,0.002,-12.336,0.000,-0.028,-0.020
Premium_category,0.1019,0.021,4.879,0.000,0.061,0.143
Tenure*Category,0.0015,0.000,5.404,0.000,0.001,0.002


In [259]:
binom_results.predict(X_sm)[:5]

0    0.138875
1    0.570694
2    0.040782
3    0.387250
4    0.393636
dtype: float64

In [264]:
binom_results.aic

5819.435292731287

<h2>Sklearn</h2>

In [260]:
model = LogisticRegression(C = 1e2, solver = 'liblinear', random_state = 57, max_iter = 1e5, tol=1e-6)
model.fit(X, y)
print(model.intercept_, model.coef_)

[-0.17716191] [[-0.02421478  0.10187181  0.00154698]]


In [263]:
pd.Series(model.predict_proba(X)[:5, 1])

0    0.138866
1    0.570708
2    0.040773
3    0.387262
4    0.393639
dtype: float64