# Exploring the functionality within the `romeo.LogisticRegression` class

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss, roc_curve, roc_auc_score, confusion_matrix
import session_info

# Render every float that pandas displays with exactly three decimals.
pd.set_option("display.float_format", lambda value: "{:.3f}".format(value))

In [2]:
# Import the local `romeo` package: switch to the directory that precedes
# "/notebook" in the current path (presumably the project root that holds
# `romeo`) for the duration of the import only.
current_wd = os.getcwd()
os.chdir(current_wd.split("/notebook")[0])
try:
    from romeo.logistic_model import LogisticRegression
finally:
    # Go back to the starting directory even when the import raises, so a
    # failed import does not leave the kernel in a different directory.
    os.chdir(current_wd)

In [3]:
session_info.show(write_req_file=False)

Fail to get yarn configuration. dyld: Library not loaded: /usr/local/opt/icu4c/lib/libicui18n.67.dylib
  Referenced from: /usr/local/bin/node
  Reason: image not found



## Make X and y data

In [4]:
# Synthetic classification data: 300 samples, 10 features, 2 of them informative.
amount_of_features = 10
amount_informative = 2
X, y = make_classification(
    n_samples=300,
    n_features=amount_of_features,
    n_informative=amount_informative,
    random_state=42,
)

In [5]:
# X = X + 100

## Build the DataFrame and split it into train and test sets

In [6]:
# One "feat_<i>" column per column of X, followed by a "target" column holding y.
df = pd.DataFrame(
    X,
    columns=[f"feat_{idx}" for idx in range(X.shape[1])],
).assign(target=y)

In [7]:
df.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,target
0,-0.025,1.452,-1.203,1.248,-0.508,0.923,0.3,-0.453,-0.148,0.327,1
1,0.75,0.098,0.664,-0.68,0.254,-0.339,-0.392,1.086,0.468,0.308,0
2,-1.236,-0.839,-0.903,0.922,0.538,0.411,-1.966,-0.365,1.073,-1.045,1
3,-1.615,-0.756,-0.722,0.729,-0.668,0.183,-0.898,-0.175,0.992,0.537,1
4,-0.724,0.633,0.952,-0.997,-0.832,-0.903,-1.516,-0.552,0.471,0.203,0


In [8]:
# Random 80% of the rows for training; the fixed seed keeps the split reproducible.
df_train = df.sample(frac=0.8, random_state=69420)
# Every row that was not sampled above goes to the test set.
df_test = df.drop(df_train.index)

In [9]:
df_train.shape, df_test.shape

((240, 11), (60, 11))

## Fit the romeo LogisticRegression

In [10]:
# Fit romeo's LogisticRegression on the training features ("feat*" columns)
# against the "target" column.
# NOTE(review): the recorded output of the next cell shows an intercept of 0.0
# and all-zero coefficients — confirm that the fit actually updates them.
reg = LogisticRegression(
    fit_intercept=True,
    normalize=True,
).fit(
    X=df_train.filter(regex="feat"),
    y=df_train["target"],
    verbose=False,
)



In [11]:
reg.intercept_, reg.coef_ 

(0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [12]:
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'LogisticRegression' object has no attribute 'summary_'",
# so a fresh run stops here. Confirm which attribute holds the fit summary
# (a later cell reads `reg.fit_evaluation_`).
reg.summary_.round(3)
#reg.score(X, y)

AttributeError: 'LogisticRegression' object has no attribute 'summary_'

In [None]:
# reg.fit_evaluation_.round(3)

## Show the available methods and attributes of the fitted model

In [None]:
np.array(dir(reg))

In [None]:
# reg.conf_int()

In [None]:
reg.df_resid

In [None]:
# Short names for the fitted parameters, used by the cells below.
intercept, coefs = reg.intercept_, reg.coef_

In [None]:
coefs

In [None]:
coefs.reshape(1,-1)

In [None]:
coefs*df_train.filter(regex="feat").head()

In [None]:
# Design matrix as a plain array: a leading column of ones (the intercept term)
# followed by the training feature columns.
tmp_x = np.column_stack((
    np.ones(len(df_train)),
    df_train.filter(regex="feat").values,
))

In [None]:
# NOTE(review): this multiplies every entry of the design matrix by the
# intercept and overwrites `tmp_x`, so re-running the cell scales it again.
# With the intercept of 0.0 shown in the recorded output above, the matrix
# becomes all zeros and the inverse taken in the next cell cannot exist —
# confirm what this step was meant to compute.
tmp_x = reg.intercept_ * tmp_x

In [None]:
# np.linalg.inv(np.dot(tmp_x.T, tmp_x))

In [None]:
# Inverse of X.T @ X, i.e. the (X.T X)^(-1) term without the scale factor.
cov_mat = np.linalg.inv(tmp_x.T @ tmp_x)

In [None]:
# abs(cov_mat)

In [None]:
# np.sqrt(np.diag(abs(cov_mat)))

In [None]:
# pd.DataFrame(np.cov(tmp_x))

In [None]:
# Statsmodels cov_params
"""
Notes
-----
(The below are assumed to be in matrix notation.)
If no argument is specified returns the covariance matrix of a model
``(scale)*(X.T X)^(-1)``
If contrast is specified it pre and post-multiplies as follows
``(scale) * r_matrix (X.T X)^(-1) r_matrix.T``
If contrast and other are specified returns
``(scale) * r_matrix (X.T X)^(-1) other.T``
If column is specified returns
``(scale) * (X.T X)^(-1)[column,column]`` if column is 0d
OR
``(scale) * (X.T X)^(-1)[column][:,column]`` if column is 1d
"""

## Plot the original data and the lines of best fit for each of the predictors on the target

In [None]:
# One series per feature column: the observed targets as markers, and
# intercept + coefficient * feature drawn as red lines.
fig, ax = plt.subplots(figsize=(12, 7))
_ = ax.plot(
    df_train.filter(regex="feat"),
    df_train["target"],
    'o',
    label='original data',
)
_ = ax.plot(
    df_train.filter(regex="feat"),
    intercept + coefs * df_train.filter(regex="feat"),
    'r',
    label='fitted line',
)

## Use the fitted model to predict the test data

In [None]:
# Predict the held-out rows with the fitted romeo model and show the first two predictions.
y_pred = reg.predict(df_test.filter(regex="feat"))
y_pred.head(2)

In [None]:
# True labels of the held-out rows, kept as a one-column DataFrame.
y_test = df_test[["target"]]
y_test.head(2)

## Calculate the log loss between the predicted and the real values

In [None]:
# Log loss of the current predictions against the true test labels.
log_loss_score = log_loss(y_test["target"], y_pred)
log_loss_score

## Plot the predicted vs the real values

In [None]:
# Round the predictions to whole numbers, then tabulate them against the true labels.
c_mat = confusion_matrix(y_test["target"], y_pred.round(0))

In [None]:
# Heatmap of the confusion matrix: rows are the true labels, columns the
# predicted labels.
cmap = 'Blues'
categories = ["0", "1"]

_ = sns.heatmap(
    c_mat,
    annot=True,
    # The cells are integer counts; the default annotation format would show
    # counts of 100 or more as "1e+02".
    fmt="d",
    cmap=cmap,
    xticklabels=categories,
    yticklabels=categories,
)
_ = plt.title("Confusion Matrix")
_ = plt.xlabel("Predicted value")
_ = plt.ylabel("True value")

In [None]:
# _ = plt.figure(figsize=(12, 7))
# _ = plt.plot(y_test["target"], 
#              y_pred, 
#              'o', 
# #              label='original data'
#             )
# _ = plt.annotate(text = f"Log Loss = {round(log_loss, 5)}",
#                  xy=(0.9, 0.1),)

In [None]:
# ROC curve points for the current predictions, with label 1 as the positive class.
fpr, tpr, thresh = roc_curve(
    y_true=y_test["target"],
    y_score=y_pred,
    pos_label=1,
    sample_weight=None,
    drop_intermediate=False,
)

In [None]:
auc_score = roc_auc_score(y_true=y_test["target"], y_score=y_pred)

In [None]:
# ROC curve: a line through the (fpr, tpr) points, with each point marked in orange.
_ = plt.figure(figsize=(12, 7))
_ = plt.plot(fpr, tpr)
_ = plt.plot(fpr, tpr, 'o', c="orange")

# Label the axes and title the figure so it can be read on its own.
_ = plt.xlabel("False positive rate")
_ = plt.ylabel("True positive rate")
_ = plt.title("ROC curve")

_ = plt.annotate(text=f"AUC = {round(auc_score, 5)}", xy=(0.9, 0.1))

# Statsmodels

In [None]:
import statsmodels
# from statsmodels.regression.linear_model import OLS
from statsmodels.discrete.discrete_model import Logit
from scipy import stats

In [None]:
# df_train["target"]
# statsmodels.tools.tools.add_constant(data=df_train.filter(regex="feat"), prepend=True, has_constant='skip')

In [None]:
# Reference fit: statsmodels Logit on the same training rows, with a constant
# column prepended to the features.
res = Logit(
    endog=df_train["target"],
    exog=statsmodels.tools.tools.add_constant(
        data=df_train.filter(regex="feat"),
        prepend=True,
        has_constant='skip',
    ),
).fit()

In [None]:
coefs

In [None]:
res.params

In [None]:
# def bse(self):
#         return np.sqrt(np.diag(self.cov_params()))

In [None]:
np.round(res.bse, 3)

In [None]:
np.sqrt(np.diag(res.cov_params()))

In [None]:
res.cov_params()

In [None]:
np.array(dir(res))

In [None]:
res.summary()

In [None]:
# NOTE(review): an earlier cell with this same expression recorded
# "AttributeError: 'LogisticRegression' object has no attribute 'summary_'" —
# confirm the attribute name before relying on this comparison.
reg.summary_.round(3)

In [None]:
# NOTE(review): this rebinds `y_pred`, which until now held the romeo model's
# predictions; every evaluation cell below therefore scores the statsmodels fit.
y_pred = res.predict(exog=statsmodels.tools.tools.add_constant(data=df_test.filter(regex="feat"), prepend=True, has_constant='skip'))

## Plot the predicted vs the real values

In [None]:
# Round the predictions to whole numbers, then tabulate them against the true labels.
c_mat = confusion_matrix(y_test["target"], y_pred.round(0),)

In [None]:
# Heatmap of the confusion matrix: rows are the true labels, columns the
# predicted labels.
cmap = 'Blues'
categories = ["0", "1"]

_ = sns.heatmap(
    c_mat,
    annot=True,
    # The cells are integer counts; the default annotation format would show
    # counts of 100 or more as "1e+02".
    fmt="d",
    cmap=cmap,
    xticklabels=categories,
    yticklabels=categories,
)
_ = plt.title("Confusion Matrix")
_ = plt.xlabel("Predicted value")
_ = plt.ylabel("True value")

In [None]:
# Log loss of the current predictions against the true test labels.
log_loss_score = log_loss(y_test["target"], y_pred)
log_loss_score

In [None]:
# _ = plt.figure(figsize=(12, 7))
# _ = plt.plot(y_test["target"], 
#              y_pred, 
#              'o', 
# #              label='original data'
#             )
# _ = plt.annotate(text = f"Log Loss = {round(log_loss, 5)}",
#                  xy=(0.9, 0.1),)

In [None]:
# ROC curve points for the current predictions, with label 1 as the positive class.
fpr, tpr, thresh = roc_curve(
    y_true=y_test["target"],
    y_score=y_pred,
    pos_label=1,
    sample_weight=None,
    drop_intermediate=False,
)

In [None]:
auc_score = roc_auc_score(y_true=y_test["target"], y_score=y_pred)

In [None]:
# ROC curve: a line through the (fpr, tpr) points, with each point marked in orange.
_ = plt.figure(figsize=(12, 7))
_ = plt.plot(fpr, tpr)
_ = plt.plot(fpr, tpr, 'o', c="orange")

# Label the axes and title the figure so it can be read on its own.
_ = plt.xlabel("False positive rate")
_ = plt.ylabel("True positive rate")
_ = plt.title("ROC curve")

_ = plt.annotate(text=f"AUC = {round(auc_score, 5)}", xy=(0.9, 0.1))

In [None]:
reg.fit_evaluation_.round(4)

In [None]:
# Deliberate halt for "Run All": raising here stops execution just as the
# previous invalid statement did, while keeping the notebook parseable.
# Presumably the cells below are scratch explorations — confirm.
raise RuntimeError("Stop here: the cells below are not part of the main run.")

In [None]:
res.llf

In [None]:
res.params / res.bse

In [None]:
res.tvalues

In [None]:
res.pvalues

In [None]:
res.df_resid

In [None]:
# Two-sided p-values recomputed by hand, for comparison with `res.pvalues`.
# Logit results are based on the normal distribution by default, so the tail
# probability is taken from the standard normal rather than from Student's t.
stats.norm.sf(np.abs(res.tvalues)) * 2

In [None]:
type(res.summary())

In [None]:
res.conf_int()

In [None]:
# Discrete-model results expose McFadden's pseudo R-squared as `prsquared`;
# `rsquared` belongs to the least-squares results classes.
res.prsquared

In [None]:
# NOTE(review): a sum of squared residuals is a least-squares quantity —
# confirm that the Logit results object provides `ssr`; this may be left over
# from the OLS variant hinted at by the commented-out OLS import.
res.ssr

In [None]:
# NOTE(review): confirm that the Logit results object provides a plain `resid`;
# this may be left over from the OLS variant hinted at by the commented-out
# OLS import, and the residual type wanted here needs to be decided.
res.resid

In [None]:
reg.resid