In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# !pip install mord
from mord import LogisticIT
import matplotlib.pylab as plt
import seaborn as sns

# !pip install dmba
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score


  from pandas import Int64Index as NumericIndex


In [2]:
# Read the riding-mower survey from the working directory, then preview
# the last five rows to confirm the columns loaded as expected.
mowers = pd.read_csv(Path("RidingMowers.csv"))
mowers.tail(n=5)

Unnamed: 0,Income,Lot_Size,Ownership
19,66.0,18.4,Nonowner
20,47.4,16.4,Nonowner
21,33.0,18.8,Nonowner
22,51.0,14.0,Nonowner
23,63.0,14.8,Nonowner


## 1. What percentage of households in the study were owners of a riding mower? 

In [3]:
# Fraction of households in each Ownership class: rows per class divided
# by the total number of rows in the study.
mowers.groupby(['Ownership']).size().div(len(mowers))

Ownership
Nonowner    0.5
Owner       0.5
dtype: float64

**50% of households in the study were owners of a riding mower**

## 2. Use all the data to fit a logistic regression of ownership on the two predictors. Remember to create dummy variables, if appropriate.

In [4]:
# Predictors: every column except the response.
X = mowers.drop(columns=['Ownership'])

# Response: dummy-code Ownership and drop the first level, leaving a
# one-column frame named "Owner" (1 = Owner, 0 = Nonowner).
y = pd.get_dummies(mowers['Ownership'], drop_first=True)

# Hold out 40% of the rows for validation; the fixed seed keeps the
# partition reproducible.
# NOTE(review): question 2 asks for a fit on *all* the data, but the model
# below is fit on this 60% training partition only - confirm intent.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [5]:
# Logistic regression of ownership on the predictors. C is the inverse of
# the regularisation strength, so C=1e42 makes the penalty negligible and
# the fit approximates ordinary maximum likelihood.
logit_reg = LogisticRegression(solver='liblinear', C=1e42, random_state=1)

# train_y is a one-column DataFrame; flatten it to the 1-D array that
# scikit-learn expects so fit() does not emit a DataConversionWarning.
logit_reg.fit(train_X, np.ravel(train_y))

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1e+42, random_state=1, solver='liblinear')

In [6]:
# Fitted intercept (constant term) of the logistic model.
logit_reg.intercept_

array([-15.43224502])

In [7]:
# Fitted coefficient for each predictor, labelled with the predictor's
# column name. Left as the cell's last expression (rather than print) so
# the notebook renders it as a table.
pd.DataFrame({'coef': logit_reg.coef_[0]}, index=X.columns)

              coef
Income    0.085550
Lot_Size  0.510334


## 3. Among nonowners, what is the percentage of households classified correctly? 

In [8]:
# Score the validation partition: class probabilities and hard labels.
logit_reg_prob = logit_reg.predict_proba(valid_X)
logit_reg_pred = logit_reg.predict(valid_X)

# Put actual class, both class probabilities, and the predicted class
# side by side, one row per validation household.
logit_result = pd.DataFrame(
    {
        'actual': valid_y['Owner'],
        'p_0': logit_reg_prob[:, 0],
        'p_1': logit_reg_prob[:, 1],
        'predicted': logit_reg_pred,
    }
)
logit_result

Unnamed: 0,actual,p_0,p_1,predicted
13,0,0.574509,0.425491,0
18,0,0.898917,0.101083,0
3,1,0.390786,0.609214,1
14,0,0.752292,0.247708,0
20,0,0.952921,0.047079,0
17,0,0.903899,0.096101,0
10,1,0.460547,0.539453,1
4,1,0.017049,0.982951,1
2,1,0.243314,0.756686,1
19,0,0.597682,0.402318,0


In [9]:
# Confusion matrix on the validation partition. Classes are encoded
# 0 = Nonowner, 1 = Owner (the dummy column kept is "Owner"), so label the
# rows/columns by name to make the per-class counts readable.
classificationSummary(
    valid_y,
    logit_reg.predict(valid_X),
    class_names=['Nonowner', 'Owner'],
)

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual 0 1
     0 6 0
     1 0 4


**All 6 nonowners in the validation set (100%) are classified correctly.**

## 4. What is the classification of a household with a $60K income and a lot size of 20,000 ft²? Use cutoff = 0.5.


In [10]:
# Household with a $60K income and a 20,000 sq ft lot, in the same units
# as the training columns. Passing a named one-row DataFrame (instead of a
# bare array) matches the feature names the model was fitted with, which
# avoids scikit-learn's feature-name warning and any reliance on column
# order. predict() applies the default 0.5 probability cutoff.
new_household = pd.DataFrame({'Income': [60], 'Lot_Size': [20]})
logit_reg.predict(new_household)



array([0], dtype=uint8)

## 5. What are the odds that a household with a $60K income and a lot size of 20,000 ft² is an owner?

In [11]:
# Same $60K / 20,000 sq ft household, passed as a named one-row DataFrame
# so the columns line up with the features the model was fitted on.
new_household = pd.DataFrame({'Income': [60], 'Lot_Size': [20]})
prob = logit_reg.predict_proba(new_household)

# Odds of ownership = P(Owner) / P(Nonowner); column 1 is class 1 (Owner),
# column 0 is class 0 (Nonowner).
prob[0, 1] / prob[0, 0]



0.9115760528678523