# Warmups

## Classification 12/16

In [17]:
from pydataset import data
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time

In [129]:
data('voteincome', show_doc=True)

voteincome

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Sample Turnout and Demographic Data from the 2000 Current Population Survey

### Description

This data set contains turnout and demographic data from a sample of
respondents to the 2000 Current Population Survey (CPS). The states
represented are South Carolina and Arkansas. The data represent only a sample
and results from this example should not be used in publication.

### Usage

    data(voteincome)

### Format

A data frame containing 7 variables ("state", "year", "vote", "income",
"education", "age", "female") and 1500 observations.

`state`

a factor variable with levels equal to "AR" (Arkansas) and "SC" (South
Carolina)

`year`

an integer vector

`vote`

an integer vector taking on values "1" (Voted) and "0" (Did Not Vote)

`income`

an integer vector ranging from "4" (Less than \$5000) to "17" (Greater than
\$75000) denoting family income. See the CPS codebook for more info

In [130]:
df = data('voteincome')
df.head()

Unnamed: 0,state,year,vote,income,education,age,female
1,AR,2000,1,9,2,73,0
2,AR,2000,1,11,2,24,0
3,AR,2000,0,12,2,24,1
4,AR,2000,1,16,4,40,0
5,AR,2000,1,10,4,85,1


In [131]:
df.drop(columns=['state','year'], inplace = True)

In [132]:
train, test = train_test_split(df, stratify = df.vote)
x = df.drop(columns=['vote']).columns
y = 'vote'

In [133]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train[x], train[y])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [134]:
yhat = knn.predict(train[x])

In [135]:
(train[y] == yhat).sum()/len(train[y])

0.9751111111111112

In [136]:
print(classification_report(train[y], yhat))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91       163
           1       0.98      0.99      0.99       962

    accuracy                           0.98      1125
   macro avg       0.97      0.93      0.95      1125
weighted avg       0.97      0.98      0.97      1125



In [137]:
yhat = knn.predict(test[x])
(test[y] == yhat).sum()/len(test[y])

0.8773333333333333

In [138]:
print(classification_report(test[y], yhat))

              precision    recall  f1-score   support

           0       0.58      0.54      0.56        54
           1       0.92      0.93      0.93       321

    accuracy                           0.88       375
   macro avg       0.75      0.74      0.74       375
weighted avg       0.87      0.88      0.88       375



###### Scaled Data

In [139]:
scaler = StandardScaler().fit(df[x])
df_scaled = pd.DataFrame(scaler.transform(df[x]), columns=x).join(df[y].reset_index())
df_scaled

Unnamed: 0,income,education,age,female,index,vote
0,-0.884952,-0.638144,1.359190,-1.126627,1,1
1,-0.374010,-0.638144,-1.446372,-1.126627,2,1
2,-0.118539,-0.638144,-1.446372,0.887605,3,0
3,0.903346,1.321356,-0.530270,-1.126627,4,1
4,-0.629481,1.321356,2.046266,0.887605,5,1
...,...,...,...,...,...,...
1495,1.158817,1.321356,0.214062,0.887605,1496,1
1496,-0.629481,-0.638144,-1.446372,-1.126627,1497,1
1497,-0.118539,0.341606,-0.186732,-1.126627,1498,1
1498,-0.118539,-1.617893,1.702728,0.887605,1499,0


In [140]:
train, test = train_test_split(df_scaled, stratify = df_scaled.vote)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train[x], train[y])
yhat = knn.predict(train[x])
print(classification_report(train[y], yhat))

              precision    recall  f1-score   support

           0       0.96      0.91      0.93       163
           1       0.99      0.99      0.99       962

    accuracy                           0.98      1125
   macro avg       0.97      0.95      0.96      1125
weighted avg       0.98      0.98      0.98      1125



In [143]:
yhat = knn.predict(test[x])
print(classification_report(test[y], yhat))

              precision    recall  f1-score   support

           0       0.42      0.46      0.44        54
           1       0.91      0.89      0.90       321

    accuracy                           0.83       375
   macro avg       0.67      0.68      0.67       375
weighted avg       0.84      0.83      0.84       375



In [142]:
df.vote.mean()

0.8553333333333333

In [144]:
from sklearn import __version__

1/6/2020

It's another day at the office at Big Research Co ™. You look up from your laptop and see a woman in a lab coat standing in front of your desk.

"I need some help" she says. "We lost some subjects from the trial."

She notices a curious look on your face. "Not like that, they just ran away. We didn't lock the doors soon enough."

"Anyway, there's probably like a 70%, no maybe 80%, no, let's say 90% chance that a given subject will stick around, and I need to run the study again with 10, or 20 subjects. We need to gather enough data on them to justify the cost, so I need you to figure out what are the probabilities are that at least half of them stick around, only 1 person leaves, and that all the subjects stay."

She sees you start to form another question and cuts you off.

"Don't ask. You really don't want to know."

- What probability distribution would you use to model the scenario outlined above?

> **Binomial**

- Calculate all the requested probabilities.

- Use all the possible combinations of subject count and chance that a subject will stay in the study. For example, at first calculate the chance that at least half of the subjects stay in the study if there is a 70% that each subject sticks around, and there are 10 subjects, then the probability that only one person leaves, then the probability that all the subjects stay.

- Bonus: visualize the requested probabilities.

<hr></hr>

**10 subjects**

In [29]:
from scipy.stats import binom

N = [10,20]
P = [.7,.8,.9]
for n in N:
    for p in P:
        dist = binom(n,p)
        #half stick around
        half = dist.sf(n/2)
        #one leaves
        one =  dist.pmf(n-1)
        #none leave
        none = dist.pmf(n)
        print(f''' 
        With {n} subjects, {p} probability of each leaving
        there is a {half} chance that half the subjects will leave
        there is a {one} chance that only one subject will leave
        there is a {none} chance that no subject will leave
        
        ''') 

 
        With 10 subjects, 0.7 probability of each leaving
        there is a 0.8497316674 chance that half the subjects will leave
        there is a 0.12106082100000007 chance that only one subject will leave
        there is a 0.02824752489999998 chance that no subject will leave
        
        
 
        With 10 subjects, 0.8 probability of each leaving
        there is a 0.9672065024000001 chance that half the subjects will leave
        there is a 0.26843545600000035 chance that only one subject will leave
        there is a 0.10737418240000005 chance that no subject will leave
        
        
 
        With 10 subjects, 0.9 probability of each leaving
        there is a 0.9983650626 chance that half the subjects will leave
        there is a 0.38742048900000037 chance that only one subject will leave
        there is a 0.34867844010000004 chance that no subject will leave
        
        
 
        With 20 subjects, 0.7 probability of each leaving
        there is a 0.9520

In [41]:
probs = []
N = [10,20]
P = [.7,.8,.9]
for n in N:
    for p in P:
        dist = binom(n,p)
        #half stick around
        half = dist.sf(n/2)
        #one leaves
        one =  dist.pmf(n-1)
        #none leave
        none = dist.pmf(n)
        probs.append({'p' : p, 'n' : n, 'half' : half, 'one' : one, 'none' : none})
        
df = pd.DataFrame(probs)
plt.bar()

Unnamed: 0,p,n,half,one,none
0,0.7,10,0.849732,0.121061,0.028248
1,0.8,10,0.967207,0.268435,0.107374
2,0.9,10,0.998365,0.38742,0.348678
3,0.7,20,0.952038,0.006839,0.000798
4,0.8,20,0.997405,0.057646,0.011529
5,0.9,20,0.999993,0.27017,0.121577
