In [1]:
import pandas as pd 
import numpy as np

In [2]:
#randomise some numbers
np.random.seed(42)

In [5]:
fastball_speed = np.random.randint(90, 106 , size = 500)


In [21]:
#creating random numbers 0 or 1 based on the fast_ball speed
high_speed_risk = np.random.choice([0, 1], p=[0.3, 0.7] , size= 500)
low_risk =0

tommy_john = np.where(fastball_speed > 96 , high_speed_risk , low_risk)

In [22]:
d = {
    'fastball_speed' :fastball_speed,
    'tommy_john' : tommy_john
}

In [23]:
df = pd.DataFrame(data = d)

In [13]:
df

Unnamed: 0,fastball_speed,tommy_john
0,95,0
1,104,1
2,90,0
3,98,1
4,90,0
...,...,...
495,94,0
496,97,1
497,97,1
498,92,0


In [24]:
X = df[['fastball_speed']]
y = df[['tommy_john']]

In [25]:
#normal train test split 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [26]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size= 0.2 , random_state=11)

In [27]:
lr = LogisticRegression()

In [28]:
lr.fit(X_train , y_train.values.ravel())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [30]:
y_train.values.ravel().shape

(400,)

In [31]:
lr.score(X_test , y_test)

0.73

In [32]:
#Test agagin 
X2_train , X2_test , y2_train , y2_test = train_test_split(X , y , test_size= 0.2 , random_state=25)

In [33]:
lr.fit(X2_train , y2_train.values.ravel())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [34]:
lr.score(X2_test, y2_test)

0.8

BASIC CROSS VALIDATION

In [35]:
from sklearn.model_selection import cross_val_score

In [None]:
#cross validation with k_folds = 10 
cvs = cross_val_score(lr,X , y.values.ravel() , cv = 10)

In [37]:
cvs

array([0.8 , 0.78, 0.82, 0.68, 0.82, 0.8 , 0.74, 0.72, 0.66, 0.78])

In [38]:
np.average(cvs)

np.float64(0.76)

In [39]:
np.std(cvs)

np.float64(0.05440588203494176)

Define Standard Kfold

In [40]:
from sklearn.model_selection import KFold

In [41]:
kf = KFold(n_splits= 15 , shuffle= True , random_state= 42)

In [44]:
kfscore = cross_val_score(lr , X , y.values.ravel() , cv = kf , scoring='f1')

In [45]:
kfscore2 = cross_val_score(lr , X , y.values.ravel() , cv = kf , scoring='accuracy')


In [46]:
kfscore

array([0.83333333, 0.58333333, 0.66666667, 0.77419355, 0.62068966,
       0.69230769, 0.47619048, 0.69565217, 0.57142857, 0.57142857,
       0.7       , 0.84615385, 0.63157895, 0.75862069, 0.60869565])

In [47]:
np.average(kfscore)

np.float64(0.6686848771675034)

In [48]:
np.std(kfscore)

np.float64(0.10020995248378999)

In [49]:
kfscore2

array([0.88235294, 0.70588235, 0.76470588, 0.79411765, 0.67647059,
       0.75757576, 0.66666667, 0.78787879, 0.63636364, 0.72727273,
       0.81818182, 0.87878788, 0.78787879, 0.78787879, 0.72727273])

In [50]:
np.average(kfscore2)

np.float64(0.7599524658348186)

In [51]:
np.std(kfscore2)

np.float64(0.06929872612355148)

Stratified K fold

In [52]:
from sklearn.model_selection import StratifiedKFold

In [53]:
kf3 = StratifiedKFold(n_splits= 10 , shuffle= True , random_state= 11 )

In [54]:
kfscore3 = cross_val_score(lr,X,y.values.ravel() , cv = kf3)

In [55]:
kfscore3

array([0.84, 0.84, 0.72, 0.74, 0.78, 0.7 , 0.64, 0.76, 0.78, 0.8 ])

In [56]:
np.average(kfscore3)

np.float64(0.76)

In [57]:
np.std(kfscore3)

np.float64(0.059329587896765304)

PIPELINE EXAMPLES

In [58]:
from sklearn.preprocessing import StandardScaler

In [59]:
scaler = StandardScaler()


In [60]:
from sklearn.pipeline import make_pipeline

In [61]:
pipe1 = make_pipeline (scaler ,lr)

In [62]:
pipe1.fit(X_train ,y_train.values.ravel())

0,1,2
,steps,"[('standardscaler', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [63]:
score_pipe = cross_val_score (pipe1 , X , y.values.ravel() ,cv = 10 )

In [64]:
score_pipe

array([0.8 , 0.78, 0.82, 0.68, 0.82, 0.8 , 0.74, 0.72, 0.66, 0.78])

In [65]:
np.average(score_pipe)

np.float64(0.76)

In [66]:
np.std(score_pipe)

np.float64(0.05440588203494176)