Statistics for Machine Learning
By: Pratap Dangeti
Publisher: Packt Publishing
Pub. Date: July 21, 2017
Web ISBN-13: 978-1-78829-122-4
Print ISBN-13: 978-1-78829-575-8
Pages in Print Edition: 442       

In [1]:
import os
import time

""" First change the following directory link to where all input files do exist """
os.chdir("C:\dls\Machine Learning\data")

In [2]:
import numpy as np
from scipy import stats

startTime = time.time()
data = np.array([4,5,1,2,7,2,6,9,3])

# Calculate Mean
dt_mean = np.mean(data) ; print ("Mean :",round(dt_mean,2))
              
# Calculate Median                 
dt_median = np.median(data) ; print ("Median :",dt_median)        

# Calculate Mode                     
dt_mode =  stats.mode(data); print ("Mode :",dt_mode[0][0])  
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Mean : 4.33
Median : 4.0
Mode : 2
This took 0.0010013580322265625 seconds to calculate.


In [3]:
# Deviance calculations

import numpy as np
from statistics import variance,stdev

startTime = time.time()
game_points = np.array([35,56,43,59,63,79,35,41,64,43,93,60,77,24,82])

# Calculate Variance
dt_var = variance(game_points) ; print ("Sample variance:", round(dt_var,2))

# Calculate Standard Deviation
dt_std = stdev(game_points) ; print ("Sample std.dev:",round(dt_std,2))
               
# Calculate Range
dt_rng = np.max(game_points,axis=0) - np.min(game_points,axis=0) ; print ("Range:",dt_rng)


#Calculate percentiles
print ("Quantiles:")
for val in [20,80,100]:
    dt_qntls = np.percentile(game_points,val) 
    print (str(val)+"%" ,dt_qntls)
                                
# Calculate IQR                           
q75, q25 = np.percentile(game_points, [75 ,25]); print ("Inter quartile range:",q75-q25 )
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Sample variance: 400
Sample std.dev: 20.0
Range: 69
Quantiles:
20% 39.8
80% 77.4
100% 93.0
Inter quartile range: 28.5
This took 0.003005504608154297 seconds to calculate.


In [4]:
# Hypothesis testing
#import scipy                       
          
from scipy import stats              

startTime = time.time()
xbar = 990; mu0 = 1000; s = 12.5; n = 30
# Test Statistic
t_smple  = (xbar-mu0)/(s/np.sqrt(float(n))); print ("Test Statistic:",round(t_smple,2))
# Critical value from t-table
alpha = 0.05
t_alpha = stats.t.ppf(alpha,n-1); print ("Critical value from t-table:",round(t_alpha,3))          
#Lower tail p-value from t-table                        
p_val = stats.t.sf(np.abs(t_smple), n-1); print ("Lower tail p-value from t-table", p_val)                        
                      

# Normal Distribution
from scipy import stats
xbar = 67; mu0 = 52; s = 16.3

# Calculating z-score
z = (67-52)/16.3

# Calculating probability under the curve    
p_val = 1- stats.norm.cdf(z)
print ("Prob. to score more than 67 is ",round(p_val*100,2),"%")
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Test Statistic: -4.38
Critical value from t-table: -1.699
Lower tail p-value from t-table 7.03502572901e-05
Prob. to score more than 67 is  17.87 %
This took 0.0020728111267089844 seconds to calculate.


In [5]:
# Chi-square independence test
import pandas as pd
from scipy import stats

startTime = time.time()
survey = pd.read_csv("survey.csv")  
# Tabulating 2 variables with row & column variables respectively
survey_tab = pd.crosstab(survey.Smoke, survey.Exer, margins = True)
# Creating observed table for analysis
# book code had ix instead of iloc but it has been deprecated -Dave
observed = survey_tab.iloc[0:4,0:3] 

contg = stats.chi2_contingency(observed= observed)
p_value = round(contg[1],3)
print ("P-value is: ",p_value)
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

P-value is:  0.483
This took 0.029573678970336914 seconds to calculate.


In [6]:
#ANOVA
import pandas as pd
from scipy import stats

startTime = time.time()
fetilizers = pd.read_csv("fetilizers.csv")

one_way_anova = stats.f_oneway(fetilizers["fertilizer1"], fetilizers["fertilizer2"], fetilizers["fertilizer3"])

print ("Statistic :", round(one_way_anova[0],2),", p-value :",round(one_way_anova[1],3))
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Statistic : 3.66 , p-value : 0.051
This took 0.001993417739868164 seconds to calculate.


In [7]:
# Train & Test split
import pandas as pd      
from sklearn.model_selection import train_test_split              

startTime = time.time()
original_data = pd.read_csv("mtcars.csv")     

train_data,test_data = train_test_split(original_data,train_size = 0.7,random_state=42)
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

This took 0.0030045509338378906 seconds to calculate.


In [8]:
# Linear Regressio vs. Gradient Descent             
               
import numpy as np                        
import pandas as pd
                       
startTime = time.time()
train_data = pd.read_csv("mtcars.csv")                       
                        
X = np.array(train_data["hp"])  ; y = np.array(train_data["mpg"]) 
X = X.reshape(32,1); y = y.reshape(32,1)

from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept = True) 
 
model.fit(X,y)       
print ("Linear Regression Results")        
print ("Intercept",model.intercept_[0] ,"Coefficient",model.coef_[0])  
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Linear Regression Results
Intercept 30.0988605396 Coefficient [-0.06822828]
This took 0.3032088279724121 seconds to calculate.


In [9]:
 def gradient_descent(x, y,learn_rate, conv_threshold,batch_size,max_iter):    
    converged = False
    iter = 0
    m = batch_size 
 
    t0 = np.random.random(x.shape[1])
    t1 = np.random.random(x.shape[1])

    MSE = (sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])/ m)    

    while not converged:        
        grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)]) 
        grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)])

        temp0 = t0 - learn_rate * grad0
        temp1 = t1 - learn_rate * grad1
    
        t0 = temp0
        t1 = temp1

        MSE_New = (sum( [ (t0 + t1*x[i] - y[i])**2 for i in range(m)] ) / m)

        if abs(MSE - MSE_New ) <= conv_threshold:
            print ('Converged, iterations: ', iter)
            converged = True
    
        MSE = MSE_New   
        iter += 1 
    
        if iter == max_iter:
            print ('Max interactions reached')
            converged = True

    return t0,t1

startTime = time.time()
if __name__ == '__main__':
    Inter, Coeff = gradient_descent(x = X,y = y,learn_rate=0.00003 ,conv_threshold=1e-8, batch_size=32,max_iter=1500000)
    print ("Gradient Descent Results")
    print (('Intercept = %s Coefficient = %s') %(Inter, Coeff)) 
    
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Converged, iterations:  1143728
Gradient Descent Results
Intercept = [ 30.02495127] Coefficient = [-0.06781243]
This took 715.7425575256348 seconds to calculate.


In [10]:
# Train Validation Test split      

import pandas as pd      
from sklearn.model_selection import train_test_split              
                        
original_data = pd.read_csv("mtcars.csv")                   
 

def data_split(dat,trf = 0.5,vlf=0.25,tsf = 0.25):
    nrows = dat.shape[0]    
    trnr = int(nrows*trf)
    vlnr = int(nrows*vlf)    
    
    tr_data,rmng = train_test_split(dat,train_size = trnr,random_state=42)
    vl_data, ts_data = train_test_split(rmng,train_size = vlnr,random_state=45)  
    
    return (tr_data,vl_data,ts_data)

startTime = time.time()
train_data, validation_data, test_data = data_split(original_data,trf=0.5,vlf=0.25,tsf=0.25)
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

This took 0.004014730453491211 seconds to calculate.


In [11]:
# Grid search on Decision Trees
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline


startTime = time.time()
input_data = pd.read_csv("ad.csv",header=None)                       

X_columns = set(input_data.columns.values)
y = input_data[len(input_data.columns.values)-1]
X_columns.remove(len(input_data.columns.values)-1)
X = input_data[list(X_columns)]

X_train, X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,random_state=33)

pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])
parameters = {
    'clf__max_depth': (50,100,150),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

print ('\n Best score: \n', grid_search.best_score_)
print ('\n Best parameters set: \n')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
print ("\n Confusion Matrix on Test data \n",confusion_matrix(y_test,y_pred))
print ("\n Test Accuracy \n",accuracy_score(y_test,y_pred))
print ("\nPrecision Recall f1 table \n",classification_report(y_test, y_pred))
endTime = time.time()
print('This took %s seconds to calculate.' % (endTime - startTime))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   30.5s finished



 Best score: 
 0.967320261438

 Best parameters set: 

	clf__max_depth: 100
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2

 Confusion Matrix on Test data 
 [[816  17]
 [ 16 135]]

 Test Accuracy 
 0.966463414634

Precision Recall f1 table 
              precision    recall  f1-score   support

          0       0.98      0.98      0.98       833
          1       0.89      0.89      0.89       151

avg / total       0.97      0.97      0.97       984

This took 32.37725830078125 seconds to calculate.
