# Hyper_Parameter Tunning

The process of finding the best parameters for a model is called the hyper_parameter tunning

In [1]:
import pandas as pd 
import numpy as np
from sklearn.datasets import load_iris

# Iris Data Set

In [2]:
iris = load_iris()

In [3]:
dir(iris)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
iris.data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [5]:
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
df['flower']=df['target'].apply( lambda x : iris.target_names[x])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,flower
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [8]:
input = df.drop(['target','flower'],axis=1)
input.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [9]:
target = df.target
target.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(input,target,test_size=0.3)

In [12]:
len(X_train)

105

In [13]:
len(X_test )

45

In [14]:
from sklearn.svm import SVC

In [15]:
model=SVC(kernel="linear",C=10,gamma="auto")

In [16]:
model.fit(X_train,y_train)

SVC(C=10, gamma='auto', kernel='linear')

In [17]:
model.score(X_test,y_test)

0.9555555555555556

When ever we run the train_test_split the score changes ... So thats not a good approach

In [18]:
from sklearn.model_selection import cross_val_score

**Manualy tunning parameters agin and again by hit and trail method** 

In [19]:
clf=cross_val_score(SVC(kernel="linear",C=10,gamma="auto"),X_train,y_train,cv=3)
clf

array([0.97142857, 0.94285714, 0.94285714])

In [20]:
clf.mean()

0.9523809523809522

In [21]:
np.average(clf)

0.9523809523809522

In [22]:
clf=cross_val_score(SVC(kernel="rbf",C=50,gamma="auto"),X_train,y_train,cv=3) #  kernel -> linear , rbf + C -> integer + gamma -> auto , float
clf

array([0.97142857, 0.91428571, 0.91428571])

In [23]:
clf.mean()

0.9333333333333332

To avoid again and again writing the same  line ... run a loop  ... But it has flaw that two loops are running and the memory taken a lot and speed also consumes 

In [24]:
kernel_val=["linear","rbf"]
C_val=[10,30,50]
score_result={} # ye dictionary 
for k in kernel_val:
    for c in C_val:
        clf=cross_val_score(SVC(kernel=k,C=c,gamma="auto"),X_train,y_train,cv=3)
        score_result[k+"_"+str(c)]=np.average(clf) # dic mai adding elements

In [25]:
score_result

{'linear_10': 0.9523809523809522,
 'linear_30': 0.9333333333333332,
 'linear_50': 0.9428571428571427,
 'rbf_10': 0.9714285714285715,
 'rbf_30': 0.9428571428571427,
 'rbf_50': 0.9333333333333332}

Isi kam k liye sklearn provide a library 

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
clf = GridSearchCV(SVC(gamma="auto"),{    # provide the param grid ( dic ) 
    "kernel":["linear","rbf"],
    "C":[10,30,50]
},cv=3,return_train_score=False)  # note is mai hum ne cross_val_score ki trah x and y ni dala ... wo agy fit krte waqt

**Problem : Computation Cost**

Is ke sath bhi 1 problem ... It will run for all the permutations and the combinations of the provided parameters using the cross_val_score ... So , ye bhi wo cross_val_score use ... Is ke name mai bhi CV ... GridSearchCV ...   

In [28]:
clf.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=SVC(gamma='auto'),
             param_grid={'C': [10, 30, 50], 'kernel': ['linear', 'rbf']})

In [29]:
clf.cv_results_ # not better visualizes 

{'mean_fit_time': array([0.00664838, 0.00332451, 0.00199445, 0.00166233, 0.00099746,
        0.00132966]),
 'std_fit_time': array([3.76045422e-03, 9.40211902e-04, 1.12391596e-07, 4.70077860e-04,
        4.05233662e-07, 4.70190252e-04]),
 'mean_score_time': array([0.00398906, 0.00232657, 0.00099738, 0.00099746, 0.00099715,
        0.00133006]),
 'std_score_time': array([2.15438807e-03, 9.39931240e-04, 1.12391596e-07, 2.24783192e-07,
        4.05233662e-07, 4.70078102e-04]),
 'param_C': masked_array(data=[10, 10, 30, 30, 50, 50],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 30, 'kernel': 'linear'},
  {'C': 30, 'kernel': 'rbf'},
  {'C': 50, 

In [30]:
score_grid_search_cv = pd.DataFrame(clf.cv_results_)
score_grid_search_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006648,0.003760454,0.003989,0.002154388,10,linear,"{'C': 10, 'kernel': 'linear'}",0.971429,0.942857,0.942857,0.952381,0.0134687,2
1,0.003325,0.0009402119,0.002327,0.0009399312,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.971429,1.0,0.942857,0.971429,0.02332847,1
2,0.001994,1.123916e-07,0.000997,1.123916e-07,30,linear,"{'C': 30, 'kernel': 'linear'}",0.942857,0.914286,0.942857,0.933333,0.0134687,5
3,0.001662,0.0004700779,0.000997,2.247832e-07,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.971429,0.942857,0.914286,0.942857,0.02332847,3
4,0.000997,4.052337e-07,0.000997,4.052337e-07,50,linear,"{'C': 50, 'kernel': 'linear'}",0.942857,0.942857,0.942857,0.942857,1.110223e-16,3


In [31]:
score_grid_search_cv = score_grid_search_cv[['param_kernel','param_C','mean_test_score']]
score_grid_search_cv

Unnamed: 0,param_kernel,param_C,mean_test_score
0,linear,10,0.952381
1,rbf,10,0.971429
2,linear,30,0.933333
3,rbf,30,0.942857
4,linear,50,0.942857
5,rbf,50,0.933333


In [32]:
dir(clf) # to show properties 

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [33]:
clf.best_params_

{'C': 10, 'kernel': 'rbf'}

In [34]:
clf.best_score_

0.9714285714285715

**To over come this there is sklearn's RandomizedSearchCv ... same as GridSearch lakin is mai n_iter - > kh kitni combination 1 run mai show or execute** 

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [36]:
clf = RandomizedSearchCV(
    SVC(gamma="auto"),
    {    
    "kernel":["linear","rbf"],
    "C":[10,30,50]
    },
    cv=3,
    return_train_score=False,
    n_iter=3
)

In [37]:
clf.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=SVC(gamma='auto'), n_iter=3,
                   param_distributions={'C': [10, 30, 50],
                                        'kernel': ['linear', 'rbf']})

In [38]:
score_randomized_search_cv = pd.DataFrame(clf.cv_results_)
score_randomized_search_cv[['param_kernel','param_C','mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,rbf,50,0.933333
1,linear,50,0.942857
2,rbf,10,0.971429


# Model Tunning 

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

Defing the prameters grid

In [40]:
model_param = { # dictionary
    "logistic_reg":{
        "model":LogisticRegression(max_iter=200), # max_iter -> to avoid warnings 
        "params":{
            "C":[10,15,20]
        }
    },
    "svc":{
        "model":SVC(gamma="auto"),
        "params":{
            "kernel":["linear","rbf"],
            "C":[10,15,20]
        }
    },
    "random_forest":{
        "model":RandomForestClassifier(),
        "params":{
            "n_estimators":[10,20,30]
        }
    }
}

In [41]:
model_param

{'logistic_reg': {'model': LogisticRegression(max_iter=200),
  'params': {'C': [10, 15, 20]}},
 'svc': {'model': SVC(gamma='auto'),
  'params': {'kernel': ['linear', 'rbf'], 'C': [10, 15, 20]}},
 'random_forest': {'model': RandomForestClassifier(),
  'params': {'n_estimators': [10, 20, 30]}}}

In [42]:
scores = []

for model_n,model_p in model_param.items() :
    clf =GridSearchCV(model_p["model"],
                     model_p["params"],
                     cv=3,
                     return_train_score=False
                     )
    clf.fit(X_train,y_train)
    scores.append({ # list mai append hota aur dic mai hum direct add krte 
        "model":model_n,
        "best_score":clf.best_score_,
        "best_params":clf.best_params_
    })

In [43]:
scores

[{'model': 'logistic_reg',
  'best_score': 0.9619047619047619,
  'best_params': {'C': 10}},
 {'model': 'svc',
  'best_score': 0.9714285714285715,
  'best_params': {'C': 10, 'kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': 0.9523809523809524,
  'best_params': {'n_estimators': 20}}]

In [44]:
df_scores = pd.DataFrame(scores)
df_scores

Unnamed: 0,model,best_score,best_params
0,logistic_reg,0.961905,{'C': 10}
1,svc,0.971429,"{'C': 10, 'kernel': 'rbf'}"
2,random_forest,0.952381,{'n_estimators': 20}


**Now do the hyper parameter tunning and the model tunning for the sklearn.datasets ( load_digits )**