<h2 align='center' style='color:purple'>Problem-Set2 Part 1</h2>

**For the Diabetes dataset in the scikit-learn library, we are going to find the best model and the best hyperparameters using GridSearchCV and RandomizedSearchCV.**

In [1]:
from sklearn import svm, datasets
# Load scikit-learn's bundled diabetes dataset. The returned object exposes
# (among others) `data`, `target`, `feature_names` and `DESCR`.
diabetes = datasets.load_diabetes()

In [2]:
# List the fields available on the loaded dataset object.
diabetes.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [3]:
# Display the whole dataset object (feature matrix, targets and metadata).
# NOTE(review): this produces a very long output; `diabetes.data.shape` or
# `print(diabetes.DESCR)` would give a more compact overview.
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]], shape=(442, 10)),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142.

#### Features (10 total):

* Age
* Sex
* Body Mass Index (BMI)
* Average Blood Pressure
* Six blood serum measurements (columns `s1`–`s6`, e.g., cholesterol levels)

#### Target:

A continuous value indicating the progression of diabetes after one year.

In [4]:
import pandas as pd
# Assemble the features and the target into a single table: one column per
# feature name, followed by a trailing `target` column.
df = pd.DataFrame(
    data=diabetes.data,
    columns=diabetes.feature_names,
).assign(target=diabetes.target)

# Peek at three rows (positions 47, 48 and 49).
df.iloc[47:50]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
47,-0.078165,-0.044642,-0.07303,-0.057313,-0.084126,-0.074277,-0.024993,-0.039493,-0.018114,-0.08392,142.0
48,0.067136,0.05068,-0.041774,0.011544,0.002559,0.005889,0.041277,-0.039493,-0.059471,-0.021788,75.0
49,-0.04184,0.05068,0.014272,-0.00567,-0.012577,0.006202,-0.072854,0.07121,0.035459,-0.013504,142.0


<h3 style='color:blue'> train_test_split </h3>

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
    random_state=0,
)

In [6]:

# Fit a linear regression on the training split (`fit` returns the estimator
# itself), then predict the held-out samples.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
# Evaluate the linear model on the held-out split.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the value is labelled accordingly.
score = model.score(X_test, y_test)
print(f"R^2 score: {score}")

Mean Squared Error: 3424.259334298692
Accuracy: 0.3322332173106184


In [8]:
from sklearn.ensemble import RandomForestRegressor
# Train a 100-tree random forest with a fixed seed on the training split,
# then predict the held-out samples.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)


In [9]:
# Evaluate the random forest on the held-out split.
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Mean Squared Error: {mse_rf}")

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the value is labelled accordingly.
score_rf = rf.score(X_test, y_test)
print(f"R^2 score: {score_rf}")

Mean Squared Error: 3750.300122471911
Accuracy: 0.26865181564422547


<h3 style='color:blue'>Use GridSearchCV</h3>

In [13]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Search configuration: each entry maps a display name to the estimator to
# tune ('model') and the candidate values for its hyper-parameters ('params').
model_params = {
    'linear_regression': {
        'model': LinearRegression(),
        # NOTE(review): `tol` is only accepted by recent scikit-learn releases
        # and is documented as having no effect on dense input; confirm that
        # this grid actually varies the fitted model.
        'params': {
            'tol': [1e-06, 1e-08, 1e-10],
        },
    },
    'random_forest': {
        # Fixed seed so that forests built for different candidates are comparable.
        'model': RandomForestRegressor(random_state=0),
        'params': {
            'min_samples_split': [2, 4, 6],
            'min_samples_leaf': [1, 2, 3],
            'n_estimators': [100, 500, 1000],
        },
    },
}

In [14]:
# Exhaustively evaluate every hyper-parameter combination for each candidate
# model with 5-fold cross-validation on the full dataset, keeping the best
# mean CV score and the parameters that produced it.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(diabetes.data, diabetes.target)

    best_result = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best_result)

# NOTE: this rebinds `df`, which earlier in the notebook held the feature table.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.482316,{'tol': 1e-06}
1,random_forest,0.436142,"{'min_samples_leaf': 3, 'min_samples_split': 2..."


<h3 style='color:blue'>Use RandomizedSearchCV</h3>

In [15]:
# Randomized search: instead of trying every combination, sample `n_iter`
# candidates per model and evaluate each with 5-fold cross-validation on the
# full dataset, keeping the best mean CV score and its parameters.
scores = []

for model_name, mp in model_params.items():
    rs = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
        n_iter=2,
        # Seed the candidate sampling so the reported best score and params
        # are reproducible across runs (matches the seed used elsewhere in
        # this notebook).
        random_state=0,
    )
    rs.fit(diabetes.data, diabetes.target)
    scores.append({
        'model': model_name,
        'best_score': rs.best_score_,
        'best_params': rs.best_params_
    })

df1 = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.482316,{'tol': 1e-06}
1,random_forest,0.429375,"{'n_estimators': 500, 'min_samples_split': 2, ..."
