In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.naive_bayes import GaussianNB


In [8]:
housing = pd.read_csv("/home/darkstar/Documents/pg-dbda/module7_statistics/Daywise Study Material/datasets/Housing.csv")
dumm_housing = pd.get_dummies(housing, drop_first=True)

In [27]:
X = dumm_housing.drop(['price'], axis=1)
y = dumm_housing['price']

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    random_state=23,
                                                    test_size=0.3)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(r2_score(y_test, y_pred))


scores = []
depth_values = [1,2,3,4,5,6,7,8,9,10]
for depth in depth_values:
    dtr = DecisionTreeRegressor(max_depth=5,random_state=23)
    dtr.fit(X_train, y_train)
    ycap = dtr.predict(X_test)
    scores.append(r2_score(y_test, ycap))

i_max = np.argmax(scores)
print("Best Depth =", depth_values[i_max])
print("Best Score =",scores[i_max])

0.6543071090954233
Best Depth = 1
Best Score = 0.41182065789649225


### Linear Regression using KFold

In [28]:
lr = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
results = cross_val_score(lr, X, y, cv=kfold)

print(results.mean())

0.6494120332827323


### Tree using KFold

In [36]:
dtr = DecisionTreeRegressor(max_depth=5, random_state=23)
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
results = cross_val_score(dtr, X, y, cv = kfold)
print(results.mean())

0.3849113926682245


In [40]:
scores=[]
depth_values = [1,2,3,4,5,6,7,8,9,10]
for i in depth_values:
    dtr = DecisionTreeRegressor(max_depth=i, random_state=23)
    results = cross_val_score(dtr, X, y, cv=kfold)
    scores.append(results.mean())
    
i_max = np.argmax(scores)
print("Best Depth: ", depth_values[i_max])
print("Best scores: ", scores[i_max])

Best Depth:  3
Best scores:  0.4118512734223138


### Grid Search CV

In [44]:
from sklearn.model_selection import GridSearchCV
depth_values = [1,2,3,4,5,6,7,8,9,10]
dtr = DecisionTreeRegressor(random_state=23)
params = {'max_depth': depth_values}
print(dtr.get_params())

gcv = GridSearchCV(dtr, param_grid=params, cv=kfold, scoring='neg_mean_squared_error')
gcv.fit(X,y)
print(gcv.best_params_)
print(gcv.best_score_)

{'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 23, 'splitter': 'best'}
{'max_depth': 6}
-403237987.08837956


### using kFold and GridSearchCV for Bankruptcy Datasets

In [46]:
bankruptcy = pd.read_csv("/home/darkstar/Documents/pg-dbda/module7_statistics/Daywise Study Material/cases/Bankruptcy/Bankruptcy.csv")
bankruptcy

Unnamed: 0,NO,D,YR,R1,R2,R3,R4,R5,R6,R7,...,R15,R16,R17,R18,R19,R20,R21,R22,R23,R24
0,1,0,78,0.23,0.08,0.02,0.03,0.46,0.12,0.19,...,0.05,0.57,0.15,0.23,3.56,0.26,1.55,0.43,0.11,0.17
1,2,0,77,0.19,0.07,0.09,0.12,0.02,0.02,0.03,...,0.09,0.12,0.16,0.22,3.78,1.29,1.40,0.06,0.07,0.10
2,3,0,72,0.07,0.02,0.03,0.05,0.06,0.10,0.14,...,-0.03,0.02,0.02,0.04,13.29,1.61,1.43,0.03,0.05,0.07
3,4,0,80,0.07,0.03,0.04,0.04,0.04,0.06,0.06,...,-0.02,0.01,0.02,0.02,5.36,1.30,1.12,-0.06,-0.08,-0.09
4,5,0,81,0.09,0.02,0.03,0.04,0.06,0.08,0.11,...,0.02,0.07,0.10,0.14,7.74,1.48,1.41,0.03,0.04,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,128,1,77,0.27,0.03,0.07,0.10,0.09,0.18,0.26,...,0.11,0.06,0.12,0.17,7.56,2.07,1.45,0.06,0.13,0.19
128,129,1,77,0.32,0.03,0.03,0.09,0.05,0.06,0.16,...,0.17,0.07,0.09,0.26,5.99,1.27,2.74,0.06,0.08,0.21
129,130,1,78,0.08,0.01,0.02,0.05,0.04,0.07,0.15,...,0.19,0.07,0.12,0.26,7.14,1.89,2.10,0.07,0.12,0.26
130,131,1,78,0.14,0.01,0.05,0.07,0.02,0.09,0.14,...,0.07,0.02,0.10,0.15,170.96,4.55,1.45,0.02,0.10,0.14


In [48]:
X = bankruptcy.drop(['NO', 'D', 'YR'], axis = 1)
y = bankruptcy['D']

In [52]:
kfold = KFold(n_splits=5, shuffle=True, random_state=23)
depth_values=[1,2,3,4,5,6,7,8,9,10]
dtc = DecisionTreeClassifier(random_state=23)
params = {'max_depth': depth_values}
gcv = GridSearchCV(dtc, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

{'max_depth': 1}
-0.6021577712001535


In [55]:
nb = GaussianNB()
results = cross_val_score(nb, X, y,cv=kfold, 
                          scoring='neg_log_loss')
print(results.mean())

-2.9486181387930124
