In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.datasets import load_iris


In [37]:
iris= load_iris()

In [38]:
# split data into x and y
# all x values of iris data :: Sepal length, sepal width, petal l, petal w

x= iris.data
y= iris.target

In [39]:
#train test split
from sklearn.cross_validation import train_test_split

In [40]:
xtrain, xtest, ytrain , ytest= train_test_split(x, y, random_state=123)

In [41]:
#implementing decision tree
from sklearn.tree import DecisionTreeClassifier

In [42]:
tree= DecisionTreeClassifier()
model_tree= tree.fit(xtrain, ytrain)

In [43]:
# make predictions and find the accuracy
pred_tree=tree.predict(xtest)

print(metrics.accuracy_score(ytest, pred_tree))

0.9473684210526315


In [44]:
#implementing KNN

from sklearn.neighbors import KNeighborsClassifier


In [45]:
knn= KNeighborsClassifier()

In [46]:
model_knn= knn.fit(xtrain, ytrain)

In [47]:
pred_knn= knn.predict(xtest)

In [48]:
print(metrics.accuracy_score(ytest, pred_knn))

0.9736842105263158


In [49]:
# cross validation
from sklearn.cross_validation import cross_val_score

In [50]:
# 10 fold cv 
scores_tree= cross_val_score(tree, x, y, scoring= 'accuracy', cv=10)

In [51]:
scores_tree

array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [52]:
print(np.mean(scores_tree))

0.96


In [53]:
scores_knn= cross_val_score(knn, x, y, scoring= 'accuracy', cv=10)

In [54]:
scores_knn

array([1.        , 0.93333333, 1.        , 1.        , 0.86666667,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [55]:
print(np.mean(scores_knn))

0.9666666666666668


In [56]:
# logistic regression
from sklearn.linear_model import LogisticRegression

In [57]:
lr= LogisticRegression()

In [58]:
lr_model= lr.fit(xtrain, ytrain)

In [59]:
pred_lr= lr.predict(xtest)

In [60]:
print(metrics.accuracy_score(ytest, pred_lr))

0.9736842105263158


In [61]:
scores_lr= cross_val_score(lr, x, y, scoring= 'accuracy', cv=10)
scores_lr.mean()

0.9533333333333334

In [62]:
# random forest

from sklearn.ensemble import RandomForestClassifier

In [63]:
rf= RandomForestClassifier()

In [64]:
rf_model= rf.fit(xtrain, ytrain)

In [65]:
pred_rf= rf.predict(xtest)

In [66]:
print(metrics.accuracy_score(ytest, pred_rf))

0.9473684210526315


In [67]:
scores_rf= cross_val_score(rf, x, y, scoring= 'accuracy', cv=10)
scores_rf.mean()

0.9466666666666667

In [68]:
Table = {'Models': ['Decision Tree', 'Random Forest', 'K Nearest Neighbour', 'Logistic Regression'], 'Accuracy': [0.96, 0.96,0.9666668,0.9533333333333334
]}
df = pd.DataFrame(data=Table)
df

Unnamed: 0,Models,Accuracy
0,Decision Tree,0.96
1,Random Forest,0.96
2,K Nearest Neighbour,0.966667
3,Logistic Regression,0.953333


In [115]:
# parameter tuning using grid search
#parameter grid

rf_parameters={
    'n_estimators': [200,500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [4,5,6,7,8],
    'max_features': ['auto', 'sqrt', 'log2', None] ,
    'bootstrap': [True, False]
}

In [116]:
from sklearn.model_selection import GridSearchCV

In [117]:
grid_srch= GridSearchCV(estimator=rf, param_grid=rf_parameters, cv=5)

In [118]:
grid_srch.fit(xtrain,ytrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 500], 'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8], 'max_features': ['auto', 'sqrt', 'log2', None], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [119]:
grid_srch.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 4,
 'max_features': None,
 'n_estimators': 500}

In [120]:
#parameter tuning for decision tree

dtree_parameters={
    
    'criterion': ['gini', 'entropy'],
    'max_depth': [2,3,4,5,6],
    'max_features': ['auto', 'sqrt', 'log2'] ,
    'splitter': ['best', 'random'],
    'min_samples_split': [10,20,30,50],
    'random_state':[123]
    
}

In [121]:
grid_srch1= GridSearchCV(estimator=tree, param_grid=dtree_parameters, cv=5)

In [122]:
grid_srch1.fit(xtrain,ytrain)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 6], 'max_features': ['auto', 'sqrt', 'log2'], 'splitter': ['best', 'random'], 'min_samples_split': [10, 20, 30, 50], 'random_state': [123]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [123]:
grid_srch1.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'min_samples_split': 30,
 'random_state': 123,
 'splitter': 'best'}

## MNIST Dataset

In [4]:
M= pd.read_csv('MNIST.csv')

NameError: name 'pd' is not defined

In [12]:
M.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
x1 = M.drop('label', axis=1)

In [14]:
y1=M['label']

In [17]:
# lets apply decision tree
x1train, x1test, y1train, y1test= train_test_split(x1,y1, test_size=0.30, random_state=123)

In [18]:
m_tree= DecisionTreeClassifier()

In [19]:
mtree= m_tree.fit(x1train, y1train)

In [20]:
mtree_pred= m_tree.predict(x1test)

In [21]:
print(metrics.accuracy_score(y1test, mtree_pred)) # bqse line prediction

0.8532539682539683


In [22]:
# random forest
m_rf= RandomForestClassifier()

In [23]:
model_mrf= m_rf.fit(x1train, y1train)

In [24]:
rfm_pred= m_rf.predict(x1test)
print(metrics.accuracy_score(y1test, rfm_pred)) 

0.9364285714285714


In [25]:
# apply ensemble voting classifier
from sklearn.ensemble import VotingClassifier


In [30]:
voting_model= VotingClassifier([('tree', mtree), ("Rf", m_rf)], voting= 'hard')

In [32]:
voting_model.fit(x1train, y1train)

VotingClassifier(estimators=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_le...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [33]:
voting_predict= voting_model.predict(x1test)

In [34]:
print(metrics.accuracy_score(y1test, voting_predict)) 

0.8942063492063492


In [124]:
grid_srch1.fit(x1train,y1train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 6], 'max_features': ['auto', 'sqrt', 'log2'], 'splitter': ['best', 'random'], 'min_samples_split': [10, 20, 30, 50], 'random_state': [123]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [75]:
## Bagging Classifier

from sklearn.ensemble import BaggingClassifier

In [84]:
bagg_predict= bagg_model.predict(x1test)

In [85]:
print(metrics.accuracy_score(y1test,bagg_predict)) 

0.937936507936508


In [86]:
#averaging the scores

mdtree_prob= m_tree.predict_proba(x1test)

In [87]:
m_rf_prob= m_rf.predict_proba(x1test)

In [89]:
scores_prob=(mdtree_prob+m_rf_prob)/2  # accuracy level

In [97]:
scores_prob

array([[0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.9 ],
       [0.25, 0.  , 0.1 , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.05, 0.05]])

In [2]:
#boosting
from sklearn.ensemble import AdaBoostClassifier

In [3]:
boost= AdaBoostClassifier(base_estimator=m_tree, n_estimators=5, learning_rate=1)

NameError: name 'm_tree' is not defined

In [110]:
boost_model= boost.fit(x1train, y1train)

In [111]:
boost_predict= boost_model.predict(x1test)

In [None]:
print(metrics.accuracy_score(y1test,boost_predict)) # this means we got to tune our decision tree so that accuracy increases

## Car Seats Data

In [130]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


In [126]:
car_seats= pd.read_csv("carseats.csv")

In [127]:
car_seats.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLocBad,ShelveLocGood,ShelveLocMedium,Age,Education,UrbanNo,UrbanYes,USNo,USYes
0,9.5,138,73,11,276,120,1,0,0,42,17,0,1,0,1
1,11.22,111,48,16,260,83,0,1,0,65,10,0,1,0,1
2,10.06,113,35,10,269,80,0,0,1,59,12,0,1,0,1
3,7.4,117,100,4,466,97,0,0,1,55,14,0,1,0,1
4,4.15,141,64,3,340,128,1,0,0,38,13,0,1,1,0


In [160]:
car_seats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 15 columns):
Sales              400 non-null float64
CompPrice          400 non-null int64
Income             400 non-null int64
Advertising        400 non-null int64
Population         400 non-null int64
Price              400 non-null int64
ShelveLocBad       400 non-null int64
ShelveLocGood      400 non-null int64
ShelveLocMedium    400 non-null int64
Age                400 non-null int64
Education          400 non-null int64
UrbanNo            400 non-null int64
UrbanYes           400 non-null int64
USNo               400 non-null int64
USYes              400 non-null int64
dtypes: float64(1), int64(14)
memory usage: 47.0 KB


In [129]:
x= car_seats.drop("Sales", axis=1)
y= car_seats["Sales"]

In [162]:
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLocBad,ShelveLocGood,ShelveLocMedium,Age,Education,UrbanNo,UrbanYes,USNo,USYes
0,138,73,11,276,120,1,0,0,42,17,0,1,0,1
1,111,48,16,260,83,0,1,0,65,10,0,1,0,1
2,113,35,10,269,80,0,0,1,59,12,0,1,0,1
3,117,100,4,466,97,0,0,1,55,14,0,1,0,1
4,141,64,3,340,128,1,0,0,38,13,0,1,1,0


In [163]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [164]:
L = []
M = []
for i in range(1,11):
    
    df_tree =DecisionTreeRegressor(random_state=1,max_depth=i)
    df_tree = df_tree.fit(x_train,y_train)
    pred = df_tree.predict(x_test)
    RMSE = np.sqrt(np.mean((y_test-pred)**2))
    MAPE = np.mean(np.absolute((y_test-pred)/pred))
    L.append(RMSE)
    M.append(MAPE)

In [165]:
L

[2.3634544771379717,
 2.243026424920366,
 2.2226132333223503,
 2.249716890727332,
 2.279518514927684,
 2.31827849982753,
 2.3810354518680414,
 2.436202603112295,
 2.4595669868355854,
 2.400381044165717]

In [166]:
M

[0.26009627662050894,
 0.2555719894293263,
 0.25477244747817995,
 0.2719330568175589,
 0.2882387440034535,
 0.30342477662915635,
 0.3035580110579158,
 0.3061197537347961,
 0.31381710999208223,
 0.31362257632176804]

In [169]:
min_rmse = min(L)
max_dep = L.index(min_rmse)+1
print("Minimum Depth : ",max_dep)

Minimum Depth :  3


In [170]:
    df_tree =DecisionTreeRegressor(random_state=1,max_depth=3)
    df_tree = df_tree.fit(x_train,y_train)
    pred = df_tree.predict(x_test)
    RMSE = np.sqrt(np.mean((y_test-pred)**2))
    MAPE = np.mean(np.absolute((y_test-pred)/pred))

In [171]:
RMSE, MAPE

(2.2226132333223503, 0.25477244747817995)

In [172]:
rf_test_rmse = []
rf_test_mape = []
for i in range(1,9):
    rfc = RandomForestRegressor(n_estimators=500, max_features=i)
    rfc.fit(x_train,y_train)
    pred = rfc.predict(x_test)
    RMSE = np.sqrt(np.mean((y_test-pred)**2))
    MAPE = np.mean(np.absolute((y_test-pred)/pred))
    rf_test_rmse.append(RMSE)
    rf_test_mape.append(MAPE)

In [173]:
min_rmse_rf = min(rf_test_rmse)
max_dep = rf_test_rmse.index(min_rmse_rf) + 1
print("Minimum Depth : ",max_dep)

Minimum Depth :  8


In [174]:
rfc = RandomForestRegressor(max_features=7)
rfc.fit(x_train,y_train)
pred1 = rfc.predict(x_test)
RMSE1 = np.sqrt(np.mean((y_test-pred)**2))
MAPE1 = np.mean(np.absolute((y_test-pred)/pred))

In [175]:
RMSE1, MAPE1

(1.6986900874154007, 0.19483007839507585)

In [178]:
from sklearn.ensemble import BaggingRegressor

bagg= BaggingRegressor(base_estimator=df_tree,n_estimators=20, bootstrap=True, oob_score=True)
bagg

BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=20, n_jobs=1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [179]:
bagg_model= bagg.fit(x_train, y_train)

In [182]:
bagg_predict= bagg_model.predict(x_test)

In [184]:
RMSE_bagg = np.sqrt(np.mean((y_test-bagg_predict)**2))
MAPE_bagg = np.mean(np.absolute((y_test-bagg_predict)/bagg_predict))

In [185]:
RMSE_bagg, MAPE_bagg

(2.018365310099587, 0.22681440718775192)

In [1]:
from sklearn.ensemble import AdaBoostRegressor


In [187]:
boost= AdaBoostRegressor(base_estimator=df_tree, n_estimators=20, learning_rate=1)

In [188]:
boost_model= boost.fit(x_train, y_train)

In [189]:
boost_predict= boost_model.predict(x_test)

In [190]:
RMSE_boost = np.sqrt(np.mean((y_test-boost_predict)**2))
MAPE_boost = np.mean(np.absolute((y_test-boost_predict)/boost_predict))

In [191]:
RMSE_boost, MAPE_boost

(1.8473313206346218, 0.20582313325641188)