<a href="https://colab.research.google.com/github/CPTR295/ML1/blob/main/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
X,y=make_moons(n_samples=500,noise=.30,random_state=42)

In [2]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [3]:
log_clf=LogisticRegression()
rnd_clf = RandomForestClassifier()
svc_clf=SVC()

v_clf=VotingClassifier( #Hard looks for majority
    estimators=[('lr',log_clf),('rf',rnd_clf),('sf',svc_clf)],
    voting='hard',
    verbose=False
)
v_clf.fit(X_train,y_train)

In [4]:
from sklearn.metrics import accuracy_score

for clf in (log_clf,rnd_clf,svc_clf,v_clf):
  clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.85
RandomForestClassifier 0.89
SVC 0.87
VotingClassifier 0.87


In [6]:
svc_clf=SVC(probability=True) #We need proba in soft voting as it attaches more weight to high confidence votes
v2_clf = VotingClassifier(
    estimators=[('rf',rnd_clf),('sf',svc_clf),('lr',log_clf)],
    voting='soft'
)
v2_clf.fit(X_train,y_train)

In [7]:
for clf in (log_clf,rnd_clf,svc_clf,v2_clf):
  clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.85
RandomForestClassifier 0.88
SVC 0.87
VotingClassifier 0.88


In [8]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,max_samples=100,bootstrap=True,n_jobs=-1 #Bootstrap to select bagging(With replacement) or pasting. njons to select no of cpu core
)
bag_clf.fit(X_train,y_train)


In [9]:
y_pred=bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.9

In [10]:
d_clf=DecisionTreeClassifier()
d_clf.fit(X_train,y_train)

y_pred=d_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.84

In [11]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,
                            bootstrap=True,oob_score=True,n_jobs=-1,random_state=42) #in bagging intances which are not sampled oob = Out-Of_bag Evaluation
bag_clf.fit(X_train,y_train)

In [12]:
bag_clf.oob_score_

0.91

In [13]:
y_pred=bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.88

In [14]:
rnd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(X_train,y_train)
y_pred=rnd_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.89

In [15]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter='random',max_leaf_nodes=16),n_estimators=500,max_samples=1.0,bootstrap=True,n_jobs=-1) #Will get same model as above

In [16]:
bag_clf.fit(X_train,y_train)
y_pred=bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.89

In [17]:
rnd_clf.feature_importances_

array([0.4423898, 0.5576102])

In [18]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


In [19]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784',version=1,as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [20]:
X,y=mnist["data"],mnist["target"]

In [22]:
X_train_val,X_test,y_train_val,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
X_train,X_val,y_train,y_val=train_test_split(X_train_val,y_train_val,test_size=0.2,random_state=42)

In [24]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [25]:
ran_clf=RandomForestClassifier(n_estimators=100,random_state=42)
svm_clf=SVC(max_iter=100,tol=20,random_state=42)
extra_clf=ExtraTreesClassifier(n_estimators=100,random_state=42)
mlp_clf=MLPClassifier(random_state=42)

In [26]:
estimators=[ran_clf,extra_clf,svm_clf,mlp_clf]
for estimator in estimators:
  print("Training",estimator)
  estimator.fit(X_train,y_train)

Training RandomForestClassifier(random_state=42)
Training ExtraTreesClassifier(random_state=42)
Training SVC(max_iter=100, random_state=42, tol=20)
Training MLPClassifier(random_state=42)


In [27]:
[estimator.score(X_val,y_val) for estimator in estimators]

[0.96625, 0.9716071428571429, 0.09571428571428571, 0.9594642857142858]

In [50]:
estimator_v_clf =[
    ("Ran Clf",ran_clf),
    ("Extra Clf",extra_clf),
    ("svm_clf",svm_clf),
    ("mlp clf",mlp_clf)
]

In [52]:
v_clf=VotingClassifier(estimator_v_clf)
v_clf.fit(X_train,y_train)

In [53]:
v_clf.score(X_val,y_val)

0.9733928571428572

In [54]:
[estimator.score(X_val, y_val) for estimator in v_clf.estimators_]

[0.0, 0.0, 0.0, 0.0]

In [55]:
v_clf.set_params(svm_clf=None)

In [56]:
v_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 SVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [57]:
v_clf.estimators

[('Ran Clf', RandomForestClassifier(random_state=42)),
 ('Extra Clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp clf', MLPClassifier(random_state=42))]

In [58]:
del v_clf.estimators_[2]

In [59]:
v_clf.estimators

[('Ran Clf', RandomForestClassifier(random_state=42)),
 ('Extra Clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', None),
 ('mlp clf', MLPClassifier(random_state=42))]

In [60]:
v_clf.score(X_val,y_val)

0.9729464285714285

In [61]:
v_clf.voting="soft"
v_clf.score(X_val,y_val)

0.9677678571428572

In [62]:
v_clf.voting="hard"
v_clf.score(X_test,y_test)

0.9696428571428571

In [65]:
[estimator.score(X_test, y_test) for estimator in v_clf.estimators_]
#When a VotingClassifier is fitted, it processes the target labels (y_train) in a way that is optimized for its internal operations, such as ensuring consistency across different base estimators.
#This transformed label format is what each base estimator in v_clf.estimators_ actually learns during the training phase.

[0.0, 0.0, 0.0]

In [66]:
individual_scores = []
for estimator in v_clf.estimators_:
    y_pred = estimator.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    individual_scores.append(score)

print(individual_scores)

[0.0, 0.0, 0.0]


In [67]:
y_pred=v_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9696428571428571