In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone
from scipy.stats import mode

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC , LinearSVC



import warnings
warnings.filterwarnings("ignore")


In [2]:
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
data = pd.DataFrame(data=X, columns=['point_1', 'point_2'])
data['target'] = y
print(data.head())

    point_1   point_2  target
0  0.940291  0.122306       1
1  0.124540 -0.424775       0
2  0.261988  0.508414       0
3 -0.495238  0.072589       0
4 -0.879413  0.549373       0


In [3]:
def split_data(feature , dependent, test_size = 0.3,  random_state = 20):
    X_train, X_test, Y_train, Y_test = train_test_split(
                                    feature,  dependent,  test_size = test_size,  random_state = random_state)
    print(f"Train and test data shapes {X_train.shape} , {X_test.shape}" , end = "\n\n")
    return X_train, X_test, Y_train, Y_test


X_train, X_test, y_train, y_test = split_data(data.drop(['target'] , axis = 1) , data[['target']])

Train and test data shapes (7000, 2) , (3000, 2)



In [4]:

def tune_model(classifier , param_grid):
    grid_search = GridSearchCV(classifier,param_grid=param_grid)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_ , grid_search.best_params_

param_grid = {
    'criterion': ['gini', 'entropy'],      
    'splitter': ['best', 'random'],         
    'max_depth': [None, 10, 20, 30],           
    'min_samples_split': [2, 5, 10],           
    'min_samples_leaf': [1, 2, 4]             
}


In [5]:
decision_tree_tuned  , decision_tree_best_param= tune_model(DecisionTreeClassifier(random_state=42) , param_grid )


In [6]:
print(f"Decision tree best Params  : {decision_tree_best_param}")

Decision tree best Params  : {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}


In [8]:
def evaluate_classification(model, X_train = X_train, X_test = X_test, Y_train =y_train , Y_test = y_test , is_fitted =True):
    if is_fitted:
        model.fit(X_train,Y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)
    return accuracy , predictions

accuracy , _ = evaluate_classification(decision_tree_tuned , X_train, X_test, y_train, y_test)

print(f"The Accuracy of Decision Tree with parameters {decision_tree_best_param} is {accuracy*100}")

The Accuracy of Decision Tree with parameters {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'} is 86.7


In [9]:
num_of_trees = 1000
num_of_instances = 100

def shuffle_data(trees , instances):
    
    subsets = []
    shuffle_split_object = ShuffleSplit(n_splits=trees, random_state=42)

    for index, _ in shuffle_split_object.split(X_train):
        subsets.append((X_train.iloc[index], y_train.iloc[index]))
    
    return subsets

subsets = shuffle_data(num_of_trees , num_of_instances )

In [10]:
decision_tree_tuned = DecisionTreeClassifier(**decision_tree_best_param)
trees_forest = [clone(decision_tree_tuned) for _ in range(num_of_trees)]

In [11]:

def compute_accuracy(forest , subset):
    scores = []
    for tree, (x,y) in zip(forest , subset):
        scores.append(evaluate_classification(tree , X_train  = x , Y_train = y))
    return scores

accuracy_scores =  compute_accuracy(trees_forest , subsets)

In [12]:
def compute_majority_votes():
    
    sample_prediction_matrix = np.zeros((num_of_trees, len(X_test)))
    for tree_count, model_tree in enumerate(trees_forest):
        sample_prediction_matrix[tree_count] = model_tree.predict(X_test)

    predicted_majority_votes, _ = mode(sample_prediction_matrix, axis=0)

    majority_votes_accuracy = accuracy_score(y_test, predicted_majority_votes.reshape([-1]))
    
    return majority_votes_accuracy

print(f"The Accuracy of Decision Tree with majority vote is {compute_majority_votes()*100}")



The Accuracy of Decision Tree with majority vote is 86.96666666666667


In [13]:
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"].values.astype(int), mnist["target"].values.astype(int)

In [14]:


X_train, X_test, y_train, y_test = split_data(X  ,y , test_size=10000)

X_train, X_val, y_train, y_val = split_data(X_train  ,y_train , test_size=10000)

Train and test data shapes (60000, 784) , (10000, 784)

Train and test data shapes (50000, 784) , (10000, 784)



In [16]:
def initilize_classification_models():
    extra_tree_classifier = ExtraTreesClassifier( n_estimators=100,random_state=42)
    random_forest_classifier = RandomForestClassifier(n_estimators=100,random_state=42)
    svc_classifier = LinearSVC(dual=False)


    return extra_tree_classifier , random_forest_classifier , svc_classifier

extra_tree_classifier , random_forest_classifier , svc_classifier = initilize_classification_models()

In [17]:

extra_tree_accuracy , extra_tree_predictions = evaluate_classification(extra_tree_classifier , X_train, X_val, y_train, y_val)
print(f"The Accuracy of extra_tree_classifier is {extra_tree_accuracy*100}")

fitting done
The Accuracy of extra_tree_classifier is 97.21


In [18]:

random_forest_accuracy , random_forest_predictions = evaluate_classification(random_forest_classifier , X_train, X_val, y_train, y_val)
print(f"The Accuracy of random_forest_classifier is {random_forest_accuracy*100}")

fitting done
The Accuracy of random_forest_classifier is 96.74000000000001


In [None]:
svm_clf = SVC(probability=True, random_state=42)
svm_clf.fit(X_train, y_train)
svm_pred = svm_clf.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)
svm_acc

In [None]:
svc_accuracy , svc_predictions = evaluate_classification(svc_classifier , X_train, X_val, y_train, y_val)
print(f"The Accuracy of svc_classifier is {accuracy*100}")

In [None]:

voting_classifier = VotingClassifier(
    estimators=[('random_forest_classifier', random_forest_classifier),
                ('extra_tree_classifier', extra_tree_classifier), ('svm', svc_classifier)],
    voting='hard'
)

voting_classifier.fit(X_train, y_train)

voting_classifier_accuracy_val , voting_classifier_val_predictions = evaluate_classification(voting_classifier ,
                                                                             X_train, X_val, y_train, y_val , True)

print("Ensemble Accuracy on Validation Set:", voting_classifier_accuracy_val)


voting_classifier_accuracy_test , voting_classifier_test_predictions = evaluate_classification(voting_classifier ,
                                                         X_train, X_test, y_train, y_test , True)
print()
print("Ensemble Accuracy on Test Set:", voting_classifier_accuracy_test)


In [None]:

random_forest_classifier_test_prediction = random_forest_classifier.predict(X_test)
extra_tree_classifier_test_prediction = extra_tree_classifier.predict(X_test)
svm_test_prediction = svm_clf.predict(X_test)


In [None]:

blender_train_data = np.column_stack((extra_tree_predictions, random_forest_predictions , svm_val_pred))
blender_test_data = np.column_stack((rf_test_pred, et_test_pred , svm_test_pred))


blender = RandomForestClassifier(n_estimators=100, random_state=42)
blender.fit(blender_train_data, y_val)
blender_predictions = blender.predict(blender_test_data)
blender_accuracy = accuracy_score(y_test, blender_predictions)

print("Ensemble Accuracy with Stacking on Test Set:", blender_accuracy)


In [None]:
linear_svm_clf = LinearSVC(dual=False, random_state=42, max_iter=1000, C=1)


In [None]:
linear_svm_clf.fit(X_train , y_train)