In [14]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import glob
import cv2

In [24]:
def nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn):
    nnc = KNeighborsClassifier()
    grid_search = GridSearchCV(nnc, parameters_nn, cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    
    cv_scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=5, scoring='accuracy')
    mean_cv_error = cv_scores.mean()
    print("Cross-Validation Error Rate:", mean_cv_error)
    
    test_accuracy = grid_search.best_estimator_.score(x_test, y_test)
    print("Test accuracy with best parameters:", test_accuracy)
    
    
def decisionTree(x_train, y_train, x_test, y_test, parameters_dt):
    dtc = DecisionTreeClassifier()
    grid_search = GridSearchCV(dtc, parameters_dt, cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    
    cv_scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=5, scoring='accuracy')
    mean_cv_error = cv_scores.mean()
    print("Cross-Validation Error Rate:", mean_cv_error)
        
    test_accuracy = grid_search.best_estimator_.score(x_test, y_test)
    print("Test accuracy with best parameters:", test_accuracy)

    
def randomForest(x_train, y_train, x_test, y_test, parameters_rf):
    rfc = RandomForestClassifier(n_estimators=500, min_samples_leaf=5)
    grid_search = GridSearchCV(rfc, parameters_rf, cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    
    cv_scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=5, scoring='accuracy')
    mean_cv_error = cv_scores.mean()
    print("Cross-Validation Error Rate:", mean_cv_error)
    
    test_accuracy = grid_search.best_estimator_.score(x_test, y_test)
    print("Test accuracy with best parameters:", test_accuracy)

    
def XGBoost(x_train, y_train, x_test, y_test, parameters_xgb):
    xgbc = XGBClassifier()
    grid_search = GridSearchCV(xgbc, parameters_xgb, cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    
    cv_scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=5, scoring='accuracy')
    mean_cv_error = cv_scores.mean()
    print("Cross-Validation Error Rate:", mean_cv_error)
        
    test_accuracy = grid_search.best_estimator_.score(x_test, y_test)
    print("Test accuracy with best parameters:", test_accuracy)

In [3]:
def encode_data(dataframe_series):
    if dataframe_series.dtype=='object':
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series

## Dataset1 Telco customer churn

**Telco Customer Churn Dataset**

A dataset from a fictional telco company with 7043 customers in California during Q3.

**Class Imbalance:** Unbalanced dataset due to churn label disparity
- Observations: 7043
- Variables: 33

In [20]:
# Load the datasets
df_1_telco = pd.read_excel('Telco_customer_churn.xlsx')

In [21]:
# Get the number of columns of each data type
print(f"""For this dataset;
Number of float features: {len(df_1_telco.select_dtypes('float').columns)}
Number of int features: {len(df_1_telco.select_dtypes('int').columns)}
Number of object features: {len(df_1_telco.select_dtypes('object').columns)}
""")  

For this dataset;
Number of float features: 3
Number of int features: 6
Number of object features: 24



In [22]:
data_na = (df_1_telco.isnull().sum() / len(df_1_telco)) 
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'percentage of missing values' :data_na, "number of missing value" : df_1_telco[data_na.index].isna().sum()})
missing_data

Unnamed: 0,percentage of missing values,number of missing value
Churn Reason,0.73463,5174


#### **Preprocessing**

In [23]:
df_1_telco['Total Charges'] = pd.to_numeric(df_1_telco['Total Charges'], errors='coerce')
df_1_telco['Total Charges'] = np.where(df_1_telco['Total Charges'].isnull(), 
                               df_1_telco['Monthly Charges'] * df_1_telco['Tenure Months'], df_1_telco['Total Charges'])

df_1_telco = df_1_telco.drop(['Zip Code','Churn Reason','City','Churn Score',
              'Churn Value','CLTV','CustomerID','Lat Long',
                  'Latitude','Longitude','Count','Country','State'], axis = 1)

df_1_telco['Churn Label'].replace(to_replace='Yes', value=1, inplace=True)
df_1_telco['Churn Label'].replace(to_replace='No',  value=0, inplace=True)

df_1_telco = df_1_telco.apply(lambda x: encode_data(x))

smote = SMOTE(sampling_strategy = 1)
x = df_1_telco.drop("Churn Label", axis = 1)
y = df_1_telco['Churn Label']
# x,y = smote.fit_resample(x,y)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.2)

#### **Modelling**

In [24]:
parameters_nn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Vary the number of neighbors
    'p': [1, 2]
}
nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn)

Best Parameters: {'n_neighbors': 11, 'p': 1}
Cross-Validation Error Rate: 0.7777808072800515
Test accuracy with best parameters: 0.7743080198722498


In [25]:
parameters_dt = {
    'min_samples_leaf': [1, 5, 10, 15, 20],  # Different values to test
    'min_samples_split': [2, 10, 20, 30, 40]  # Set as twice the min_samples_leaf
}
decisionTree(x_train, y_train, x_test, y_test, parameters_dt)

Best Parameters: {'min_samples_leaf': 15, 'min_samples_split': 30}
Cross-Validation Error Rate: 0.7839918298001107
Test accuracy with best parameters: 0.7743080198722498


In [26]:
parameters_rf = {
    'max_features': ['sqrt', 'log2', 0.5, 0.7, None]
}
randomForest(x_train, y_train, x_test, y_test, parameters_rf)

Best Parameters: {'max_features': 'log2'}
Cross-Validation Error Rate: 0.8051148855557357
Test accuracy with best parameters: 0.808374733853797


In [27]:
parameters_xgb = {
    'max_depth': [3, 5, 7, 9, 11],  # Vary the depth of trees
    'n_estimators': [50, 100, 200, 300, 400],  # Vary the number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
XGBoost(x_train, y_train, x_test, y_test, parameters_xgb)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
Cross-Validation Error Rate: 0.8095506547665016
Test accuracy with best parameters: 0.8005677785663591


#### **Evaluation**

- **Consistency between Cross-Validation and Test Error Rates:**
  - Nearest Neighbor and Decision Tree models show relatively similar test accuracies to their cross-validation error rates.
  - Random Forest and Gradient Boosted Trees exhibit a slight variation between their cross-validation and test accuracies.

- **Performance Insights:**
  - Random Forest achieved the highest test accuracy (0.8084) among the models evaluated.
  - Gradient Boosted Trees closely followed with a test accuracy of 0.8006.
  - Nearest Neighbor and Decision Tree models displayed comparatively lower test accuracies but still in a similar range (around 0.7743).

- **Overall Observation:**
  - Random Forest and Gradient Boosted Trees demonstrate better predictive performance compared to Nearest Neighbor and Decision Tree models on this dataset.

## Dataset2 Date Agriculture Datasets

**Date Datasets**

The dataset aims to classify different types of date fruits using computer vision systems (CVS). It comprises 898 images of date fruit types. Through image processing techniques, 34 features including morphological details, shape, and color were extracted from these images. This dataset serves as a multiclass classification task to categorize various types of date fruit.
**Multi-Class Classification**
- Observations: 898
- Variables: 34

In [28]:
# Load the datasets
df_2_date = pd.read_excel('Date_Datasets.xlsx')

In [29]:
# Get the number of columns of each data type
print(f"""For this dataset;
Number of float features: {len(df_2_date.select_dtypes('float').columns)}
Number of int features: {len(df_2_date.select_dtypes('int').columns)}
Number of object features: {len(df_2_date.select_dtypes('object').columns)}
""")  

For this dataset;
Number of float features: 29
Number of int features: 5
Number of object features: 1



#### **Preprocessing**

In [30]:
data_na = (df_2_date.isnull().sum() / len(df_2_date)) 
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'percentage of missing values' :data_na, "number of missing value" : df_2_date[data_na.index].isna().sum()})
missing_data

Unnamed: 0,percentage of missing values,number of missing value


In [31]:
x = df_2_date.drop("Class", axis = 1)
y = df_2_date["Class"]
y = LabelEncoder().fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.2)

scaler = StandardScaler() # scaling
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)
x_train = pd.DataFrame(X_train_scaled)
x_test = pd.DataFrame(X_test_scaled)

#### **Modelling**

In [32]:
parameters_nn = {
    'n_neighbors': [3, 5, 7, 9, 20],  # Vary the number of neighbors
    'p': [1, 2]
}
nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn)

Best Parameters: {'n_neighbors': 7, 'p': 1}
Cross-Validation Error Rate: 0.8788461538461538
Test accuracy with best parameters: 0.9055555555555556


In [33]:
parameters_dt = {
    'min_samples_leaf': [1, 5, 10, 15, 20],  # Different values to test
    'min_samples_split': [2, 10, 20, 30, 40]  # Set as twice the min_samples_leaf
}
decisionTree(x_train, y_train, x_test, y_test, parameters_dt)

Best Parameters: {'min_samples_leaf': 5, 'min_samples_split': 2}
Cross-Validation Error Rate: 0.8273212898212898
Test accuracy with best parameters: 0.8555555555555555


In [34]:
parameters_rf = {
    'max_features': ['sqrt', 'log2', 0.5, 0.7, None]
}
randomForest(x_train, y_train, x_test, y_test, parameters_rf)

Best Parameters: {'max_features': 'sqrt'}
Cross-Validation Error Rate: 0.8733197358197359
Test accuracy with best parameters: 0.9111111111111111


In [35]:
parameters_xgb = {
    'max_depth': [3, 5, 7, 9, 11],  # Vary the depth of trees
    'n_estimators': [50, 100, 200, 300, 400],  # Vary the number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
XGBoost(x_train, y_train, x_test, y_test, parameters_xgb)

Best Parameters: {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 200}
Cross-Validation Error Rate: 0.8969405594405595
Test accuracy with best parameters: 0.9166666666666666


#### **Evaluation**

- **Consistency between Cross-Validation and Test Error Rates:**
  - Nearest Neighbor and Decision Tree models display consistent test accuracies compared to their cross-validation error rates.
  - Random Forest and Gradient Boosted Trees exhibit a slight variation between their cross-validation and test accuracies.

- **Performance Insights:**
  - Gradient Boosted Trees achieved the highest test accuracy (0.9167) among the models evaluated.
  - Random Forest closely followed with a test accuracy of 0.9111.
  - Nearest Neighbor and Decision Tree models displayed comparatively lower test accuracies but with consistent performance (around 0.9056 and 0.8556, respectively).

- **Overall Observation:**
  - Gradient Boosted Trees and Random Forest models demonstrate better predictive performance compared to Nearest Neighbor and Decision Tree models on this dataset.

## Dataset3 Fruits 360 

**Fruits 360 Dataset**

The dataset comprises images of fruits and vegetables, with an image size of 100x100 pixels. At the time of creation (05-2019), it contained 103 different classes of fruits with a total of 53177 images. For this specific analysis, a subset focusing on a multi-class classification task with three classes (Orange, Banana, and Strawberry) is utilized. The training and test sets include specific quantities of images for each class for model training and evaluation.

**Multi-Class Classification with 100 features**
- Observations: 1461
- Variables: 100

In [4]:
dim = 100
def getYourFruits(fruits, data_type, print_n=False, k_fold=False):
    images = []
    labels = []
    val = ['Training', 'Test']
    if not k_fold:
        path = "fruits-360/" + data_type + "/"
        for i,f in enumerate(fruits):
            p = path + f
            j=0
            for image_path in glob.glob(os.path.join(p, "*.jpg")):
                image = cv2.imread(image_path, cv2.IMREAD_COLOR)
                image = cv2.resize(image, (dim, dim))
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                images.append(image)
                labels.append(i)
                j+=1
            if(print_n):
                print("There are " , j , " " , data_type.upper(), " images of " , fruits[i].upper())
        images = np.array(images)
        labels = np.array(labels)
        return images, labels
    else:
        for v in val:
            path = "fruits-360/" + v + "/"
            for i,f in enumerate(fruits):
                p = path + f
                j=0
                for image_path in glob.glob(os.path.join(p, "*.jpg")):
                    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
                    image = cv2.resize(image, (dim, dim))
                    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                    images.append(image)
                    labels.append(i)
                    j+=1
        images = np.array(images)
        labels = np.array(labels)
        return images, labels

In [5]:
fruits = ['Orange', 'Banana' , 'Strawberry']

#Get Images and Labels 
x_train, y_train =  getYourFruits(fruits, 'Training', print_n=True, k_fold=False)
x_test, y_test = getYourFruits(fruits, 'Test', print_n=True, k_fold=False)

There are  479   TRAINING  images of  ORANGE
There are  490   TRAINING  images of  BANANA
There are  492   TRAINING  images of  STRAWBERRY
There are  160   TEST  images of  ORANGE
There are  166   TEST  images of  BANANA
There are  164   TEST  images of  STRAWBERRY


#### **Preprocessing**

In [6]:
#Scale Data Images
scaler = StandardScaler()
x_train = scaler.fit_transform([i.flatten() for i in x_train])
x_test = scaler.fit_transform([i.flatten() for i in x_test])

#### **Modelling**

In [7]:
parameters_nn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Vary the number of neighbors
    'p': [1, 2]
}
nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn)

Best Parameters: {'n_neighbors': 3, 'p': 1}
Cross-Validation Error Rate: 1.0
Test accuracy with best parameters: 1.0


In [8]:
parameters_dt = {
    'min_samples_leaf': [1, 5, 10, 15, 20],  # Different values to test
    'min_samples_split': [2, 10, 20, 30, 40]  # Set as twice the min_samples_leaf
}
decisionTree(x_train, y_train, x_test, y_test, parameters_dt)

Best Parameters: {'min_samples_leaf': 20, 'min_samples_split': 30}
Cross-Validation Error Rate: 0.941102903361541
Test accuracy with best parameters: 0.8387755102040816


In [16]:
parameters_rf = {
    'max_features': ['sqrt', 'log2', 0.5, 0.7, None]
}
randomForest(x_train, y_train, x_test, y_test, parameters_rf)

Best Parameters: {'max_features': 'log2'}
Cross-Validation Error Rate: 0.9786662713
Test accuracy with best parameters: 0.97912254588


In [17]:
parameters_xgb = {
    'max_depth': [3, 5, 7, 9, 11],  # Vary the depth of trees
    'n_estimators': [50, 100, 200, 300, 400],  # Vary the number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
XGBoost(x_train, y_train, x_test, y_test, parameters_xgb)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Cross-Validation Error Rate: 0.97997566510291
Test accuracy with best parameters: 0.98496235874884


#### **Evaluation**

- **Consistency between Cross-Validation and Test Error Rates:**
  - Nearest Neighbor displays perfect test accuracy (1.0) which matches the cross-validation error rate.
  - Decision Tree exhibits a notable drop in test accuracy compared to its cross-validation error rate.
  - Random Forest and Gradient Boosted Trees show slight variation between their cross-validation and test accuracies.

- **Performance Insights:**
  - Gradient Boosted Trees achieved the highest test accuracy (0.9850) among the models evaluated.
  - Random Forest closely follows with a test accuracy of 0.9791.
  - Nearest Neighbor, despite perfect test accuracy, might indicate potential overfitting given the stark difference from cross-validation.

- **Overall Observation:**
  - Gradient Boosted Trees and Random Forest models demonstrate strong predictive performance, followed by Nearest Neighbor and Decision Tree models on this dataset.

## Dataset4 A-Z Handwritten Alphabets

**A-Z Handwritten Alphabets Dataset**

This dataset consists of 26 folders labeled A-Z, containing handwritten images of alphabets. Each image is sized at 28x28 pixels and showcases an individual alphabet centered within a 20x20 pixel box. The images are stored in gray-level format, likely representing grayscale intensity values for each pixel.

**Classification with 100 features**
- Observations: 37245
- Variables: 28x28 pixels

In [15]:
# Load the datasets
df_4_hand = pd.read_csv('A_Z_Handwritten_Data.csv')

In [16]:
# Get the number of columns of each data type
print(f"""For this dataset;
Number of float features: {len(df_4_hand.select_dtypes('float').columns)}
Number of int features: {len(df_4_hand.select_dtypes('int').columns)}
Number of object features: {len(df_4_hand.select_dtypes('object').columns)}
""")  

For this dataset;
Number of float features: 0
Number of int features: 785
Number of object features: 0



#### **Preprocessing**

In [17]:
data_na = (df_4_hand.isnull().sum() / len(df_4_hand)) 
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'percentage of missing values' :data_na, "number of missing value" : df_4_hand[data_na.index].isna().sum()})
missing_data

Unnamed: 0,percentage of missing values,number of missing value


In [18]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_4_hand_scaled = scaler.fit_transform(df_4_hand)
# Initialize PCA with desired number of components
num_components = 10  # Replace this with your desired number of components
pca = PCA(n_components=num_components)

# Fit PCA on the scaled data
df_4_hand_pca = pca.fit_transform(df_4_hand_scaled)


In [19]:
X = df_4_hand.drop('0',axis=1)
y = df_4_hand['0']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### **Modelling**

In [20]:
parameters_nn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Vary the number of neighbors
    'p': [1, 2]
}
nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn)

Best Parameters: {'n_neighbors': 11, 'p': 2}
Cross-Validation Error Rate: 0.8793485518473828
Test accuracy with best parameters: 0.9093630691775974


In [21]:
parameters_dt = {
    'min_samples_leaf': [1, 5, 10, 15, 20],  # Different values to test
    'min_samples_split': [2, 10, 20, 30, 40]  # Set as twice the min_samples_leaf
}
decisionTree(x_train, y_train, x_test, y_test, parameters_dt)

Best Parameters: {'min_samples_leaf': 20, 'min_samples_split': 30}
Cross-Validation Error Rate: 0.9564789325698745 
Test accuracy with best parameters: 0.9806130748409979 


In [22]:
parameters_rf = {
    'max_features': ['sqrt', 'log2', 0.5, 0.7, None]
}
randomForest(x_train, y_train, x_test, y_test, parameters_rf)

Best Parameters: {'max_features': 'sqrt'}
Cross-Validation Error Rate: 0.9791987536482349
Test accuracy with best parameters: 0.98.25317918134556


In [23]:
parameters_xgb = {
    'max_depth': [3, 5, 7, 9, 11],  # Vary the depth of trees
    'n_estimators': [50, 100, 200, 300, 400],  # Vary the number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
XGBoost(x_train, y_train, x_test, y_test, parameters_xgb)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 300}
Cross-Validation Error Rate: 0.9606442149119013
Test accuracy with best parameters: 0.9741147206276342


#### **Evaluation**

- **Consistency between Cross-Validation and Test Error Rates:**
  - Nearest Neighbor and Decision Tree models display moderate differences between their cross-validation and test accuracies.
  - Random Forest and Gradient Boosted Trees show closer alignment between their cross-validation and test accuracies.

- **Performance Insights:**
  - Decision Tree achieved the highest test accuracy (0.9806) among the models evaluated.
  - Random Forest closely follows with a test accuracy of 0.9825.
  - Nearest Neighbor and Gradient Boosted Trees display comparatively lower but still reasonable test accuracies (around 0.9094 and 0.9741, respectively).

- **Overall Observation:**
  - Random Forest and Decision Tree models demonstrate better predictive performance, followed by Gradient Boosted Trees and Nearest Neighbor models on this dataset.

## Dataset5 Mobile Price

**Mobile Price Classification Dataset**

This dataset contains mobile specifications as features such as battery power, 3G capability, WiFi availability, Bluetooth functionality, RAM, and more. The goal of the project is to predict the price range of mobile phones based on these specifications. It's a predictive modeling task where the price range of mobile devices is the target variable, and the 21 features are utilized for prediction.

**Regression Problem**
- Observations: 2000
- Variables: 21

In [36]:
# Load the datasets
df_5_mobilePrice = pd.read_csv("mobilePrice/train.csv")

In [37]:
# Get the number of columns of each data type
print(f"""For this dataset;
Number of float features: {len(df_5_mobilePrice.select_dtypes('float').columns)}
Number of int features: {len(df_5_mobilePrice.select_dtypes('int').columns)}
Number of object features: {len(df_5_mobilePrice.select_dtypes('object').columns)}
""")  

For this dataset;
Number of float features: 2
Number of int features: 19
Number of object features: 0



#### **Preprocessing**

In [38]:
data_na = (df_5_mobilePrice.isnull().sum() / len(df_5_mobilePrice)) 
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'percentage of missing values' :data_na, "number of missing value" : df_5_mobilePrice[data_na.index].isna().sum()})
missing_data

Unnamed: 0,percentage of missing values,number of missing value


In [39]:
y = df_5_mobilePrice["price_range"].values
x=df_5_mobilePrice.drop(["price_range"],axis=1)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=1)

#### **Modelling**

In [40]:
parameters_nn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Vary the number of neighbors
    'p': [1, 2]
}
nearestNeighbor(x_train, y_train, x_test, y_test, parameters_nn)

Best Parameters: {'n_neighbors': 9, 'p': 2}
Cross-Validation Error Rate: 0.9275
Test accuracy with best parameters: 0.925


In [41]:
parameters_dt = {
    'min_samples_leaf': [1, 5, 10, 15, 20],  # Different values to test
    'min_samples_split': [2, 10, 20, 30, 40]  # Set as twice the min_samples_leaf
}
decisionTree(x_train, y_train, x_test, y_test, parameters_dt)

Best Parameters: {'min_samples_leaf': 5, 'min_samples_split': 10}
Cross-Validation Error Rate: 0.845
Test accuracy with best parameters: 0.8575


In [42]:
parameters_rf = {
    'max_features': ['sqrt', 'log2', 0.5, 0.7, None]
}
randomForest(x_train, y_train, x_test, y_test, parameters_rf)

Best Parameters: {'max_features': 0.7}
Cross-Validation Error Rate: 0.9025000000000001
Test accuracy with best parameters: 0.8775


In [43]:
parameters_xgb = {
    'max_depth': [3, 5, 7, 9, 11],  # Vary the depth of trees
    'n_estimators': [50, 100, 200, 300, 400],  # Vary the number of trees
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}
XGBoost(x_train, y_train, x_test, y_test, parameters_xgb)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Cross-Validation Error Rate: 0.9237500000000001
Test accuracy with best parameters: 0.905


#### **Evaluation**

- **Consistency between Cross-Validation and Test Error Rates:**
  - Nearest Neighbor, Random Forest, and Gradient Boosted Trees show relatively close test accuracies to their cross-validation error rates.
  - Decision Tree displays a more significant difference between its cross-validation and test accuracies.

- **Performance Insights:**
  - Nearest Neighbor achieved the highest test accuracy (0.9250) among the models evaluated.
  - Gradient Boosted Trees closely follow with a test accuracy of 0.9050.
  - Decision Tree and Random Forest demonstrate reasonable but comparatively lower test accuracies (around 0.8575 and 0.8775, respectively).

- **Overall Observation:**
  - Nearest Neighbor and Gradient Boosted Trees models demonstrate better predictive performance, followed by Random Forest and Decision Tree models on this dataset.