## Model Selection for Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier



# Check if required

from scipy.stats import f_oneway

from sklearn.metrics import balanced_accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

### Data Reading

In [2]:
# Reading the newly uploaded CSV file
file_path = 'C:/Users/Soumiz/Anaconda/USD/full_reduced_data.csv'
full_reduced_data = pd.read_csv(file_path)

#### Prepare Data for Training and Testing

<Font size=5px; color='blue'> Dataset is imbalnced <br>



##### Addressing Imbalance dataset Issue


<Font size=3px; color='darkblue'> We decided to downsample other classes <br>
    Target class has minimum occurance <br>
    Hence, other classes are divided into multiple sets of same size <br>
    Then one file from each class are combined to get a balanced dataset for evaluation



##### Downsampling  the other two classes to create balance. 

In [3]:

# Split the data into three separate files based on Diabetes_012 value
dbts_0 = full_reduced_data[full_reduced_data['Diabetes_012'] == 0]
dbts_1 = full_reduced_data[full_reduced_data['Diabetes_012'] == 1]
dbts_2 = full_reduced_data[full_reduced_data['Diabetes_012'] == 2]

# Save these files
dbts_0.to_csv('dbts_0.csv', index=False)
dbts_1.to_csv('dbts_1.csv', index=False)
dbts_2.to_csv('dbts_2.csv', index=False)

# Count the number of rows for each data file
count_0 = len(dbts_0)
count_1 = len(dbts_1)
count_2 = len(dbts_2)

print(f"Number of rows in dbts_0: {count_0}")
print(f"Number of rows in dbts_1: {count_1}")
print(f"Number of rows in dbts_2: {count_2}")

# Find the smallest file length
smallest_length = min(count_0, count_1, count_2)

print(f"Smallest length: {smallest_length}")

# Function to partition a dataframe into smaller parts
def partition_dataframe(df, size, file_prefix):
    num_parts = len(df) // size
    for i in range(num_parts + 1):
        part = df[i*size:(i+1)*size]
        if not part.empty:
            part.to_csv(f'{file_prefix}_p{i+1}.csv', index=False)

# Partition the dataframes
partition_dataframe(dbts_0, smallest_length, 'dbts_0')
partition_dataframe(dbts_1, smallest_length, 'dbts_1')
partition_dataframe(dbts_2, smallest_length, 'dbts_2')

print("Data partitioning complete.")


Number of rows in dbts_0: 167650
Number of rows in dbts_1: 3729
Number of rows in dbts_2: 26215
Smallest length: 3729
Data partitioning complete.


#### Combining files from class 0, 1 and 2 using all possible combination to get large amount of datasets 

<Font size=3px; color='darkgreen'> Last file of both class 0 and class 2 are skipped as they are not having same dimention as of class 1.  

In [None]:
import pandas as pd

# Number of files of different classes:
x_values = range(1,45)  # Adjust the range as needed
y_values = range(1,8)  # Adjust the range as needed

def load_and_combine_files(x, y):
    # Define file paths
    file1 = f'dbts_0_p{x}.csv'
    file2 = 'dbts_1_p1.csv'
    file3 = f'dbts_2_p{y}.csv'

    # Load the data
    data1 = pd.read_csv(file1)
    data2 = pd.read_csv(file2)
    data3 = pd.read_csv(file3)

    # Concatenate the data
    combined_data = pd.concat([data1, data2, data3], ignore_index=True)
    
    return combined_data


count=0
for x in x_values:
    for y in y_values:
        combined_data = load_and_combine_files(x, y)
        # You can now use combined_data for further analysis
        # For example, save the combined data to a new file
        combined_data.to_csv(f'C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_{x}_{y}.csv', index=False)

        '''# Print the shape of the combined data for verification
        print(f'Combined data for x={x} and y={y} has shape: {combined_data.shape}')'''
        count+=1
print("Total file created",count)

<Font size=3px; color='darkblue'>
    
### Model Testing and Evaluation  



#### Models selected for Testing and Evaluation

<span style="color: blue;"> 1. Logistic Regression  </span> <br>
Pros: Simple, interpretable, works well with linearly separable data.  <br>
Cons: Might not capture complex relationships in the data.  <br>
Usage: Good starting point, especially for binary classification and when interpretability is important.  <br>
    
<span style="color: blue;">2. Random Forest  </span> <br>
Pros: Handles high-dimensional data well, robust to overfitting, interpretable (feature importance).  <br>
Cons: Computationally intensive for large datasets.  <br>
Usage: Good for datasets with many features, captures complex interactions, provides feature importance.  <br>

<span style="color: blue;">3. Gradient Boosting (e.g., XGBoost, LightGBM)  </span> <br>
Pros: High accuracy, handles missing values, feature importance.  <br>
Cons: Requires careful tuning, longer training times.  <br>
Usage: Good for improving performance over simpler models, handles complex data well.  <br>
    
<span style="color: blue;">4. Support Vector Machine (SVM)  </span> <br>
Pros: Effective in high-dimensional spaces, robust to overfitting (especially with the kernel trick).  <br>
Cons: Computationally intensive for large datasets, less interpretable.  <br>
Usage: Effective for smaller to medium-sized datasets, especially with non-linear decision boundaries.  <br>
    
<span style="color: blue;">5. Artificial Neural Networks (ANN)  </span> <br>
Pros: Captures complex patterns, scalable to large datasets.  <br>
Cons: Requires large amounts of data, computationally intensive, less interpretable.  <br>
Usage: Good for very large datasets and complex patterns, but requires tuning and computational resources.  <br>

<span style="color: blue;">6. K-Nearest Neighbors (KNN)  </span> <br>
Pros: Simple, no training phase.  <br>
Cons: Computationally expensive during prediction, sensitive to irrelevant features.  <br>
Usage: Good for small datasets with clear clusters, not suitable for high-dimensional data.  <br>

<span style="color: blue;">7. Naive Bayes  </span> <br>
Pros: Simple, fast, works well with small datasets and text classification.  <br>
Cons: Assumes feature independence (which might not hold), less effective for complex datasets.  <br>
Usage: Effective for text classification, smaller datasets, and where independence assumption is reasonable.  <br>


    


<Font size=3px; color='darkblue'>


#### Algorithm for single dataset

#### Create Training and Testing Dataset

In [5]:
# Read the data file
file_path = 'C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_1.csv'
data = pd.read_csv(file_path)

# Split data into train and test sets
X = data.drop(columns=['Diabetes_012'])
y = data['Diabetes_012']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Evaluate Models 

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


# List of models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
    'SVM': SVC(kernel='linear', probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"):
        y_pred_prob = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
    else:
        auc = None
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    return accuracy, precision, recall, f1, auc

# Evaluate all models
results = {}
for name, model in models.items():
    accuracy, precision, recall, f1, auc = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC-ROC': auc
    }

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T

# Identify the best model for each metric
best_models_per_metric = {
    'Accuracy': results_df['Accuracy'].idxmax(),
    'Precision': results_df['Precision'].idxmax(),
    'Recall': results_df['Recall'].idxmax(),
    'F1 Score': results_df['F1 Score'].idxmax(),
    'AUC-ROC': results_df['AUC-ROC'].idxmax() if 'AUC-ROC' in results_df.columns else None
}

# Remove AUC-ROC if all are None
if best_models_per_metric['AUC-ROC'] is None:
    del best_models_per_metric['AUC-ROC']

# Count the number of times each model is the best across different metrics
model_counts = pd.Series(best_models_per_metric.values()).value_counts()

# Determine which metric has the highest count of best-performing models
max_count = model_counts.max()
best_models_by_count = model_counts[model_counts == max_count].index.tolist()

# Print results

# Display results
results_df = pd.DataFrame(results).T
print(results_df)

# Display Performance Details
print("Best models by each metric:")
print(best_models_per_metric)
print("\nModel counts:")
print(model_counts)

# Print Best Model
if len(best_models_by_count) == 1:
    print(f"\nBest model based on highest count: {best_models_by_count[0]}")
elif len(best_models_by_count) > 1:
    print(f"\nMultiple models have the highest count: {', '.join(best_models_by_count)}")
else:
    print("\nNo significant model.")


                     Accuracy  Precision    Recall  F1 Score   AUC-ROC
Logistic Regression  0.498213   0.492685  0.498213  0.488772  0.690130
Random Forest        0.459786   0.458018  0.459786  0.458731  0.644031
Gradient Boosting    0.500894   0.500227  0.500894  0.497710  0.690707
SVM                  0.484361   0.480112  0.484361  0.463826  0.674263
KNN                  0.424039   0.418547  0.424039  0.417092  0.605976
Naive Bayes          0.478105   0.470046  0.478105  0.458686  0.671191
Neural Network       0.494191   0.500032  0.494191  0.495642  0.684694
Best models by each metric:
{'Accuracy': 'Gradient Boosting', 'Precision': 'Gradient Boosting', 'Recall': 'Gradient Boosting', 'F1 Score': 'Gradient Boosting', 'AUC-ROC': 'Gradient Boosting'}

Model counts:
Gradient Boosting    5
Name: count, dtype: int64

Best model based on highest count: Gradient Boosting


result of file 1_2
|                    | Accuracy  |  Precision    |   Recall | F1 Score
|--------------------|-----------|--------------|--------|----------|  
| Logistic Regression | 0.490617 |  0.480787 | 0.490617 | 0.478728 |
Random Forest     |   0.478999 |  0.476718 | 0.478999 | 0.477755 |
Gradient Boosting  |  0.504021 |  0.500586 | 0.504021 |  0.497561 |
SVM                |  0.496425 |  0.492247 | 0.496425 | 0.472999 |
KNN                |  0.437891 |  0.435411 | 0.437891 | 0.431071 |
Naive Bayes       |   0.484361 |  0.474064 | 0.484361 | 0.459248 |
Neural Network    |   0.496872 |  0.484121 | 0.496872 | 0.469677 |


 

result of file 3_2
|                    | Accuracy | Precision  |  Recall | F1 Score|
|--------------------|-----------|--------------|--------|----------| 
|Logistic Regression | 0.478105 |  0.463749 | 0.478105 | 0.463690|
|Random Forest       | 0.443700 |  0.443413 | 0.443700 | 0.443526|
|Gradient Boosting   | 0.475871 |  0.466678 | 0.475871 | 0.467335|
|SVM                 | 0.478999 |  0.467690 | 0.478999 | 0.446045|
|KNN                 | 0.423146 |  0.421522 | 0.423146 | 0.418178|
|Naive Bayes         | 0.469616 |  0.455770 | 0.469616 | 0.445430|
|Neural Network      | 0.485255 |  0.478683 | 0.485255 | 0.479133|

<Font size=3px; color='darkblue'>
    
### Model Testing and Evaluation  <br>
    This is used for model selection in this project

#### 30 randomly selected datasets are used from the entire database for evaluation
    
The model demonstrated best performance for maximum number of datasets will be selected as the 
    <b> Best fit model</b>
    for this classification 

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import time
import random


# Record the start time
start_time = time.time()

# Base directory and file pattern
base_dir = 'C:/Users/Soumiz/Anaconda/USD/Combined Data/'
file_pattern = 'combined_data_{i}_{j}.csv'

# List to hold best models
best_models = []

# Initializing the used_index list
used_index = []

p = 46 # total number of files in class 0
q = 8   # total number of files in class 2 
k=30 # Total number of dataset will be used for evaluation 

# Repeate Operation for ten times 
for num in range(k):   
    # Create a list of numbers from 1 to x with step height 1 for selection of dataset 
    m = list(range(1, p))
    n= list(range(1, q))
   
    # Randomly select one number from each list as part of file name format
    temp1 = random.choice(m)
    temp2 = random.choice(n)
   
    if (temp1 * 10 + temp2) in used_index:
        k-=1
        continue
    else:
        # Appending the calculated value to the list
        used_index.append(temp1 * 10 + temp2)

    # Generate the file path
    file_path = os.path.join(base_dir, file_pattern.format(i=temp1, j=temp2))
    # Read the data file and append to the list
    try:
        data = pd.read_csv(file_path)
        # Split data into train and test sets
        X = data.drop(columns=['Diabetes_012'])
        y = data['Diabetes_012']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # List of models to evaluate
        models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Random Forest': RandomForestClassifier(n_estimators=100),
            'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
            'SVM': SVC(kernel='linear', probability=True),
            'KNN': KNeighborsClassifier(n_neighbors=5),
            'Naive Bayes': GaussianNB(),
            'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
        }

        # Function to evaluate models
        def evaluate_model(model, X_train, X_test, y_train, y_test):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            if hasattr(model, "predict_proba"):
                y_pred_prob = model.predict_proba(X_test)
                auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
            else:
                auc = None

            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')

            return accuracy, precision, recall, f1, auc

        # Evaluate all models
        results = {}
        for name, model in models.items():
            accuracy, precision, recall, f1, auc = evaluate_model(model, X_train, X_test, y_train, y_test)
            results[name] = {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'AUC-ROC': auc
            }

        # Convert results to DataFrame for better visualization
        results_df = pd.DataFrame(results).T

        # Identify the best model for each metric
        best_models_per_metric = {
            'Accuracy': results_df['Accuracy'].idxmax(),
            'Precision': results_df['Precision'].idxmax(),
            'Recall': results_df['Recall'].idxmax(),
            'F1 Score': results_df['F1 Score'].idxmax(),
            'AUC-ROC': results_df['AUC-ROC'].idxmax() if 'AUC-ROC' in results_df.columns else None
        }

        # Remove AUC-ROC if all are None
        if best_models_per_metric['AUC-ROC'] is None:
            del best_models_per_metric['AUC-ROC']

        # Count the number of times each model is the best across different metrics
        model_counts = pd.Series(best_models_per_metric.values()).value_counts()

        # Determine which metric has the highest count of best-performing models
        max_count = model_counts.max()
        best_models_by_count = model_counts[model_counts == max_count].index.tolist()
        print(temp1,"  ",temp2,"  ","\n",best_models_by_count)

        # Save Best Model name to final list of best_models
        if len(best_models_by_count) == 1:
            best_models.append(best_models_by_count[0])
        elif len(best_models_by_count) > 1 & len(best_models_by_count)< len(model_counts):
            best_models.extend(best_models_by_count)
        else:
            best_models.append('Null')  # Handle the case where no significant models are found

        print(f"Successfully read {file_path}")

    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except pd.errors.EmptyDataError:
        print(f"File {file_path} is empty.")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

# Determine which model has the highest count of best-performing metrics
models_count=pd.Series(best_models).value_counts()

# Convert list to Series and count occurrences of each model
models_count = pd.Series(best_models).value_counts()

# Find the maximum count
max_count = models_count.max()

# Filter models_count to get model names with the highest count
models_with_max_count = models_count[models_count == max_count].index.tolist()


# Print Best Model

if len(models_with_max_count)==1:
    print("The Best model based on analysis:",models_with_max_count)
elif len(models_with_max_count)<len(best_models):
        print(f"\nMultiple models have the highest count: {', '.join(models_with_max_count)}")
else:
        print("\nNo significant model.")  # Handle the case where no significant models are found


        
# Calculate and print the total execution time
end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time} seconds")

### Models tested on all 308 created datasets which are balanced 


In [7]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Base directory and file pattern
base_dir = 'C:/Users/Soumiz/Anaconda/USD/Combined Data/'
file_pattern = 'combined_data_{i}_{j}.csv'

# List to hold best models
best_models = []

# Loop through the range of i and j
for i in range(1, 46):   # replace by range(1,46) for full range
    for j in range(1, 8):   # replace by range(1,8) for full range
        # Generate the file path
        file_path = os.path.join(base_dir, file_pattern.format(i=i, j=j))
        # Read the data file and append to the list
        try:
            data = pd.read_csv(file_path)
            # Split data into train and test sets
            X = data.drop(columns=['Diabetes_012'])
            y = data['Diabetes_012']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            
            # List of models to evaluate
            models = {
                'Logistic Regression': LogisticRegression(max_iter=1000),
                'Random Forest': RandomForestClassifier(n_estimators=100),
                'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
                'SVM': SVC(kernel='linear', probability=True),
                'KNN': KNeighborsClassifier(n_neighbors=5),
                'Naive Bayes': GaussianNB(),
                'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
            }

            # Function to evaluate models
            def evaluate_model(model, X_train, X_test, y_train, y_test):
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                if hasattr(model, "predict_proba"):
                    y_pred_prob = model.predict_proba(X_test)
                    auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
                else:
                    auc = None

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                return accuracy, precision, recall, f1, auc

            # Evaluate all models
            results = {}
            for name, model in models.items():
                accuracy, precision, recall, f1, auc = evaluate_model(model, X_train, X_test, y_train, y_test)
                results[name] = {
                    'Accuracy': accuracy,
                    'Precision': precision,
                    'Recall': recall,
                    'F1 Score': f1,
                    'AUC-ROC': auc
                }

            # Convert results to DataFrame for better visualization
            results_df = pd.DataFrame(results).T

            # Identify the best model for each metric
            best_models_per_metric = {
                'Accuracy': results_df['Accuracy'].idxmax(),
                'Precision': results_df['Precision'].idxmax(),
                'Recall': results_df['Recall'].idxmax(),
                'F1 Score': results_df['F1 Score'].idxmax(),
                'AUC-ROC': results_df['AUC-ROC'].idxmax() if 'AUC-ROC' in results_df.columns else None
            }

            # Remove AUC-ROC if all are None
            if best_models_per_metric['AUC-ROC'] is None:
                del best_models_per_metric['AUC-ROC']

            # Count the number of times each model is the best across different metrics
            model_counts = pd.Series(best_models_per_metric.values()).value_counts()

            # Determine which metric has the highest count of best-performing models
            max_count = model_counts.max()
            best_models_by_count = model_counts[model_counts == max_count].index.tolist()
            print(best_models_by_count)

            # Save Best Model name to final list of best_models
            if len(best_models_by_count) == 1:
                best_models.append(best_models_by_count[0])
            elif len(best_models_by_count) > 1 & len(best_models_by_count)< len(model_counts):
                best_models.extend(best_models_by_count)
            else:
                best_models.append('Null')  # Handle the case where no significant models are found

            print(f"Successfully read {file_path}")

        except FileNotFoundError:
            print(f"File {file_path} not found.")
        except pd.errors.EmptyDataError:
            print(f"File {file_path} is empty.")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

# Determine which model has the highest count of best-performing metrics
models_count=pd.Series(best_models).value_counts()

# Convert list to Series and count occurrences of each model
models_count = pd.Series(best_models).value_counts()

# Find the maximum count
max_count = models_count.max()

# Filter models_count to get model names with the highest count
models_with_max_count = models_count[models_count == max_count].index.tolist()


# Print Best Model

if len(models_with_max_count)==1:
    print("The Best model based on analysis:",best_models[0])
else:
    if len(models_with_max_count) == 1:
        print(f"\nBest model based on highest count: {best_models_by_count[0]}")
    elif len(models_with_max_count)<len(best_models):
        print(f"\nMultiple models have the highest count: {', '.join(models_with_max_count)}")
    else:
        print("\nNo significant model.")  # Handle the case where no significant models are found

# Calculate and print the total execution time
end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time} seconds")

1    1    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_1.csv
1    2    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_2.csv
1    3    
 ['Logistic Regression', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_3.csv
1    4    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_4.csv
1    5    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_5.csv
1    6    
 ['Neural Network']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_6.csv
1    7    
 ['Neural Network']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_1_7.csv
2    1    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_2_1.csv
2    2    
 ['SVM', 'Neural Network']
Successfu

10    6    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_10_6.csv
10    7    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_10_7.csv
11    1    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_1.csv
11    2    
 ['SVM']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_2.csv
11    3    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_3.csv
11    4    
 ['Logistic Regression', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_4.csv
11    5    
 ['Logistic Regression', 'Neural Network']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_5.csv
11    6    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_11_6.csv
11    7    

20    4    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_20_4.csv
20    5    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_20_5.csv
20    6    
 ['Naive Bayes', 'Neural Network']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_20_6.csv
20    7    
 ['Neural Network']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_20_7.csv
21    1    
 ['Logistic Regression', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_21_1.csv
21    2    
 ['SVM', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_21_2.csv
21    3    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_21_3.csv
21    4    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_21_4.csv
21    5  

30    2    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_2.csv
30    3    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_3.csv
30    4    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_4.csv
30    5    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_5.csv
30    6    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_6.csv
30    7    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_30_7.csv
31    1    
 ['Logistic Regression', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_31_1.csv
31    2    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_31_2.csv
31    3    
 ['Logistic R

39    7    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_39_7.csv
40    1    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_1.csv
40    2    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_2.csv
40    3    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_3.csv
40    4    
 ['Logistic Regression', 'Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_4.csv
40    5    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_5.csv
40    6    
 ['Logistic Regression']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_6.csv
40    7    
 ['Gradient Boosting']
Successfully read C:/Users/Soumiz/Anaconda/USD/Combined Data/combined_data_40_7.csv
41    1    
 ['Logist

NameError: name 'end_time' is not defined

### Part Results for better understanding and inclusion in report

In [14]:
best_models

['Gradient Boosting',
 'Gradient Boosting',
 'Logistic Regression',
 'Gradient Boosting',
 'Gradient Boosting',
 'Gradient Boosting',
 'Neural Network',
 'Neural Network',
 'Gradient Boosting',
 'SVM',
 'Neural Network',
 'Logistic Regression',
 'Naive Bayes',
 'Logistic Regression',
 'Naive Bayes',
 'Neural Network',
 'Neural Network',
 'Logistic Regression',
 'Gradient Boosting',
 'Neural Network',
 'Logistic Regression',
 'Neural Network',
 'Gradient Boosting',
 'Logistic Regression',
 'Logistic Regression',
 'Neural Network',
 'Neural Network',
 'Gradient Boosting',
 'Logistic Regression',
 'Gradient Boosting',
 'Logistic Regression',
 'Logistic Regression',
 'Gradient Boosting',
 'Gradient Boosting',
 'Gradient Boosting',
 'Logistic Regression',
 'Logistic Regression',
 'Logistic Regression',
 'Logistic Regression',
 'Gradient Boosting',
 'Logistic Regression',
 'SVM',
 'Logistic Regression',
 'Neural Network',
 'Logistic Regression',
 'Logistic Regression',
 'Neural Network',
 'L

In [8]:
# Determine which model has the highest count of best-performing metrics
models_count # pd.Series(best_models).value_counts()



Gradient Boosting      188
Logistic Regression     99
Neural Network          42
Naive Bayes             12
SVM                     10
Name: count, dtype: int64

In [10]:
# Find the maximum count
max_count # models_count.max()



188

In [11]:
# Filter models_count to get model names with the highest count
models_with_max_count # models_count[models_count == max_count].index.tolist()




['Gradient Boosting']

In [13]:
# Print Best Model

if len(models_with_max_count)==1:
    print("The Best model based on analysis:",best_models[0])
else:
    if len(models_with_max_count) == 1:
        print(f"\nBest model based on highest count: {best_models_by_count[0]}")
    elif len(models_with_max_count)<len(best_models):
        print(f"\nMultiple models have the highest count: {', '.join(models_with_max_count)}")
    else:
        print("\nNo significant model.")  # Handle the case where no significant models are found

The Best model based on analysis: Gradient Boosting



#### From the result, we can conclude, Gradient Boosting is the best method.

<br>-----------------------------------------------------------------------------------------------------------<br>
<div style="font-size: 15px; font-family: Verdana, Geneva, sans-serif; color: darkgreen;">  Hence,  
    <b>  Gradient Boosting</b> 
         is the best method for the dataset under use.<br>-----------------------------------------------------------------------------------<br>
</div>
<div style="font-size: 15px; font-family: Verdana, Geneva, sans-serif; color: darkgreen;">    Now we need to do tunning of the model for 
</div>
 <div style="font-size: 18px; font-family: 'Courier New', Courier, monospace; color: darkgreen;"><b>_________optimum performance_________</b>
    <br>-----------------------------------------------------------------------------------<br><br> 
</div>
<br>

### <Font color='Green'> ------------------------------------------------------------ END of Model Selection -----------------------------------------------------------