In [15]:
import pandas as pd
import pickle
from typing import List

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.multiclass import type_of_target

In [16]:
def get_important_features(X: pd.DataFrame, target_column, threshold: float = 0.01) -> List[str]:
    """Rank the columns of ``X`` with a random forest and keep the important ones.

    Parameters
    ----------
    X : pd.DataFrame
        The full dataset: every feature column plus the target column.
    target_column
        Label of the column in ``X`` that holds the target.
    threshold : float, default 0.01
        A feature is kept when its importance is strictly greater than this.

    Returns
    -------
    List[str]
        Labels of the retained feature columns, most important first.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``X`` (raised by ``DataFrame.drop``).
    ValueError
        If ``type_of_target`` reports a type other than binary, multiclass,
        continuous or continuous-multioutput.
    """
    # Work on the frame that was passed in, not on a notebook-level global.
    features = X.drop(columns=[target_column])
    target = X[target_column]

    # Pick a classifier or a regressor depending on what the target looks like.
    target_type = type_of_target(target)
    if target_type in ['binary', 'multiclass']:
        model = RandomForestClassifier(random_state=42)
    elif target_type in ['continuous', 'continuous-multioutput']:
        model = RandomForestRegressor(random_state=42)
    else:
        raise ValueError("Unsupported target type: {}".format(target_type))

    model.fit(features, target)

    # Pair each importance with its column label and sort descending.
    feature_importance = pd.Series(
        model.feature_importances_, index=features.columns
    ).sort_values(ascending=False)

    # Keep only the features strictly above the threshold.
    return feature_importance[feature_importance > threshold].index.tolist()

In [17]:
def get_model(model_name):
    """Return a new, unfitted estimator for the menu key ``model_name``.

    Keys are the strings ``'1'`` to ``'8'``: odd keys give the regressor and
    even keys the classifier of, in order, k-nearest neighbours, decision
    tree, random forest and linear/logistic regression. Estimators that take
    a ``random_state`` are created with ``random_state=42``.

    Raises
    ------
    ValueError
        If ``model_name`` equals none of the eight keys.
    """
    # (key, factory) pairs; the factories are lazy so only the requested
    # estimator is ever instantiated.
    factories = (
        ('1', lambda: KNeighborsRegressor()),
        ('2', lambda: KNeighborsClassifier()),
        ('3', lambda: DecisionTreeRegressor(random_state=42)),
        ('4', lambda: DecisionTreeClassifier(random_state=42)),
        ('5', lambda: RandomForestRegressor(random_state=42)),
        ('6', lambda: RandomForestClassifier(random_state=42)),
        ('7', lambda: LinearRegression()),
        ('8', lambda: LogisticRegression(random_state=42)),
    )

    for key, build in factories:
        if model_name == key:
            return build()

    raise ValueError("Unsupported model type: {}".format(model_name))

In [29]:
def get_metrices(model, X_test, y_test, problem_type):
    """Evaluate a fitted model on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Held-out features and the matching true targets.
    problem_type : str
        ``'regression'`` or ``'classification'``.

    Returns
    -------
    dict
        For ``'regression'``: ``'MSE'``, ``'RMSE'``, ``'MAE'``, ``'R-squared'``.
        For ``'classification'``: ``'Accuracy'``, weighted ``'Precision'``,
        ``'Recall'`` and ``'F1-Score'``, plus the ``'Confusion Matrix'``.
        Any other ``problem_type`` yields an empty dict.
    """
    metrics = {}

    if problem_type == 'regression':
        y_pred = model.predict(X_test)

        metrics['MSE'] = mean_squared_error(y_test, y_pred)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
        # Taking the square root of each output's MSE and averaging gives the
        # same number that `squared=False` used to return, on every version.
        per_output_mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
        metrics['RMSE'] = (per_output_mse ** 0.5).mean()
        metrics['MAE'] = mean_absolute_error(y_test, y_pred)
        metrics['R-squared'] = r2_score(y_test, y_pred)

    elif problem_type == 'classification':
        y_pred = model.predict(X_test)

        metrics['Accuracy'] = accuracy_score(y_test, y_pred)
        # 'weighted' averages the per-class scores by class support.
        metrics['Precision'] = precision_score(y_test, y_pred, average='weighted')
        metrics['Recall'] = recall_score(y_test, y_pred, average='weighted')
        metrics['F1-Score'] = f1_score(y_test, y_pred, average='weighted')
        metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)

    return metrics

In [19]:
def get_pickle_filename(model_name):
    """Return the pickle file name for the menu key ``model_name``.

    ``model_name`` must be one of the strings ``'1'`` to ``'8'``; any other
    key raises ``KeyError``.
    """
    # File names listed in menu order; position i (1-based) belongs to key str(i).
    ordered_filenames = (
        "KNeighborsRegressor_model_pickle.pkl",
        "KNeighborsClassifier_model_pickle.pkl",
        "DecisionTreeRegressor_model_pickle.pkl",
        "DecisionTreeClassifier_model_pickle.pkl",
        "RandomForestRegressor_model_pickle.pkl",
        "RandomForestClassifier_model_pickle.pkl",
        "LinearRegression_model_pickle.pkl",
        "LogisticRegression_model_pickle.pkl",
    )
    # Iterating the string "12345678" yields the keys '1' ... '8'.
    by_key = dict(zip("12345678", ordered_filenames))
    return by_key[model_name]



if __name__ == "__main__":

    # Collect the run parameters interactively. Surrounding whitespace is
    # stripped so a stray space typed after the path or label does not turn
    # into a file-not-found or missing-column error further down.
    dataset_path = input("Enter file path: ").strip()
    label_column = input("Enter label name: ").strip()
    print("\n\n=======================")
    print("1. KNN - regression\n2. KNN - classifier\n3. DT - Regression\n4. DT - Classifier\n5. RF - Regression\n6. RF - Classifier\n7. Linear Regression\n8. Logistic Regression")
    print("=======================\n\n")

    # Kept as an int; later cells convert it with str() where a key is needed.
    model_name = int(input("Select the model from above list (enter the number): "))
    # Reject an out-of-range choice immediately instead of after the CSV load
    # and the random-forest feature ranking have already been paid for.
    if model_name not in range(1, 9):
        raise ValueError("Invalid user input. Expected values are 1 to 8.")

    df = pd.read_csv(dataset_path)

    # Feature labels whose random-forest importance exceeds the default threshold.
    important_features = get_important_features(df, label_column)


Enter file path:  winequality-red.csv
Enter label name:  quality




1. KNN - regression
2. KNN - classifier
3. DT - Regression
4. DT - Classifier
5. RF - Regression
6. RF - Classifier
7. Linear Regression
8. Logistic Regression




Select the model from above list (enter the number):  4


In [21]:
# Restrict the data to the selected features plus the label column.
df_filtered = df[[*important_features, label_column]]

# Split the filtered frame into target vector and feature matrix.
y = df_filtered[label_column]
X = df_filtered.drop(columns=[label_column])

# Unfitted estimator for the chosen menu entry (get_model expects a string key).
model = get_model(str(model_name))

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
# NOTE(review): important_features was computed on the whole of df before this
# split, so the test rows took part in feature selection - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Hyperparameter tuning using GridSearchCV
model_key = str(model_name)  # the menu choice may be held as an int; grids are keyed by str

regression_models = [1, 3, 5, 7]
classification_models = [2, 4, 6, 8]

# Settle the problem type (and with it the scoring rule) before the expensive
# search, so an invalid selection fails fast.
if model_key in [str(choice) for choice in regression_models]:
    problem_type = 'regression'
    scoring = 'neg_mean_squared_error'
elif model_key in [str(choice) for choice in classification_models]:
    problem_type = 'classification'
    scoring = 'accuracy'
else:
    raise ValueError("Invalid model selection.")

# Building blocks shared by several estimators.
knn_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
tree_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
forest_grid = {'n_estimators': [50, 100, 200], **tree_grid}

# max_features='auto' was removed in scikit-learn 1.3. For regressors it meant
# "all features" (None); for classifiers it was an alias of 'sqrt', so dropping
# it there leaves the search space unchanged.
regressor_max_features = {'max_features': [None, 'sqrt', 'log2']}
classifier_max_features = {'max_features': ['sqrt', 'log2']}

logistic_C = [0.1, 1.0, 10.0]

param_grids = {
    '1': knn_grid,                                        # KNN - regression
    '2': knn_grid,                                        # KNN - classifier
    '3': {**tree_grid, **regressor_max_features},         # DT - Regression
    '4': {**tree_grid, **classifier_max_features},        # DT - Classifier
    '5': {**forest_grid, **regressor_max_features},       # RF - Regression
    '6': {**forest_grid, **classifier_max_features},      # RF - Classifier
    # LinearRegression lost its `normalize` parameter in scikit-learn 1.2;
    # searching over it makes every candidate fail.
    '7': {'fit_intercept': [True, False]},                # Linear Regression
    # Only liblinear and saga support the l1 penalty, so the grid is split to
    # avoid candidates that can never be fitted.
    '8': [                                                # Logistic Regression
        {'penalty': ['l1'], 'C': logistic_C, 'solver': ['liblinear', 'saga']},
        {'penalty': ['l2'], 'C': logistic_C,
         'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']},
    ],
}

param_grid = param_grids[model_key]
grid_search = GridSearchCV(model, param_grid, cv=5, scoring=scoring, n_jobs=-1)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_






In [31]:
# Save the tuned model as a pickle file named after the chosen estimator.
output_model_path = get_pickle_filename(str(model_name))
# The original message ran words together and glued the file name onto "in".
print(f"\nsaving the pickled model in {output_model_path}\n")
with open(output_model_path, 'wb') as file:
    pickle.dump(best_model, file)


# Score the tuned model on the held-out split.
metrics = get_metrices(best_model, X_test, y_test, problem_type)
print(f"Model Evaluation Metrics ({problem_type}):")
print(metrics)


saving the pickledmodel inDecisionTreeClassifier_model_pickle.pkl

Model Evaluation Metrics (classification):
{'Accuracy': 0.559375, 'Precision': 0.558787563938619, 'Recall': 0.559375, 'F1-Score': 0.5583180489101541, 'Confusion Matrix': array([[ 0,  0,  1,  0,  0,  0],
       [ 0,  3,  3,  4,  0,  0],
       [ 0,  3, 87, 37,  3,  0],
       [ 1,  5, 37, 74, 13,  2],
       [ 0,  1,  2, 21, 15,  3],
       [ 0,  0,  0,  2,  3,  0]], dtype=int64)}
