## Library

In [None]:
import pandas as pd
import numpy as np
import sklearn

## Visualization

In [None]:
from sklearn.tree import plot_tree
import sklearn.tree as tree
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Models

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import PowerTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier

from sklearn.naive_bayes import CategoricalNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.preprocessing import StandardScaler

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import roc_curve, auc
import statsmodels.api as sm
from sklearn.feature_selection import RFE


## Functions

In [None]:
def evaluate_classification_model(y_train, y_pred_train, y_test, y_pred_test, average='binary'):
    """
    Summarise accuracy, precision and recall on the train and test splits.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the train and test splits.
    y_pred_train, y_pred_test : array-like
        Predicted labels, in the same row order as the matching true labels.
    average : default 'binary'
        Passed to ``precision_score`` and ``recall_score``. The default keeps
        the previous behaviour, which only supports binary targets; pass e.g.
        'macro' or 'weighted' for multiclass targets.

    Returns
    -------
    performance_df : pandas.DataFrame
        One row per metric with 'Error_metric', 'Train' and 'Test' columns.
    df_train, df_test : pandas.DataFrame
        'Real' and 'Predicted' labels side by side for each split.

    Notes
    -----
    Side effect: sets the global ``pd.options.display.float_format`` to two
    decimals, which affects every DataFrame displayed afterwards.
    """
    def _scores(y_true, y_pred):
        # Same three metrics, in the same order as the 'Error_metric' labels.
        return [accuracy_score(y_true, y_pred),
                precision_score(y_true, y_pred, average=average),
                recall_score(y_true, y_pred, average=average)]

    performance_df = pd.DataFrame({'Error_metric': ['Accuracy', 'Precision', 'Recall'],
                                   'Train': _scores(y_train, y_pred_train),
                                   'Test': _scores(y_test, y_pred_test)})

    pd.options.display.float_format = '{:.2f}'.format

    # np.asarray pairs predictions with the true labels by position (as the
    # metrics above do). Without it, predictions passed as a Series with a
    # different index would be aligned by label and produce NaN rows.
    df_train = pd.DataFrame({'Real': y_train, 'Predicted': np.asarray(y_pred_train)})
    df_test = pd.DataFrame({'Real': y_test, 'Predicted': np.asarray(y_pred_test)})

    return performance_df, df_train, df_test

In [None]:
def plot_correlation_heatmap(df):
    """
    Draw a lower-triangle heatmap of the pairwise correlations in ``df``.

    Non-numeric columns are ignored when computing the correlations.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column correlations are plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # pandas >= 2.0 raises on non-numeric columns unless numeric_only=True is
    # given; pandas < 1.5 does not accept the keyword but already skips them.
    try:
        corr = df.corr(numeric_only=True)
    except TypeError:
        corr = df.corr()

    # Mask the upper triangle (diagonal included): it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    fig, ax = plt.subplots(figsize=(12, 8))

    # Draw on the axes created above so the figure size is guaranteed to apply.
    sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=.8, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

    ax.set_title('Correlation Heatmap')
    plt.show()

In [None]:
def clean_data(df):
    """
    Report data-quality counts for ``df`` and remove duplicated rows.

    Prints the number of missing values per column, then the number of
    duplicated rows. Duplicates are dropped in place, so the caller's
    DataFrame is modified; that same object is returned.
    """
    # Missing values per column
    missing_per_column = df.isnull().sum()
    print(missing_per_column)

    # Number of fully duplicated rows
    duplicate_rows = df.duplicated().sum()
    print(duplicate_rows)

    # In-place removal: the input frame itself loses its duplicated rows
    df.drop_duplicates(inplace=True)

    # Optional steps, currently disabled:
    #
    # Impute missing values
    # df.fillna(df.median(), inplace=True)
    #
    # Remove outliers (1.5 * IQR rule)
    # Q1 = df.quantile(0.25)
    # Q3 = df.quantile(0.75)
    # IQR = Q3 - Q1
    # df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
    return df

In [None]:
def get_numerical_features(df):
    """
    Returns a DataFrame containing only the numerical features of the input DataFrame.

    Every numeric dtype is kept (signed and unsigned integers and floats of
    any width). Boolean, object and timedelta columns are left out. The
    selected column names are printed before the subset is returned.
    """
    # `include=[float, int]` only matches float32/64 and int32/64, so int8,
    # int16, unsigned and float16 columns were silently dropped. np.number
    # covers them all. timedelta64 is excluded explicitly because NumPy
    # treats it as a subclass of np.number.
    numeric_df = df.select_dtypes(include=[np.number], exclude=[np.timedelta64])
    print(numeric_df.columns)
    return numeric_df

In [None]:
def select_features(df, feature_list):
    """
    Restrict ``df`` to the columns named in ``feature_list``.

    The selection is plain ``df[...]`` indexing, so a list of names gives a
    DataFrame with the columns in the requested order.
    """
    selected = df[feature_list]
    return selected

In [None]:
def split_X_y(df, target_col):
    """
    Separate a DataFrame into a feature matrix and a target.

    Returns ``(X, y)``: ``X`` is ``df`` without ``target_col`` and ``y`` is
    ``df[target_col]``. ``df`` itself is not modified.
    """
    features = df.drop(columns=target_col)
    target = df[target_col]
    return features, target

In [None]:
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Split features and target into train and test parts.

    Thin wrapper around ``train_test_split``; ``test_size`` and
    ``random_state`` are forwarded unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

In [None]:
def plot_feature_importances(model, X_train):
    """
    Plot the normalised absolute coefficients of a fitted model as a
    horizontal bar chart, smallest at the bottom and largest at the top.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``coef_`` attribute. A 2-D ``coef_``
        (e.g. scikit-learn's LogisticRegression stores one row per class)
        is reduced to one value per feature by averaging the absolute
        coefficients over the rows.
    X_train : pandas.DataFrame
        Training features; only its column names are used, as bar labels.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of columns.
    """
    # np.abs returns a new array, so model.coef_ itself is never modified.
    coefs = np.abs(np.asarray(model.coef_, dtype=float))

    # Collapse a 2-D coef_ to one importance per feature. Indexing the raw
    # 2-D array with the 2-D argsort result used to raise IndexError.
    if coefs.ndim > 1:
        coefs = coefs.reshape(-1, coefs.shape[-1]).mean(axis=0)

    feature_names = X_train.columns.values
    if len(feature_names) != len(coefs):
        raise ValueError(
            f"model has {len(coefs)} coefficients but X_train has "
            f"{len(feature_names)} columns"
        )

    # Normalise so the bars sum to 1; skip when every coefficient is zero to
    # avoid dividing by zero and plotting NaN.
    total = coefs.sum()
    if total > 0:
        coefs = coefs / total

    # Ascending order: barh draws the first entry at the bottom.
    sorted_idx = np.argsort(coefs)
    positions = range(len(sorted_idx))

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(positions, coefs[sorted_idx], align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names[sorted_idx])
    # Small tick labels so long feature lists remain readable.
    ax.tick_params(axis='both', which='major', labelsize=6, length=6, width=2, direction='out', pad=8)

    ax.set_xlabel('Normalized Feature Importance')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importances')
    plt.show()

In [None]:
def scale_features(X_train, X_test):
    """
    Standardise the features with a ``StandardScaler`` fitted on the
    training split only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices of the train and test splits.

    Returns
    -------
    X_train_scaled, X_test_scaled : pandas.DataFrame
        Scaled copies that keep the column names and row index of their
        inputs.
    """
    scaler = StandardScaler()

    # The scaler returns a bare array. Re-attach the original index as well
    # as the columns, otherwise the frames get a fresh RangeIndex and no
    # longer line up with the matching y_train / y_test.
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)

    # Only transform here: the test split must not influence the fitted
    # mean and variance.
    X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                 columns=X_test.columns, index=X_test.index)

    return X_train_scaled, X_test_scaled

In [None]:
# Encode categorical variables

def encode_categorical_cols(df):
    """
    One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-object columns of ``df`` followed by one indicator column per
        category, on the same row index. If ``df`` has no object columns an
        unchanged copy is returned.
    """
    # Identify categorical variables
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    print("Categorical Variables:", categorical_cols)

    # Nothing to encode: the encoder cannot be fitted on zero columns.
    if not categorical_cols:
        return df.copy()

    # The dense-output keyword was renamed from `sparse` to `sparse_output`
    # in scikit-learn 1.2 and the old name was later removed; support both.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(df[categorical_cols])

    # `get_feature_names` was replaced by `get_feature_names_out`; use the
    # new method when present and fall back for older releases.
    get_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

    # Keep the original index so the concat below aligns row by row.
    X_encoded_df = pd.DataFrame(X_encoded, columns=get_names(categorical_cols), index=df.index)

    # Drop original categorical columns and merge with encoded features
    df_encoded = df.drop(categorical_cols, axis=1)
    df_encoded = pd.concat([df_encoded, X_encoded_df], axis=1)
    return df_encoded

In [None]:

# # Feature selection with RFE
# model=LinearRegression()
# rfe = RFE(model, n_features_to_select=5)
# rfe.fit(X_train, y_train)

# # Print the selected features and their ranking
# print("Selected Features:")
# for i in range(len(rfe.support_)):
#     if rfe.support_[i]:
#         print(X.columns[i])
        
# print("Feature Ranking:")
# print(rfe.ranking_)

In [None]:
def feature_selection_pvalue_train(X_train, y_train, cols, alpha=0.05):
    """
    Select the features whose OLS coefficient is statistically significant.

    Fits ``sm.OLS(y_train, X_train + intercept)``, prints the model summary
    and the candidate feature names, and returns the features whose p-value
    is below ``alpha``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    cols : any
        Unused; kept so existing calls keep working.
    alpha : float, default 0.05
        Significance threshold; features with ``pvalue < alpha`` are kept.

    Returns
    -------
    list
        Names of the selected features.
    """
    # Fit the OLS model with an intercept term
    X_design = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_design).fit()

    ## report model summary
    print(model.summary())

    # Remove the intercept by label, not by position. add_constant does not
    # add a 'const' column when the data already contains a constant column,
    # so slicing off the first entry would discard a real feature.
    pvalues = model.pvalues.drop(labels='const', errors='ignore')

    print(pvalues.index)

    # Keep only the features with p-value below the threshold
    selected_features = pvalues[pvalues < alpha].index.tolist()

    return selected_features


## Data Processing