# Pipeline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.cluster import KMeans

In [3]:
def connect_to_data_source(data_source, column_names = None):
    """
    Connects to the specified data source and returns a Pandas DataFrame.

    The parser is chosen from the suffix of ``data_source``:

    - ``.txt``:  whitespace-delimited text; ``column_names`` is used as the header
    - ``data``:  comma-separated text without a header row (e.g. ``*.data``);
                 ``column_names`` is used as the header
    - ``.csv``:  comma-separated text whose first row is the header;
                 ``column_names`` is ignored

    Parameters:
    - data_source: str, path to the data source or URL
    - column_names: list, optional, column labels for header-less sources

    Returns:
    - df: pandas DataFrame containing the data

    Raises:
    - ValueError: if the suffix of ``data_source`` is not one of the above
    """
    if data_source.endswith('.txt'):
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        return pd.read_csv(data_source, sep=r"\s+", names=column_names)

    if data_source.endswith('data'):
        return pd.read_csv(data_source, header=None, names=column_names)

    if data_source.endswith('.csv'):
        return pd.read_csv(data_source)

    raise ValueError("Unsupported data source format.")



In [4]:
def clean_data(df, columns_to_drop=None, threshold=0.5):
    """
    Cleans the input DataFrame by removing unnecessary columns, sparse columns,
    rows with missing values, and duplicate rows.

    The input DataFrame is never modified; a new DataFrame is returned.

    Parameters:
    - df: pandas DataFrame, input data
    - columns_to_drop: list, optional, columns to be dropped
    - threshold: float, optional, threshold for missing values, columns whose fraction
      of missing values exceeds this threshold will be dropped (None disables this step)

    Returns:
    - cleaned_df: pandas DataFrame, cleaned data
    """
    cleaned_df = df

    # Drop specified columns (returns a new frame, the input is left untouched)
    if columns_to_drop is not None:
        cleaned_df = cleaned_df.drop(columns=columns_to_drop)

    # Drop columns that are mostly empty first, so that they do not cause
    # otherwise complete rows to be discarded by the row-wise dropna below
    if threshold is not None:
        missing_fraction = cleaned_df.isna().mean()
        sparse_columns = missing_fraction[missing_fraction > threshold].index
        cleaned_df = cleaned_df.drop(columns=sparse_columns)

    # Drop rows with any missing values (not in place: the caller's frame
    # must not change), then drop duplicate instances
    cleaned_df = cleaned_df.dropna().drop_duplicates()

    return cleaned_df


In [6]:
def data_transformation(df, categorical_features=None, numerical_features=None):
    """
    Perform data transformation: label encoding (integer codes) for
    categorical features and z-score standardization for numerical features.

    The input DataFrame is not modified; a transformed copy is returned.

    Parameters:
        df (DataFrame): The input DataFrame.
        categorical_features (list): List of categorical feature names. When
            None, all columns of dtype object/category are used.
        numerical_features (list): List of numerical feature names. When None,
            all numeric columns except the last column of the frame are used
            (the last column is treated as the target and left unscaled).

    Returns:
        DataFrame: Transformed DataFrame.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df = df.copy()

    # Infer each list independently, so that a list supplied by the caller
    # is never overwritten just because the other one was omitted
    if numerical_features is None:
        numerical_features = df.iloc[:, :-1].select_dtypes(include=['number']).columns.tolist()
    if categorical_features is None:
        categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Label-encode categorical features (a fresh encoder per column)
    for feature in categorical_features:
        df[feature] = LabelEncoder().fit_transform(df[feature])

    # Perform z-score standardization for numerical features
    for feature in numerical_features:
        mean = df[feature].mean()
        std = df[feature].std()
        if pd.isna(std) or std == 0:
            # Constant (or single-row) column: dividing would fill it with
            # NaN/inf, so keep the centred value instead
            df[feature] = 0.0
        else:
            df[feature] = (df[feature] - mean) / std

    return df


In [7]:
def perform_eda(df, numerical_features=None):
    """
    Run a quick exploratory pass over the numerical variables: print their
    descriptive statistics, then show a histogram and a boxplot side by side
    for each of them.

    Parameters:
        df (DataFrame): The input DataFrame.
        numerical_features (list): Names of the numerical columns to inspect.
            Defaults to every numeric column of ``df``.

    Returns:
        None
    """
    if numerical_features is None:
        numerical_features = df.select_dtypes(include=['number']).columns.tolist()

    # Summary statistics, one row per feature
    print("5-Number Summary:")
    print(df[numerical_features].describe().transpose())

    # One figure per feature: histogram on the left, boxplot on the right
    for column in numerical_features:
        values = df[column]
        panels = (
            (lambda: sns.histplot(values, kde=True), f'Histogram of {column}'),
            (lambda: sns.boxplot(x=values), f'Boxplot of {column}'),
        )

        plt.figure(figsize=(12, 6))
        for position, (draw, title) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            draw()
            plt.title(title)

        plt.tight_layout()
        plt.show()

In [8]:
def manual_data_splitting(X, y, test_size=0.2, random_state=42):
    """
    Split the data into a training part and a test part.

    Parameters:
    - X: Features DataFrame
    - y: Target Series
    - test_size: Size of the test set (default is 0.2)
    - random_state: Random seed for reproducibility (default is 42)

    Returns:
    - X_train: Training features
    - X_test: Test features
    - y_train: Training target
    - y_test: Test target
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    features_train, features_test, target_train, target_test = partitions
    return features_train, features_test, target_train, target_test


In [9]:
def model_selection(X, y, problem_type='classification', test_size=0.2, random_state=None):
    """
    Benchmark a range of models with LazyPredict on a hold-out split.

    Parameters:
    - X: Feature matrix
    - y: Target variable
    - problem_type: Type of the problem, either 'classification' or 'regression'
    - test_size: The proportion of the dataset to include in the test split
    - random_state: Controls the shuffling applied to the data before splitting

    Returns:
    - models: A DataFrame containing information about the models and their performance

    Raises:
    - ValueError: if problem_type is neither 'classification' nor 'regression'
    """
    # The hold-out split is made before problem_type is checked
    holdout = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = holdout

    if problem_type == 'classification':
        benchmark = LazyClassifier(predictions=True)
    elif problem_type == 'regression':
        benchmark = LazyRegressor(predictions=True)
    else:
        raise ValueError("Invalid problem_type. Use 'classification' or 'regression'.")

    # fit() yields (scores, predictions); only the score table is returned
    leaderboard, _predictions = benchmark.fit(X_train, X_test, y_train, y_test)
    return leaderboard

In [10]:
def model_training(model, params, X_train, y_train, cv=5):
    """
    Tune and fit a model with an exhaustive, cross-validated grid search.

    Parameters:
    - model: The machine learning model object (e.g., RandomForestClassifier())
    - params: Dictionary containing hyperparameters to tune
    - X_train: Training features
    - y_train: Training labels
    - cv: Number of cross-validation folds

    Returns:
    - best_model: Trained model with best hyperparameters
    """
    # n_jobs=-1 lets the search use every available CPU core
    searcher = GridSearchCV(model, params, cv=cv, n_jobs=-1)
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_

# Example usage:
# best_model = model_training(RandomForestClassifier(), {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, None]}, X_train, y_train, cv=5)


In [11]:
def model_evaluation(model, X, y):
    """
    Evaluate a fitted classifier on the given data.

    Prints the accuracy and the classification report. When ROC analysis
    is possible, it also prints the ROC-AUC score and plots the ROC curve.
    When it is not possible (for example multiclass targets), it is
    skipped with a printed notice instead of being silently ignored.

    Parameters:
    - model: Fitted estimator exposing predict() and optionally predict_proba()
    - X: Features to evaluate on
    - y: True labels

    Returns:
    - results: dict with keys 'accuracy' and 'roc_auc'
      ('roc_auc' is None when ROC analysis was skipped)
    """
    y_pred = model.predict(X)

    results = {"accuracy": accuracy_score(y, y_pred), "roc_auc": None}
    print("Accuracy:", results["accuracy"])
    print(classification_report(y, y_pred))

    # Additional metrics for classification: ROC-AUC and the ROC curve
    try:
        # ROC analysis ranks samples by a continuous score; hard labels
        # collapse the curve to a single operating point. Use the
        # probability of the second class when the model is binary and
        # exposes probabilities, otherwise fall back to the predictions.
        y_score = y_pred
        if hasattr(model, "predict_proba"):
            probabilities = np.asarray(model.predict_proba(X))
            if probabilities.ndim == 2 and probabilities.shape[1] == 2:
                y_score = probabilities[:, 1]

        results["roc_auc"] = roc_auc_score(y, y_score)
        print("ROC-AUC Score:", results["roc_auc"])
        fpr, tpr, _ = roc_curve(y, y_score)
        plt.plot(fpr, tpr)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
    except (ValueError, TypeError) as error:
        # Best effort only: the basic metrics above have already been reported
        print("ROC analysis skipped:", error)
    plt.show()

    return results

In [12]:
def handle_outliers(df, threshold=1.5):
    """
    Handle outliers by replacing their value with the mean of the respective column.
    This function only handles numeric columns.

    A value is an outlier when it lies below Q1 - threshold * IQR or above
    Q3 + threshold * IQR of its column. The replacement mean is computed over
    the whole column, outliers included. Integer columns that contain
    outliers are converted to float so that the mean can be stored exactly.

    Parameters:
    - df: DataFrame containing the data
    - threshold: Threshold for detecting outliers (default is 1.5)

    Returns:
    - DataFrame with outliers replaced by mean
    """
    # Copy the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()

    # Get only numeric columns
    numeric_columns = df_copy.select_dtypes(include=['number']).columns

    # Calculate the first quartile (Q1) and third quartile (Q3) for each numeric column
    Q1 = df_copy[numeric_columns].quantile(0.25)
    Q3 = df_copy[numeric_columns].quantile(0.75)

    # Calculate the interquartile range (IQR) for each numeric column
    IQR = Q3 - Q1

    # Identify outliers for each numeric column using the IQR method
    for col in numeric_columns:
        lower_bound = Q1[col] - threshold * IQR[col]
        upper_bound = Q3[col] + threshold * IQR[col]
        outlier_mask = (df_copy[col] < lower_bound) | (df_copy[col] > upper_bound)

        # Nothing to replace: leave the column (and its dtype) untouched
        if not outlier_mask.any():
            continue

        column_mean = df_copy[col].mean()

        # Writing a fractional mean into an integer column relies on an
        # implicit upcast that pandas has deprecated; cast explicitly first
        if pd.api.types.is_integer_dtype(df_copy[col]):
            df_copy[col] = df_copy[col].astype(float)

        # Replace outliers with the mean of the respective column
        df_copy.loc[outlier_mask, col] = column_mean

    return df_copy

In [13]:
def master(url, column_names = None, problem_type='classification', model = None, params = None):
    """
    Master function to execute the workflow sequence of functions with defined input parameters.

    Steps: data collection, cleaning, transformation, exploratory analysis,
    outlier handling, then either model benchmarking (no model given) or
    train/test split, grid-search training and evaluation (model given).

    Parameters:
    - url: str, file path or URL to the data source
    - column_names: list, optional, column labels for header-less sources; its
      last entry names the target column. When omitted (or when that name is
      not a column of the loaded data) the last column is used as the target
    - problem_type: 'classification' or 'regression', forwarded to model_selection
    - model: optional estimator to tune and evaluate. When None, no model is
      trained and the model_selection benchmark is returned instead
    - params: dict, optional, hyperparameter grid for model_training. None
      means "no tuning": the model is fitted with its current settings

    Returns:
    - The result of model_selection when model is None, otherwise the
      result of model_evaluation on the held-out test split.
      NOTE: model_evaluation reports classification metrics only.
    """
    # Data Collection
    data = connect_to_data_source(url, column_names)

    # Data Cleaning
    data = clean_data(data)

    # Data Transformation
    data = data_transformation(data)

    # Exploratory Data Analysis
    perform_eda(data)

    # Outlier Handling
    data = handle_outliers(data)

    # Separate features and target; fall back to the last column when no
    # usable target name was supplied
    has_names = column_names is not None and len(column_names) > 0
    if has_names and column_names[-1] in data.columns:
        target_column = column_names[-1]
    else:
        target_column = data.columns[-1]
    X = data.drop(columns=[target_column])
    y = data[target_column].values

    # Model Selection: without a concrete estimator there is nothing to
    # train, so return the benchmark table for the caller to choose from
    if model is None:
        return model_selection(X, y, problem_type=problem_type, test_size=0.2, random_state=42)

    # Manual Data Splitting
    X_train, X_test, y_train, y_test = manual_data_splitting(X, y, test_size=0.2, random_state=42)

    # Model Training (an empty grid fits the model once, untuned)
    search_space = params if params is not None else {}
    trained_model = model_training(model, search_space, X_train, y_train)

    # Model Evaluation
    evaluation_results = model_evaluation(trained_model, X_test, y_test)

    return evaluation_results