<h3>Instructions on how to use this notebook</h3>
Note: this notebook takes a config file as input and outputs the features (X) and labels (y), ready to be used by a model
1. Install the imbalanced-learn package:
<ul> Open an anaconda Prompt </ul>
<ul> Run <code>pip install -U imbalanced-learn</code> </ul>

1. Configure your parameters using the transform_config.ini config file
<ul> sql_query = the sql query to fetch the parameters of interest (features + labels) from the database </ul>
<ul>columns = list of column names that were used in the sql_query SELECT clause, will need it to load the data of the sql_query into a pandas dataframe </ul>
<ul> categorical_col_list = the list of column names from columns that are categorical (these will need to be encoded) </ul>
<ul> numeric_col_list = the list of column names from columns that are numerical (these will need to be normalized) </ul>
<ul> label = the label you want your model to learn to predict </ul>
<ul> labels_to_drop = list of column names that were fetched by the sql query but that you don't want to use as features (will be dropped during extraction) - optional - </ul>
<ul> top_k = top k features to use for the model </ul>
2. Run the notebook and call preprocess_and_transform_data which returns X and y of your data



In [1]:
from configparser import ConfigParser
import configparser
import psycopg2
import matplotlib.pyplot as plt
import pandas as pd 
from collections import Counter 
from sklearn import preprocessing
from imblearn.under_sampling import NearMiss
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.preprocessing import OrdinalEncoder 

In [2]:
def config(filename='database.ini', section='postgresql'):
    """
    Read one section of an ini file and return its options as a dict.

    filename: path of the ini file to parse (defaults to 'database.ini')
    section: name of the section to extract (defaults to 'postgresql')

    Returns a dict mapping every option name of the section to its string value.
    Raises Exception when the section is not present in the parsed file.
    """
    ini_parser = ConfigParser()
    ini_parser.read(filename)

    # Guard clause: nothing to return when the requested section is absent
    if not ini_parser.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in ini_parser.items(section)}

In [3]:
# Establish the connection and create a cursor to the database
def connect(cfg):
    """
    Open a database connection and a cursor on it.

    cfg: dict of keyword arguments forwarded to psycopg2.connect

    Returns (conn, cursor).
    Raises whatever psycopg2 raised when connecting or creating the cursor;
    the error is printed first and then propagated, so callers never receive
    a half-initialised result.
    """
    print("Attempting to connect to the database")
    try:
        conn = psycopg2.connect(**cfg)
    except Exception as error:
        print(error)
        raise

    try:
        cursor = conn.cursor()
    except Exception as error:
        print(error)
        # Do not leak the connection that was already opened
        conn.close()
        raise

    print("Connected!")
    return conn, cursor

def close_connection(connection, cursor):
    """
    Close the cursor, then the connection, printing progress messages.

    connection: object exposing close()
    cursor: object exposing close(); it is closed before the connection
    """
    print("Closing connection")
    for resource in (cursor, connection):
        resource.close()
    print("Connection closed!")

In [4]:
def fetch(cursor, query):
    """
    Execute a query and return its complete result set.

    cursor: database cursor exposing execute() and fetchall()
    query: sql query to execute

    Returns the list of rows produced by cursor.fetchall() (one tuple per row).
    Raises the error raised by the cursor; it is printed first and then
    propagated instead of being swallowed, so a failed query cannot be
    mistaken for an empty result.
    """
    print("Fetching query...")
    try:
        cursor.execute(query)
        # Complete result set: list of tuples where each tuple is a row
        result_list = cursor.fetchall()
    except Exception as error:
        print(error)
        raise
    print("Fetched!")
    return result_list

In [5]:
def load(result_list, cols):
    """
    Build a pandas dataframe from fetched rows.

    result_list: rows to load (e.g. a list of tuples)
    cols: column names, in the same order as the values of each row

    Returns the resulting dataframe.
    """
    print("Loading data in dataframe")
    return pd.DataFrame(result_list, columns=cols)

__Separate labels and features__

In [6]:
def get_label_and_features(df, label_name, labels_to_drop):
    """
    Split a dataframe into a feature dataframe and an encoded label array.

    df: dataframe holding both the label column and the feature columns
    label_name: name of the column the model should predict
    labels_to_drop: names of additional columns to remove from the features

    Returns (features_df, y, label_list):
      features_df: copy of df without label_name and without labels_to_drop
                   (df itself is left untouched)
      y: values of label_name encoded with a LabelEncoder
      label_list: the encoder's classes_ as a list
    """
    print("Extracting labels and features...")

    # Encode the label column
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(df[label_name])
    label_list = list(label_encoder.classes_)

    # Remove the label first, then every other unwanted column, one at a time
    features_df = df.drop(columns=[label_name])
    for unwanted_col in labels_to_drop:
        features_df = features_df.drop(columns=[unwanted_col])

    print("Labels and features extracted!")
    return features_df, y, label_list

def encode(df, cat_cols_list):
    """
    Ordinal-encode the categorical columns of df and return a new dataframe.

    df: feature dataframe
    cat_cols_list: names of the categorical columns; names that are not
                   columns of df (for instance a label that was already
                   dropped) are ignored

    Returns a dataframe with the same columns as df and a fresh 0..n-1 index.
    Columns named in cat_cols_list are replaced by their ordinal codes.
    Numeric columns that are not listed keep their values instead of being
    turned into rank indices. Any other non-numeric column is encoded as
    well, so the returned frame is entirely numeric.
    """
    print("Encoding categorical features...")
    listed = set(cat_cols_list)
    cols_to_encode = [
        col for col in df.columns
        if col in listed or not pd.api.types.is_numeric_dtype(df[col])
    ]

    # reset_index returns a new frame, so the caller's df is not modified
    df_encoded = df.reset_index(drop=True)
    if not cols_to_encode:
        return df_encoded

    enc = OrdinalEncoder()
    df_encoded[cols_to_encode] = enc.fit_transform(df_encoded[cols_to_encode])
    print(enc.categories_)
    return df_encoded

def normalize(df, numeric_col_list):
    """
    Min-max scale the columns of df.

    df: dataframe to scale; every column is passed to the scaler
    numeric_col_list: accepted but not read by this function

    Returns a new dataframe holding the scaled values under the column names
    of df.
    """
    print("Normalizing numerical features...")
    scaled_values = preprocessing.MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)
    

In [7]:
def preprocess(df, label_name, labels_to_drop, categorical_col_list, numeric_col_list):
    """
    Turn a raw dataframe into features and labels, in this order:
    (1) Convert the columns of numeric_col_list with pd.to_numeric
        (the converted columns are written back into df)
    (2) Extract the label of interest and drop labels_to_drop
    (3) Pass the features through encode()
    (4) Pass the encoded features through normalize()

    Returns (X, y, feature_col, label_list, normalized_df):
      X: values of the normalized dataframe
      y: encoded label
      feature_col: columns of the normalized dataframe
      label_list: label classes returned by get_label_and_features
      normalized_df: the normalized dataframe itself
    """
    print("Started preprocessing...")

    for numeric_col in numeric_col_list:
        df[numeric_col] = pd.to_numeric(df[numeric_col])

    features_df, y, label_list = get_label_and_features(df, label_name, labels_to_drop)
    encoded_df = encode(features_df, categorical_col_list)
    normalized_df = normalize(encoded_df, numeric_col_list)

    print("Preprocessing done!")
    return normalized_df.values, y, normalized_df.columns, label_list, normalized_df
    

__Perform feature selection__

In [8]:
def select_features(X_train, y_train, X_test, top_k, feature_col):
    """
    Keep the top_k best features according to the chi2 score.

    X_train, y_train: data the selector is fitted on
    X_test: data reduced with the selector fitted on the training data
    top_k: number of features to keep
    feature_col: feature names, in the same order as the columns of X_train

    Returns (X_train_new, X_test_new, new_features) where new_features lists
    the names of the kept features in their original order.
    """
    print("Started feature selection...")
    print(f"Selecting top {top_k} features")

    selector = SelectKBest(chi2, k=top_k).fit(X_train, y_train)
    X_train_new = selector.transform(X_train)
    X_test_new = selector.transform(X_test)

    # get_support() yields one boolean per input feature: True when it is kept
    new_features = [
        name for is_kept, name in zip(selector.get_support(), feature_col) if is_kept
    ]

    print("Feature selection done!")
    return X_train_new, X_test_new, new_features

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

def tree_feature_select(X_train, y_train, X_test, top_k, feature_col):
    """
    Select features using the importances of an ExtraTreesClassifier.

    X_train, y_train: data the classifier is fitted on
    X_test: data reduced with the same selection as X_train
    top_k: upper bound on the number of kept features (max_features of
           SelectFromModel); the default importance threshold still applies,
           so fewer than top_k features may be kept
    feature_col: feature names, in the same order as the columns of X_train

    Returns (X_train_new, X_test_new, new_features), the same shape of result
    as select_features.
    """
    print("Started tree feature selection...")
    clf = ExtraTreesClassifier(n_estimators=50)
    # Fit on the training data passed in, not on notebook-level globals
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True, max_features=top_k)
    X_train_new = model.transform(X_train)
    X_test_new = model.transform(X_test)
    new_features = [
        name for is_kept, name in zip(model.get_support(), feature_col) if is_kept
    ]
    print("Feature selection done!")
    return X_train_new, X_test_new, new_features
    

__Notebook entry point__

Inputs to the preprocessor

Connect to the database and load data in pandas

In [9]:
def _split_csv(raw_value):
    """Split a comma-separated config value into trimmed, non-empty items."""
    flattened = raw_value.replace('\n', ' ')
    return [item.strip() for item in flattened.split(',') if item.strip()]


def load_config(config_file):
    """
    Read the transformation settings from the DEFAULT section of an ini file.

    config_file: path of the ini file

    Returns (sql_query, columns, categorical_col_list, numeric_col_list,
    label, labels_to_drop, top_k):
      sql_query: the query on a single line (newlines replaced by spaces)
      columns, categorical_col_list, numeric_col_list: lists of names parsed
          from comma-separated values; whitespace around names and line
          breaks inside the value are ignored, an empty value gives []
      label: name of the label column
      labels_to_drop: list of names, [] when the option is empty or absent
      top_k: integer
    Raises KeyError when a required option is missing and ValueError when
    top_k is not an integer.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    defaults = parser['DEFAULT']

    # Collapse a multi-line query onto one line
    sql_query = defaults['sql_query'].replace('\n', ' ').strip()
    columns = _split_csv(defaults['columns'])
    categorical_col_list = _split_csv(defaults['categorical_col_list'])
    numeric_col_list = _split_csv(defaults['numeric_col_list'])
    label = defaults['label']
    # labels_to_drop is optional: tolerate both a missing and an empty option
    labels_to_drop = _split_csv(defaults.get('labels_to_drop', fallback=''))
    top_k = int(defaults['top_k'])
    return sql_query, columns, categorical_col_list, numeric_col_list, label, labels_to_drop, top_k

In [10]:
def load_data():
    """
    Fetch the configured query from the database into a dataframe.

    Reads the database settings with config(), runs the module-level
    sql_query, and loads the rows under the module-level columns. The
    connection and cursor are closed once the rows are fetched, including
    when the fetch raises. Prints the value counts of the module-level label
    column as a class-imbalance check.

    Returns the dataframe holding the query result.
    """
    print("Loading data...")
    # Get the configuration file as a python dict
    cfg = config()
    # Connect to the database
    conn, cursor = connect(cfg)
    try:
        # Fetch the data
        result_list = fetch(cursor, sql_query)
    finally:
        # Release the database resources whether or not the fetch succeeded
        close_connection(conn, cursor)
    # Load results in dataframe
    result_df = load(result_list, columns)
    print("Data loaded into dataframe!")
    print(f"Class imbalance check for label: {label}")
    print(Counter(result_df[label]))
    return result_df

In [11]:
def data_transform(result_df, top_k):
    """
    Preprocess the loaded dataframe into model inputs.

    result_df: dataframe holding the label and feature columns
    top_k: accepted but not read, because no feature selection is applied here

    Uses the module-level label, labels_to_drop, categorical_col_list and
    numeric_col_list settings.

    Returns (X, y, feature_col, label_list, normalized_df) exactly as produced
    by preprocess().
    """
    print("Started data transformation...")

    # Extract labels and features, then preprocess the features
    preprocessed = preprocess(
        result_df, label, labels_to_drop, categorical_col_list, numeric_col_list
    )
    X, y, feature_col, label_list, normalized_df = preprocessed

    # NOTE: feature selection (select_features / tree_feature_select) is not
    # applied at this stage
    print("Data transformation done! Data is ready for training.")
    return X, y, feature_col, label_list, normalized_df


In [12]:
# Load the transformation settings once at module level: load_data() and
# data_transform() read these names as globals.
sql_query, columns, categorical_col_list, numeric_col_list, label, labels_to_drop, top_k = load_config('transform_config.ini')

def preprocess_and_transform_data():
    """
    Notebook entry point: load the data, then transform it.

    Returns (X, y, feature_col, label_list, normalized_df, result_df): the
    five values returned by data_transform() followed by the dataframe
    returned by load_data().
    """
    loaded_df = load_data()
    return (*data_transform(loaded_df, top_k), loaded_df)