## Project Thesis ~ Initial Structure

## Problem Types

 - Classification 
 - Regression
 - Clustering
 - Dimension Reduction
 - Data Visualization & Analysis
 

In [1]:
# Libraries

# Core Libraries
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Helpers
import pprint
from dateutil import parser
import ijson
import json
from datetime import datetime

# Visions | Data Type Detection
from visions.functional import detect_type, infer_type, cast_to_inferred
from visions.typesets import StandardSet
from visions.typesets import CompleteSet

# YData Profiling
from ydata_profiling import ProfileReport


## Define the Problem - Clear Objectives/Requirements
Problems before requirements, requirements before solutions, solutions before design, and design before technology.

- Problem types | Classification, Regression, Clustering, Dimension Reduction, Data Visualizations & Techniques
- Target
- Models | Random Forest | Optional

In [2]:
# Problem | {Problem Type, Target Variable}
def df_objective (problem_type, target_variable):

    return {
        'problem_type' : problem_type,
        'target_feature' : target_variable
    }

## Data Gathering - Tabular Data

**File Data Types**: json, csv, xlsx, xml, dataframe,

**API**: url link, http, ftp

**Category**: Web page, Image, Audio, Video

In [3]:
# File Characteristics
def file_descr (file_path): 
    descr = {
        'FileType': os.path.splitext(file_path)[1],
        'FileSize': os.path.getsize(file_path),
    }
    return descr

# Converters | File type to Dataframe
def df_convert (file_path, file_descr):

    # JSON
    if (file_descr['FileType'] == '.json'):

        # Open File and Convert it to JSON Object
        with open(file_path, 'r') as file:
            data = ijson.items(file, 'item')
            json_object = []
            count = 0
            print(data)
            for line in data:
                json_object.append(line)
                count = count + 1
                if(count == 1000): break
        return pd.DataFrame(json_object)

    # CSV
    elif (file_descr['FileType'] == '.csv'):

        # Check whether there is a header | TODO
        return pd.read_csv(file_path)
    else: 
        return False
    
# Gather Data
def df_gather(fp):

    # File Characteristics
    file_characteristics = file_descr(fp)

    # Dataframe Initialization 
    df = df_convert(fp, file_characteristics)

    return df

## Data Profiling

### Dataset
- Shape

In [4]:
# Features, Rows
def df_shape(df):
    return {
        'features': df.shape[1],
        'rows': df.shape[0]
    }

### Features
- Role | ID, Input (Indpendant), Target (Dependant)
- Data Type | Integer, Float, Boolean, Categorical, Complex, DateTime, Object, String
- Feature Type | Categorical (Ordinal, Binary), Numerical
- Level | Nominal, Interval, Ordinal, Binary

In [5]:
# Data Type | Standard Set | Integer,  Float, Boolean, Categorical, Complex, DateTime, Object, String
# typeset = StandardSet
# def f_data_type(f):
#     # f = detect_type(f, typeset)
#     # cast_f = cast_to_inferred(f, typeset)
#     inf_f = detect_type(f, typeset)
#     print(inf_f)
#     return str(inf_f)

typeset = StandardSet()
def f_data_type(f):
    return str((detect_type(f, typeset)))

# Feature Type | Categorical, Numerical, Alphanumerical | {Data Type, Unique values, Thres}
def f_feature_type(data_type, unique_values, cat_thres=10): 

    if ((data_type == 'String' or 
         data_type == 'Integer' or 
         data_type == 'Float' or 
         data_type == 'Boolean') and unique_values < cat_thres):
        return 'Categorical'
    elif (data_type == 'Integer' or data_type == 'Float'):
        return 'Numerical'
    else: 
        return 'Alphanumerical'
    
# Qualitative Characteristics | Nominal, Interval, Ordinal, Binary
def f_qual(data_type):
    if (data_type == 'String'):
        return 'Nominal'
    elif (data_type == 'Int' or data_type == 'Float'):
        return 
    
# Role | Input (Independant), ID (Unique Identifier)
def f_role(f, f_target):
    
    f_unique_values = len(f.unique())
    f_total_values = len(f)

    if (f_unique_values / f_total_values) > 0.9:
        return 'id'
    elif (f.name == f_target): 
        return 'target'
    else:
        return 'input' 
    
## Feature Profiling  | {Feature, Objective}
def f_profile(f, f_objective):
    
    # Data Type
    data_type = f_data_type(f)

    # Role
    role = f_role(f, f_objective['target_feature'])

    # Unique Values
    unique_values = len(f.value_counts())

    # Feature Type
    feature_type = f_feature_type(data_type, unique_values, 10)

    return {
        'data_type' : data_type,
        'role' : role,
        'unique_values' : unique_values,
        'feature_type' : feature_type
    }

## All feature profiling
def df_profile(df, df_objective):
    df_profile = {}

    # Dataset
    df_profile['dataset'] = df_shape(df)

    # Target Feature
    df_profile['target_feature'] = df_objective['target_feature']

    # Features
    df_features = {}
    for f in df.columns: 
        df_features[f] = f_profile(df[f], df_objective)
    df_profile['features'] = df_features

    return df_profile 

### Feature Profiling | Samples Data

In [6]:
# House Prices
# hp_profiling = df_profile(df_hp)

In [7]:
# Modcloth
# md_profiling = df_profile(df_md)

## Exploratory Analysis & Visualization | Descriptive Statistics

### Dataset
- Duplicates


In [8]:
## Duplicates | Check if they exist
def df_duplicates(df):
    return {
        'exist' : df.duplicated().any(),
        'sum' : df.duplicated().sum()
    } 

### Univariate Analysis 

- Interval, Ordinal Statistics | Count, Mean,  Std, Min, 25%, 50%, 75%, Max
- Missing Values 
- Outliers
- Histogram
- Box Plot

In [9]:
# Statistics
def f_statistics(f, d_type, f_type):

    f_statistics = {}
    if f_type == 'Numerical' and d_type != 'String':

        descr = f.describe()
        f_statistics = {
            'count' : int(descr['count']),
            'mean' : round(descr['mean'], 2),
            'std' : round(descr['std'], 2),
            'min' : round(descr['min'], 2),
            'max' : round(descr['max'], 2)
        }
    elif f_type == 'Categorical': 
        for i, y in f.value_counts().items():
            
            # Calculate Frequency of each categorical value
            freq = round(y / len(f), 2)
            f_statistics[i] = {
                'value': y,
                'frequency': freq
            }

    return f_statistics

# Missing Values | Return: {Total Missing Values, Percentage}
def f_missing_data(f):

    # Null Values
    null_values = f.isnull().sum() 

    # Empty Values 
    empty_values = f.isin(['']).sum()

    # Missing Values
    missing_values = null_values + empty_values

    ## Percentage
    percentage = round((missing_values / len(f)), 2)
    return {
        'missing_values': missing_values,
        'percentage': percentage
    }

## Univariate | Feature | {Dataframe_Feature, Data Type, Feature Type}
def f_univariate(f, d_type, f_type): 
    f_univariate = {}

    # Statistics
    f_univariate['statistics'] = f_statistics(f, d_type, f_type)

    # Missing Values
    f_univariate['missing_data'] = f_missing_data(f)

    return f_univariate

# Univariate | Dataframe | {Dataframe, Dataframe_Profiling}
def df_univariate(df, df_prof):

    df_univariate = {}
    for f, d in df_prof['features'].items():
        df_univariate[f] = f_univariate(df[f], d['data_type'], d['feature_type'])

    df_univariate['features'] = df_univariate

    return df_univariate

### Bivariate Analysis
 - How each variable correlates to target variable

In [10]:
# Correlation Analysis | Each feature with target variable
def f_corr (f, corr_matrix):
    if f in corr_matrix:
        return round(corr_matrix[f], 2)
    else:
        return ''
    
# Bivariate Analysis | Dataframe
def df_bivariate(df, df_prof):

    df_bivariate = {}

    # Correlation Analysis
    corr_matrix = df.corr(numeric_only=True)[df_prof['target_feature']]
    for f in df_prof['features']:
        df_bivariate[f] = f_corr(f, corr_matrix)
    
    return df_bivariate

### Multivariate Analysis
- Normality -> Data should look like normal distribution
- Homoscedasticity -> 
- Linearity -> Linear Patterns

## EDA 

In [11]:
# EDA  | {Datframe, Dataframe Profiling}
def df_eda (df, df_prof):

    df_eda = {}

    # Duplicates
    df_eda['duplicates'] = df_duplicates(df)

    # Univariate
    df_eda['univariate'] = df_univariate(df, df_prof)

    # Bivariate
    df_eda['bivariate'] = df_bivariate(df, df_prof)
 
    return  df_eda

## Data Description
Dataset description in one dictionary

In [12]:
# Dataset Description
def df_describe(df, df_objective):

    # Profile 
    df_prof = df_profile(df, df_objective)

    # Statistics
    df_stat = df_eda(df, df_prof)

    # Duplicates
    df_prof['dataset']['duplicates'] = df_stat['duplicates']

    # Univariate
    for f, i in df_prof['features'].items():
        df_prof['features'][f]['eda'] = df_stat['univariate'][f]

    return df_prof

## Data Cleaning & Correcting & Formatting & Completing (Transform)

### Drop Duplicates

### Data Cleaning 
- Data Imputation ~ Handle Missing Values ~ Methods: Mean, Median, KNN, Most Frequent Value, Random Numbers between mean & std, Exploit correlated feature(s)
- Data Anomaly Detection ~ Handle Outliers ~ Interquartile Range (IQR) method 

In [13]:
# Drop Duplicates
def df_drop_duplicates(df):

    return df.drop_duplicates()

# Drop Features | Duplicates, IDs, Missing Values > Thres 
def df_drop(df, df_descr, drop_perc = 0.9): 

    # Drop Features with missing percentage > thres
    for f, details in df_descr['features'].items():

        # If missing values are above threshold | Drop
        if (details['eda']['missing_data']['percentage'] >= drop_perc):
            df = df.drop(columns=[f])
    
    # Drop IDs
    columns = []
    for f, details in df_descr['features'].items():
        if (details['role'] == 'id'):
            columns.append(f)
    df = df.drop(columns=columns)

    return df

# Data Imputation | {Dataframe, Profile, EDA}
def df_impute(df: pd.DataFrame, df_descr: dict):

    # Split Categorical and Numerical Columns
    numerical_columns = []
    categorical_columns = []

    # For each feature
    for f, p in df_descr['features'].items():

        # Split | If missing percentage is above 0
        if (p['eda']['missing_data']['missing_values'] > 0.0):
            if (p['feature_type'] == 'Numerical'):
                numerical_columns.append(f)
            elif (p['feature_type'] == 'Categorical'):
                categorical_columns.append(f)

    # Imputers
    numerical_imputer = SimpleImputer(strategy='mean')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    # Mean | Numerical
    if (len(numerical_columns) > 0):
        df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

    # Mode | Categorical
    if (len(categorical_columns) > 0):
        df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

    return df

# Handle Outliers | {Dataframe, Profile, EDA, Threshold}
def df_handle_outliers(df : pd.DataFrame, df_profile : dict, df_eda : dict, iqr_thres : float = 1.5):

    # Numerical Columns
    numerical_columns = []

    for f, p in df_profile['features'].items():
        if (p['feature_type'] == 'Numerical'):
            numerical_columns.append(f)

    for f in numerical_columns: 
        Q1 = df[f].quantile(0.25)
        Q3 = df[f].quantile(0.75)

        IQR = Q3 - Q1 
        outliers = ((df[f] < (Q1 - iqr_thres * IQR)) | (df[f] > (Q3 + iqr_thres * IQR)))
        df = df[~outliers]
    
    return df

# Dataframe Clean | {Dataframe, Description, MV Thres, Outlier Thres}
def df_clean (df, df_descr, df_objective, mv_thres = 0.9, outlier_thres = 1.5):

    # Drop Duplicates
    df_duplic = df_drop_duplicates(df)

    # Drop Features
    df_dropped = df_drop(df_duplic, df_descr, mv_thres)

    ## Describe
    df_descr_dropped = df_describe(df_dropped, df_objective)

    # # Data Imputation | 0.5 %
    df_imputed = df_impute(df_dropped, df_descr_dropped)
    return df_imputed

    # ## Describe
    # df_descr_imputed = df_describe(df_imputed, df_objective)

    # # Drop Outliers | 1.5
    # df_cleaned = df_handle_outliers(df_imputed, df_descr_imputed, outlier_thres)

    # return df_cleaned

## Feature Engineering

- Feature Encoding 
- Feature Scaling
- Feature Extraction
- Feature Selection | Either in data-preprocessing, or the classifier

In [14]:
## Feature Extraction

In [15]:
## Feature Encoding | {Feature, Encoding Type}
def f_encode(f, f_name, encode_type = 'one-hot'):

    if (encode_type == 'one-hot'):

        one_hot_encoder = pd.get_dummies(f)
        one_hot_encoder = one_hot_encoder.astype(int)

        return one_hot_encoder

In [16]:
## Feature Selection | Remove features with low or zero variance

In [17]:
# Feature Engineering | {Dataframe, Profile, EDA}
def df_feature_eng(df, df_descr): 

    # Feature Encoding for categorical variables
    for f, details in df_descr['features'].items():
        if (details['feature_type'] == 'Categorical' and 
            details['role'] != 'id' and 
            details['role'] != 'target' and 
            details['data_type'] != 'Integer' and
            details['data_type'] != 'Float'):

            # Encode
            f_encoded = f_encode(df[f], f, 'one-hot')
            
            # Change Names of encoded df


            # Concat to existing df
            df = df.drop(columns=[f])
            df = pd.concat([df, f_encoded], axis=1)

    # Feature 
    return df

## Automated Data Preprocessing

In [18]:
# Automated Data Preprocessing | {File Source, Problem Type, Target}
def auto_preproc (file_source, problem_type, target, params = {}):

    # Gather
    df = df_gather(file_source)

    # Objective
    df_object = df_objective(problem_type, target)

    # Describe
    df_descr = df_describe(df, df_object)

    # Clean
    df_cleaned = df_clean(df, df_descr, df_object)

    # Describe | Cleaned
    df_descr_cleaned = df_describe(df_cleaned, df_object)

    # Feature Engineer
    df_f_eng = df_feature_eng(df_cleaned, df_descr_cleaned)

    # Describe | Feature Engineered
    # df_descr_f_eng = df_describe(df_f_eng, df_object)

    return {
        'df' : df,
        'descr' : df_descr, 
        'cleaned' : df_cleaned, 
        'descr_cleaned' : df_descr_cleaned, 
        'f_eng' : df_f_eng, 
        # 'descr_f_eng' : df_descr_f_eng 
    }

## Real Data Testing

In [19]:
# Titanic
# tn = auto_preproc('data/sample_data/titanic/train.csv', 'classification', 'Survived')

# House Prices
# hp = auto_preproc('data/sample_data/house_price_data/train.csv', 'classification', 'SalePrice')

# Wasabi
ws = auto_preproc('data/sample_data/Wasabi/batterytesters_dataset.json', 'classification', 'commonalarm')

# Credit Fraud
# df_cc = df_gather('data/sample_data/credit_card_fraud/creditcard.csv')

# Modcloth | TOCHECK
# file_path_md = 'data/sample_data/modcloth/modcloth_final_data.json'
# df_md = pd.read_json(file_path_md, lines=True)


<_yajl2.items object at 0x7f4ae66221f0>


In [None]:
ws_df = df_gather('data/sample_data/Wasabi/batterytesters_dataset.json')

In [20]:
ws['descr']['features']['commonalarm']

{'data_type': 'Boolean',
 'role': 'target',
 'unique_values': 1,
 'feature_type': 'Categorical',
 'eda': {'statistics': {False: {'value': 1000, 'frequency': 1.0}},
  'missing_data': {'missing_values': 0, 'percentage': 0.0}}}

In [22]:
ws['f_eng']

Unnamed: 0,datetime,btname,commonalarm,False,True,Unnamed: 6,B1,B2,B5,B9,...,False.1,True.1,False.2,True.2,False.3,True.3,False.4,True.4,False.5,True.5
0,1687625779795,DCE2,False,0,1,1,0,0,0,0,...,0,1,1,0,1,0,0,1,1,0
1,1687625779795,LC2,False,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,1,0
2,1687625779795,LC3,False,0,1,0,0,1,0,0,...,1,0,0,1,0,1,0,1,0,1
3,1687625779795,LC4,False,0,1,1,0,0,0,0,...,1,0,1,0,1,0,0,1,0,1
4,1687625779795,LC5,False,0,1,0,0,0,1,0,...,1,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1687627812881,2CH1,False,0,1,0,0,0,0,1,...,1,0,0,1,0,1,1,0,0,1
996,1687627812881,2CH2,False,0,1,0,0,0,0,1,...,1,0,0,1,0,1,1,0,0,1
997,1687627812881,2CH3,False,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,1
998,1687627812881,2CH4,False,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,1


## Utils

In [None]:
# Utils
for f, d in hp_descr_2['features'].items():

    if (d['eda']['missing_data']['percentage'] > 0.0):
        print('yes')

## Model Testing