## Project Thesis ~ Initial Structure

## Problem Types

 - Classification 
 - Regression
 - Clustering
 - Dimension Reduction
 - Data Visualization & Analysis
 

In [28]:
# Libraries

# Core Libraries
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Helpers
import pprint
from dateutil import parser
import ijson
import json
from datetime import datetime

# Visions | Data Type Detection
from visions.functional import detect_type, infer_type, cast_to_inferred
from visions.typesets import StandardSet
from visions.typesets import CompleteSet

# YData Profiling
from ydata_profiling import ProfileReport


## Define the Problem - Clear Objectives/Requirements
Problems before requirements, requirements before solutions, solutions before design, and design before technology.

- Problem types | Classification, Regression, Clustering, Dimension Reduction, Data Visualizations & Techniques
- Target
- Models | Random Forest | Optional

In [3]:
# Problem | {Problem Type, Target Variable}
def df_objective (problem_type, target_variable):

    return {
        'problem_type' : problem_type,
        'target_feature' : target_variable
    }

### Objective | Sample Data

In [4]:
# Titanic
tn_objective = df_objective('classification', 'Survived')

## Data Gathering - Tabular Data

**File Data Types**: json, csv, xlsx, xml, dataframe,

**API**: url link, http, ftp

**Category**: Web page, Image, Audio, Video

In [5]:
# File Characteristics
def file_descr (file_path): 
    descr = {
        'FileType': os.path.splitext(file_path)[1],
        'FileSize': os.path.getsize(file_path),
    }
    return descr

In [6]:
# Converters | File type to Dataframe

# Covert File to Dataframe
def df_convert (file_path, file_descr):

    # JSON
    if (file_descr['FileType'] == '.json'):

        # Open File and Convert it to JSON Object
        with open(file_path, 'r') as file:
            data = ijson.items(file, 'item')
            json_object = []
            count = 0
            print(data)
            for line in data:
                json_object.append(line)
                count = count + 1
                if(count == 1000): break
        return pd.DataFrame(json_object)

    # CSV
    elif (file_descr['FileType'] == '.csv'):

        # Check whether there is a header | TODO
        return pd.read_csv(file_path)
    else: 
        return False

In [7]:
## Gather Data
def df_gather(fp):

    # File Characteristics
    file_characteristics = file_descr(fp)

    # Dataframe Initialization 
    df = df_convert(fp, file_characteristics)

    return df

### Data Gathering | Sample Data

In [None]:
# Wasabi
df_ws = df_gather('data/sample_data/Wasabi/batterytesters_dataset.json')

In [None]:
# House Prices
df_hp = df_gather('data/sample_data/house_price_data/train.csv')

In [None]:
# Credit Card Fraud
df_cc = df_gather('data/sample_data/credit_card_fraud/creditcard.csv')

In [None]:
# Modcloth | TOCHECK
file_path_md = 'data/sample_data/modcloth/modcloth_final_data.json'
df_md = pd.read_json(file_path_md, lines=True)

In [8]:
# Titanic 
df_tn = df_gather('data/sample_data/titanic/train.csv')

## Data Profiling

### Dataset
- Shape

In [9]:
# Features, Rows
def df_shape(df):
    return {
        'features': df.shape[1],
        'rows': df.shape[0]
    }

### Features
- Role | ID, Input (Indpendant), Target (Dependant)
- Data Type | Integer, Float, Boolean, Categorical, Complex, DateTime, Object, String
- Feature Type | Categorical (Ordinal, Binary), Numerical
- Level | Nominal, Interval, Ordinal, Binary

In [10]:
# Data Type | Standard Set | Integer,  Float, Boolean, Categorical, Complex, DateTime, Object, String
typeset = StandardSet()
def f_data_type(f):
    return str((detect_type(f, typeset)))

In [11]:
# Feature Type | Categorical, Numerical, Alphanumerical | {Data Type, Unique values, Thres}
def f_feature_type(data_type, unique_values, threshold=10): 

    if ((data_type == 'String' or 
         data_type == 'Integer' or 
         data_type == 'Float') and unique_values < threshold):
        return 'Categorical'
    elif (data_type == 'Int' or data_type == 'Float'):
        return 'Numerical'
    else: 
        return 'Alphanumerical'

In [12]:
# Qualitative Characteristics | Nominal, Interval, Ordinal, Binary
def f_qual(data_type):
    if (data_type == 'String'):
        return 'Nominal'
    elif (data_type == 'Int' or data_type == 'Float'):
        return 

In [13]:
# Role | Input (Independant), ID (Unique Identifier)
def f_role(f):
    
    f_unique_values = len(f.unique())
    f_total_values = len(f)

    if (f_unique_values / f_total_values) > 0.9:
        return 'id'
    else: 
        return 'input' 

In [14]:
## Feature Profiling 
def f_profile(f):
    
    # Data Type
    data_type = f_data_type(f)

    # Role
    role = f_role(f)

    # Unique Values
    unique_values = len(f.value_counts())

    # Feature Type
    feature_type = f_feature_type(data_type, unique_values, 10)

    return {
        'data_type' : data_type,
        'role' : role,
        'unique_values' : unique_values,
        'feature_type' : feature_type
    }

In [15]:
## All feature profiling
def df_profile(df, f_target):
    df_profile = {}

    # Dataset
    df_profile['dataset'] = df_shape(df)

    # Target Feature
    df_profile['target_feature'] = f_target

    # Features
    df_features = {}
    for f in df.columns: 
        df_features[f] = f_profile(df[f])
    df_profile['features'] = df_features

    return df_profile 

### Feature Profiling | Samples Data

In [None]:
# House Prices
hp_profiling = df_profile(df_hp)

In [None]:
# Modcloth
md_profiling = df_profile(df_md)

In [16]:
# Titanic 
tn_profile = df_profile(df_tn, tn_objective['target_feature'])

## Exploratory Analysis & Visualization | Descriptive Statistics

### Dataset
- Duplicates


In [17]:
## Duplicates | Check if they exist
def df_duplicates(df):
    return {
        'exist' : df.duplicated().any(),
        'sum' : df.duplicated().sum()
    } 

### Univariate Analysis 

- Interval, Ordinal Statistics | Count, Mean,  Std, Min, 25%, 50%, 75%, Max
- Missing Values 
- Outliers
- Histogram
- Box Plot

In [18]:
# Statistics
def f_statistics(f, d_type, f_type):

    f_statistics = {}
    if f_type == 'Numerical' and d_type != 'String':

        descr = f.describe()
        f_statistics = {
            'count' : int(descr['count']),
            'mean' : round(descr['mean'], 2),
            'std' : round(descr['std'], 2),
            'min' : round(descr['min'], 2),
            'max' : round(descr['max'], 2)
        }
    elif f_type == 'Categorical': 
        for i, y in f.value_counts().items():
            f_statistics[i] = y

    return f_statistics

In [19]:
# Missing Values | Return: {Total Missing Values, Percentage}
def f_missing_data(f):

    ## Missing Values
    missing_values = f.isnull().sum()

    ## Percentage
    percentage = round((missing_values / len(f)), 2)
    return {
        'missing_values': missing_values,
        'percentage': percentage
    }

In [None]:
# Outliers | Univariate, Bivariate
def f_outliers (f):

In [20]:
## Univariate | Feature | {Dataframe_Feature, Data Type, Feature Type}
def f_univariate(f, d_type, f_type): 
    f_univariate = {}

    # Statistics
    f_univariate['statistics'] = f_statistics(f, d_type, f_type)

    # Missing Values
    f_univariate['missing_data'] = f_missing_data(f)

    return f_univariate

In [21]:
# Univariate | Dataframe | {Dataframe, Dataframe_Profiling}
def df_univariate(df, df_prof):

    df_univariate = {}
    for f, d in df_prof['features'].items():
        df_univariate[f] = f_univariate(df[f], d['data_type'], d['feature_type'])

    df_univariate['features'] = df_univariate

    return df_univariate

### Bivariate Analysis
 - How each variable correlates to target variable

In [22]:
# Correlation Analysis | Each feature with target variable
def f_corr (f, corr_matrix):
    if f in corr_matrix:
        return round(corr_matrix[f], 2)
    else:
        return ''

In [23]:
# Bivariate Analysis | Dataframe
def df_bivariate(df, df_prof):

    df_bivariate = {}

    # Correlation Analysis
    corr_matrix = df.corr(numeric_only=True)[df_prof['target_feature']]
    for f in df_prof['features']:
        df_bivariate[f] = f_corr(f, corr_matrix)
    
    return df_bivariate

### Multivariate Analysis
- Normality -> Data should look like normal distribution
- Homoscedasticity -> 
- Linearity -> Linear Patterns
- 

## EDA 

In [24]:
# EDA  | {Datframe, Dataframe Profiling}
def df_eda (df, df_prof):

    df_eda = {}

    # Duplicates
    df_eda['duplicates'] = df_duplicates(df)

    # Univariate
    df_eda['univariate'] = df_univariate(df, df_prof)

    # Bivariate
    df_eda['bivariate'] = df_bivariate(df, df_prof)
 
    return  df_eda

### Descriptive Statistics | Sample Data

In [25]:
# Titanic
tn_eda = df_eda(df_tn, tn_profile)

In [29]:
tn_profile

{'dataset': {'features': 12, 'rows': 891},
 'target_feature': 'Survived',
 'features': {'PassengerId': {'data_type': 'Integer',
   'role': 'id',
   'unique_values': 891,
   'feature_type': 'Alphanumerical'},
  'Survived': {'data_type': 'Integer',
   'role': 'input',
   'unique_values': 2,
   'feature_type': 'Categorical'},
  'Pclass': {'data_type': 'Integer',
   'role': 'input',
   'unique_values': 3,
   'feature_type': 'Categorical'},
  'Name': {'data_type': 'String',
   'role': 'id',
   'unique_values': 891,
   'feature_type': 'Alphanumerical'},
  'Sex': {'data_type': 'String',
   'role': 'input',
   'unique_values': 2,
   'feature_type': 'Categorical'},
  'Age': {'data_type': 'Float',
   'role': 'input',
   'unique_values': 88,
   'feature_type': 'Numerical'},
  'SibSp': {'data_type': 'Integer',
   'role': 'input',
   'unique_values': 7,
   'feature_type': 'Categorical'},
  'Parch': {'data_type': 'Integer',
   'role': 'input',
   'unique_values': 7,
   'feature_type': 'Categorical'}

In [26]:
tn_eda

{'duplicates': {'exist': False, 'sum': 0},
 'univariate': {'PassengerId': {'statistics': {},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Survived': {'statistics': {0: 549, 1: 342},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Pclass': {'statistics': {3: 491, 1: 216, 2: 184},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Name': {'statistics': {},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Sex': {'statistics': {'male': 577, 'female': 314},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Age': {'statistics': {'count': 714,
    'mean': 29.7,
    'std': 14.53,
    'min': 0.42,
    'max': 80.0},
   'missing_data': {'missing_values': 177, 'percentage': 0.2}},
  'SibSp': {'statistics': {0: 608, 1: 209, 2: 28, 4: 18, 3: 16, 8: 7, 5: 5},
   'missing_data': {'missing_values': 0, 'percentage': 0.0}},
  'Parch': {'statistics': {0: 678, 1: 118, 2: 80, 5: 5, 3: 5, 4: 4, 6: 1},
   'missing_data': {'missi

## Data Cleaning & Correcting & Formatting & Completing (Transform)

### Drop Duplicates

In [None]:
# Drop Duplicates
def drop_duplicates(df): 
    try: 
        df.drop_duplicates(inplace=True)
        return True
    except ValueError:
        return ValueError

### Data Imputation ~ Handle Missing Values

Methods: Mean, Median, KNN, Most Frequent Value, Random Numbers between mean & std, Exploit correlated feature(s)

In [30]:
# Handle Missing Values | {Dataframe, Feature, Profile, EDA}

## Check if drop or not
def f_drop_feature(f_eda):

    missing_percentage = f_eda['f_missing_data']['percentage']

    if (missing_percentage > 0.9):
        return True
    else:
        return False


def df_handle_missing_values(df, f, f_profile, f_eda):

    ## Check | If 10% of values is missing drop featureA

    # Remove
    if (f_drop_feature(f_eda)):
        f.drop()  

    # Mean | Numerical
    imputer = SimpleImputer(strategy='mean')

    # Mode | Categorical

    # Correlated Features

### Handle Outliers

In [None]:
# Handle Outliers
def f_handle_outliers(f):

In [None]:
## Imputation Types | Simple, Iterative

## Numeric | Drop, Mean, Median, Mode, KNN, Interpolated, Most Frequent
# df_hp.fillna(df_hp.mean(), inplace=True)

## Categorical | Drop, Mean, Median, Mode, KNN


In [None]:
missing_data(df_hp, feature_hp)

In [None]:
print(duplicates_exists(df_hp))
print(duplicates_exists(df_wasabi))
print(drop_duplicates(df_hp))


## Feature Engineering

- Feature Extraction
- Feature Selection | Either in data-preprocessing, or the classifier

In [None]:
## Feature Selection | Remove features with low or zero variance