## Project Thesis ~ Initial Structure

## Problem Types

 - Classification 
 - Regression
 - Clustering
 - Dimension Reduction
 - Data Visualization & Analysis
 

In [1]:
# Libraries

# Core Libraries
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Helpers
import ijson
from datetime import datetime

# Visions | Data Type Detection
from visions.functional import detect_type
from visions.typesets import StandardSet
import visions

# Warnings
import warnings

# Settings
# Silence every warning of category UserWarning for the rest of the session.
warnings.filterwarnings('ignore', category=UserWarning)
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Utils

In [2]:
# Utils
def has_missing_data(df_descr):
    """Report which features of a dataset description contain missing data.

    For every feature whose ``eda.missing_data.percentage`` is above zero the
    function prints ``yes`` followed by the feature name (unchanged legacy
    output) and, in addition, returns whether any such feature was found.

    Args:
        df_descr: Description dictionary with a ``'features'`` mapping of
            feature name -> details, where each details dict exposes
            ``['eda']['missing_data']['percentage']``.

    Returns:
        bool: True when at least one feature has a missing percentage > 0,
        False otherwise (including when there are no features at all).
    """
    found = False
    for f, d in df_descr['features'].items():
        if d['eda']['missing_data']['percentage'] > 0.0:
            print('yes')
            print(f)
            found = True

    return found

## Define the Problem - Clear Objectives/Requirements
Problems before requirements, requirements before solutions, solutions before design, and design before technology.

- Problem types | Classification, Regression, Clustering, Dimension Reduction, Data Visualizations & Techniques
- Target
- Models | Random Forest | Optional

In [3]:
# Problem | {Problem Type, Target Variable}
def df_objective (problem_type, target_variable):
    """Bundle the problem type and the target feature name into one dict.

    Args:
        problem_type: Label of the problem being solved.
        target_variable: Name of the target column.

    Returns:
        dict: ``{'problem_type': ..., 'target_feature': ...}``.
    """
    objective = dict(problem_type=problem_type, target_feature=target_variable)
    return objective

## Data Gathering - Tabular Data

**File Data Types**: json, csv, xlsx, xml, dataframe

**API**: url link, http, ftp

**Category**: Web page, Image, Audio, Video

In [4]:
# File Characteristics
def file_descr (file_path):
    """Describe a file on disk by its extension and its size.

    Args:
        file_path: Path of an existing file.

    Returns:
        dict: ``'FileType'`` holds the extension as returned by
        ``os.path.splitext`` (leading dot included) and ``'FileSize'`` the
        value of ``os.path.getsize``.
    """
    _, extension = os.path.splitext(file_path)
    size = os.path.getsize(file_path)
    return {'FileType': extension, 'FileSize': size}

# Converters | File type to Dataframe
def df_convert (file_path, file_descr, nrows = 10000):
    """Load at most ``nrows`` records of a supported file into a DataFrame.

    Args:
        file_path: Path of the file to read.
        file_descr: File description dict; only ``'FileType'`` (the extension
            with its leading dot) is read. The match is case-insensitive.
        nrows: Maximum number of records to load, for both JSON and CSV.
            ``None`` loads everything.

    Returns:
        pandas.DataFrame for ``.json`` and ``.csv`` files, ``False`` for any
        other file type (legacy sentinel kept for existing callers).
    """
    from itertools import islice

    file_type = str(file_descr['FileType']).lower()

    # JSON
    if file_type == '.json':

        # Stream the elements under the 'item' prefix instead of loading the
        # whole document; ijson is given a binary handle and the stream is cut
        # after `nrows` records (previously a hard-coded 1000).
        with open(file_path, 'rb') as file:
            records = list(islice(ijson.items(file, 'item'), nrows))
        return pd.DataFrame(records)

    # CSV
    if file_type == '.csv':

        # Check whether there is a header | TODO
        return pd.read_csv(file_path, nrows=nrows)

    # Unsupported file type
    return False
    
# Gather Data
def df_gather(fp):
    """Read the file at ``fp`` into a DataFrame.

    The file is first described with ``file_descr`` and that description is
    handed to ``df_convert`` (called with its default row limit), whose result
    is returned as is.
    """
    return df_convert(fp, file_descr(fp))

## Data Profiling

### Dataset
- Shape

In [5]:
# Features, Rows
def df_shape(df):
    """Return the number of columns ('features') and rows of a DataFrame."""
    dims = df.shape
    return dict(features=dims[1], rows=dims[0])

### Features
- Role | ID, Input (Independent), Target (Dependent)
- Data Type | Integer, Float, Boolean, Categorical, Complex, DateTime, Object, String
- Feature Type | Categorical (Ordinal, Binary), Numerical
- Level | Nominal, Interval, Ordinal, Binary

In [6]:
# Data Type | Standard Set | Integer,  Float, Boolean, Categorical, Complex, DateTime, Object, String
# Single module-level typeset instance, shared by every f_data_type call.
typeset = StandardSet()
def f_data_type(f : pd.DataFrame):
    """Return the name of the data type inferred for a feature.

    The values are cast to ``str`` first, the shared ``typeset`` infers a type
    from those strings, and the inferred type is returned as its string form.

    NOTE(review): the annotation says DataFrame, but f_profile passes a single
    column (``df[f]``) — confirm and tighten the annotation.
    NOTE(review): presumably the cast to str turns missing values into literal
    text before inference — verify that this does not skew the inferred type.
    """

    return str(typeset.infer_type(f.astype(str)))

# Feature Type | Categorical, Numerical, Alphanumerical | {Data Type, Unique values Ratio, Thres}
def f_feature_type(data_type, unique_values_ratio, thres = 0.1):
    """Classify a feature from its data type and its unique-values ratio.

    Args:
        data_type: Data type name of the feature.
        unique_values_ratio: Share of distinct values in the feature.
        thres: Ratio below which a String/Integer/Float/Boolean feature is
            treated as categorical.

    Returns:
        str: 'Categorical', 'Numerical' or 'Alphanumerical'.
    """
    # Low-cardinality features of these types are treated as categories
    if data_type in ('String', 'Integer', 'Float', 'Boolean') and unique_values_ratio < thres:
        return 'Categorical'

    # Remaining numeric types
    if data_type in ('Integer', 'Float'):
        return 'Numerical'

    # Everything else
    return 'Alphanumerical'
    
# Qualitative Characteristics | Nominal, Interval, Ordinal, Binary
def f_qual(data_type):
    """Return the measurement level for a data type, where one is defined.

    Only 'String' is mapped so far (to 'Nominal'). Numeric types are matched
    by the same names used by ``f_feature_type`` ('Integer' / 'Float'; the
    earlier 'Int' literal never matched) but have no level assigned yet.

    Args:
        data_type: Data type name of the feature.

    Returns:
        'Nominal' for 'String', otherwise ``None``.
    """
    if data_type == 'String':
        return 'Nominal'

    if data_type in ('Integer', 'Float'):
        # TODO: decide the level (Interval / Ordinal) for numeric features
        return None

    # Boolean, DateTime, ... are not mapped yet
    return None
    
# Role | Target, ID (Unique Identifier), Input (Independent)
def f_role(f, f_target, id_thres = 0.9):
    """Assign a role to a feature: 'target', 'id' or 'input'.

    The target check runs first: a target with many distinct values (for
    example a continuous one) used to be labelled 'id' because the uniqueness
    test was evaluated before the name comparison.

    Args:
        f: Feature column (pandas Series); its ``name`` is compared with the
            target name.
        f_target: Name of the target feature.
        id_thres: Share of distinct values (NaN counts as a value) above which
            a non-target feature is considered an identifier.

    Returns:
        str: 'target', 'id' or 'input'. An empty feature is an 'input'.
    """
    if f.name == f_target:
        return 'target'

    f_total_values = len(f)
    if f_total_values == 0:
        # No rows: the uniqueness ratio is undefined, avoid dividing by zero
        return 'input'

    f_unique_values = len(f.unique())
    if (f_unique_values / f_total_values) > id_thres:
        return 'id'

    return 'input'
    
## Feature Profiling  | {Feature, Objective, Unique Values Threshold}
def f_profile(f, f_objective, unique_values_thres):
    """Profile one feature: data type, role, distinct values, feature type.

    Args:
        f: Feature column.
        f_objective: Objective dict; only ``'target_feature'`` is read.
        unique_values_thres: Threshold forwarded to ``f_feature_type``.

    Returns:
        dict with the keys 'data_type', 'role', 'unique_values' and
        'feature_type'.
    """
    inferred_type = f_data_type(f)
    assigned_role = f_role(f, f_objective['target_feature'])

    # Distinct values reported by value_counts, and their share of the
    # entries counted by f.count()
    n_distinct = len(f.value_counts())
    distinct_share = n_distinct / f.count()

    kind = f_feature_type(inferred_type, distinct_share, unique_values_thres)

    return {
        'data_type' : inferred_type,
        'role' : assigned_role,
        'unique_values' : n_distinct,
        'feature_type' : kind
    }

## All feature profiling
def df_profile(df, df_objective, unique_values_ratio):
    """Profile a whole DataFrame.

    Args:
        df: DataFrame to profile.
        df_objective: Objective dict; ``'target_feature'`` is copied into the
            result and the dict is forwarded to ``f_profile``.
        unique_values_ratio: Threshold forwarded to ``f_profile``.

    Returns:
        dict with 'dataset' (shape), 'target_feature' and 'features'
        (column name -> feature profile).
    """
    profile = {
        'dataset': df_shape(df),
        'target_feature': df_objective['target_feature'],
    }

    # One profile per column, in column order
    profile['features'] = {
        column: f_profile(df[column], df_objective, unique_values_ratio)
        for column in df.columns
    }

    return profile

## Exploratory Analysis & Visualization | Descriptive Statistics

### Dataset
- Duplicates


In [7]:
## Duplicates | Check if they exist
def df_duplicates(df):
    """Tell whether duplicated rows exist and how many there are.

    Returns:
        dict: ``'exist'`` is true when at least one row is flagged by
        ``DataFrame.duplicated``; ``'sum'`` is the number of flagged rows.
    """
    duplicate_mask = df.duplicated()
    return {'exist': duplicate_mask.any(), 'sum': duplicate_mask.sum()}

### Univariate Analysis 

- Interval, Ordinal Statistics | Count, Mean,  Std, Min, 25%, 50%, 75%, Max
- Missing Values 
- Outliers
- Histogram
- Box Plot

In [8]:
# Statistics
def f_statistics(f, d_type, f_type):
    """Descriptive statistics of one feature.

    Args:
        f: Feature column.
        d_type: Data type name of the feature.
        f_type: Feature type ('Numerical', 'Categorical', ...).

    Returns:
        dict: for a 'Numerical' feature whose data type is not 'String', the
        count plus mean/std/min/max rounded to two decimals; for a
        'Categorical' feature, one entry per value holding its number of
        occurrences and its frequency over ``len(f)``; otherwise ``{}``.
    """
    if f_type == 'Numerical' and d_type != 'String':
        summary = f.describe()
        stats = {'count': int(summary['count'])}
        for key in ('mean', 'std', 'min', 'max'):
            stats[key] = round(summary[key], 2)
        return stats

    if f_type == 'Categorical':
        # Frequency of each categorical value relative to all rows
        return {
            label: {
                'value': occurrences,
                'frequency': round(occurrences / len(f), 2)
            }
            for label, occurrences in f.value_counts().items()
        }

    return {}

# Missing Values | Return: {Total Missing Values, Percentage}
def f_missing_data(f):
    """Count the missing entries of a feature.

    Null values and empty strings (``''``) both count as missing.

    Returns:
        dict: ``'missing_values'`` is the number of missing entries and
        ``'percentage'`` that number divided by ``len(f)``, rounded to two
        decimals (a ratio between 0 and 1, despite the key name).
    """
    n_null = f.isnull().sum()
    n_empty = f.isin(['']).sum()
    n_missing = n_null + n_empty

    return {
        'missing_values': n_missing,
        'percentage': round((n_missing / len(f)), 2)
    }

## Univariate | Feature | {Dataframe_Feature, Data Type, Feature Type}
def f_univariate(f, d_type, f_type):
    """Univariate analysis of one feature.

    Returns:
        dict: ``'statistics'`` from ``f_statistics`` and ``'missing_data'``
        from ``f_missing_data``.
    """
    return {
        'statistics': f_statistics(f, d_type, f_type),
        'missing_data': f_missing_data(f),
    }

# Univariate | Dataframe | {Dataframe, Dataframe_Profiling}
def df_univariate(df, df_prof):
    """Univariate analysis of every profiled feature.

    The previous implementation stored the result dict inside itself under
    ``'features'``, creating a circular reference (endless nesting, not
    serialisable). The result now holds a separate per-feature mapping under
    ``'features'`` and, for backward compatibility, the same entries at the top
    level, so both ``result[name]`` and ``result['features'][name]`` work.

    Args:
        df: DataFrame holding the features.
        df_prof: Profile dict; ``df_prof['features']`` maps each feature name
            to details exposing 'data_type' and 'feature_type'.

    Returns:
        dict: feature name -> univariate analysis, plus the key 'features'
        pointing to the per-feature mapping.
    """
    per_feature = {
        f: f_univariate(df[f], d['data_type'], d['feature_type'])
        for f, d in df_prof['features'].items()
    }

    # Flat copy keeps result[<feature>] lookups working without a cycle
    result = dict(per_feature)
    result['features'] = per_feature

    return result

### Bivariate Analysis
 - How each variable correlates to target variable

In [9]:
# Correlation Analysis | Each feature with target variable
def f_corr (f, corr_matrix):
    """Look up the correlation of feature ``f`` in ``corr_matrix``.

    Returns:
        The value stored under ``f`` rounded to two decimals, or an empty
        string when ``f`` is not contained in ``corr_matrix``.
    """
    if f not in corr_matrix:
        return ''

    return round(corr_matrix[f], 2)
    
# Bivariate Analysis | Dataframe
def df_bivariate(df, df_prof):
    """Correlation of every profiled feature with the target feature.

    Side effect: when the profiled data type of the target is 'String', the
    target column of ``df`` is replaced in place by its label-encoded values,
    so the caller's DataFrame is modified.

    Args:
        df: DataFrame holding the features and the target.
        df_prof: Profile dict providing 'target_feature' and 'features'.

    Returns:
        dict: feature name -> value returned by ``f_corr`` for that feature.
    """
    target = df_prof['target_feature']

    # Check if target feature is in String format | Label Encoding
    if df_prof['features'][target]['data_type'] == 'String':
        df[target] = LabelEncoder().fit_transform(df[target])

    # Correlation Analysis | numeric columns against the target
    target_corr = df.corr(numeric_only=True)[target]

    return {f: f_corr(f, target_corr) for f in df_prof['features']}

### Multivariate Analysis
- Normality -> Data should look like normal distribution
- Homoscedasticity -> Constant variance of the errors across the range of the predictors
- Linearity -> Linear Patterns

## EDA 

In [10]:
# EDA  | {Dataframe, Dataframe Profiling}
def df_eda (df, df_prof):
    """Run the exploratory analysis steps on a profiled DataFrame.

    The steps run in this order: duplicates, univariate, bivariate.

    Returns:
        dict with the keys 'duplicates', 'univariate' and 'bivariate'.
    """
    return {
        'duplicates': df_duplicates(df),
        'univariate': df_univariate(df, df_prof),
        'bivariate': df_bivariate(df, df_prof),
    }

## Data Description
Dataset description in one dictionary

In [11]:
# Dataset Description | {Dataframe, Objective, Unique Values Ratio}
def df_describe(df, df_objective, unique_values_ratio):
    """Build the full description of a dataset in one dictionary.

    The profile from ``df_profile`` is enriched with the results of
    ``df_eda``: the duplicates summary is stored under
    ``['dataset']['duplicates']`` and each feature receives its univariate
    analysis under ``'eda'``. The bivariate results are computed by ``df_eda``
    but are not copied into the description.

    Returns:
        dict: the enriched profile.
    """
    description = df_profile(df, df_objective, unique_values_ratio)
    analysis = df_eda(df, description)

    # Duplicates
    description['dataset']['duplicates'] = analysis['duplicates']

    # Univariate | attach to each feature
    univariate = analysis['univariate']
    for name, feature_profile in description['features'].items():
        feature_profile['eda'] = univariate[name]

    return description

## Data Cleaning & Correcting & Formatting & Completing (Transform)

### Drop Duplicates

### Data Cleaning 
- Data Imputation ~ Handle Missing Values ~ Methods: Mean, Median, KNN, Most Frequent Value, Random Numbers between mean & std, Exploit correlated feature(s)
- Data Anomaly Detection ~ Handle Outliers ~ Interquartile Range (IQR) method 

In [12]:
# Drop Duplicates
def df_drop_duplicates(df):
    """Return ``df`` without its duplicated rows (first occurrence kept)."""
    deduplicated = df.drop_duplicates()
    return deduplicated

# Drop Features | Missing Values >= Thres (dropping IDs is disabled)
def df_drop(df, df_descr, drop_perc = 0.9):
    """Drop the features whose share of missing values reaches ``drop_perc``.

    Args:
        df: DataFrame to reduce.
        df_descr: Description dict; each feature exposes
            ``['eda']['missing_data']['percentage']``.
        drop_perc: Features with a missing ratio >= this value are dropped.

    Returns:
        ``df`` itself when nothing has to be dropped, otherwise a new
        DataFrame without the dropped columns.
    """
    sparse_features = [
        name
        for name, details in df_descr['features'].items()
        if details['eda']['missing_data']['percentage'] >= drop_perc
    ]

    # Drop IDs | disabled
    # id_features = [n for n, d in df_descr['features'].items() if d['role'] == 'id']
    # df = df.drop(columns=id_features)

    # Nothing to remove: hand back the very same object
    if not sparse_features:
        return df

    return df.drop(columns=sparse_features)

# Data Imputation | {Dataframe, Description}
def df_impute(df: pd.DataFrame, df_descr: dict):
    """Fill the missing values of a DataFrame according to each feature type.

    Only features reported with at least one missing value are imputed:
    numerical features with the column mean, categorical features with the
    most frequent value and alphanumerical features with an empty string.
    Any value still missing afterwards is replaced by an empty string.

    The imputation runs on a copy: the caller's DataFrame is left untouched
    (it used to be modified in place). Imputers are only built when needed.

    Args:
        df: DataFrame to impute.
        df_descr: Description dict; each feature exposes 'feature_type' and
            ``['eda']['missing_data']['missing_values']``.

    Returns:
        pandas.DataFrame: a new, imputed DataFrame.
    """
    # Work on a copy so the caller's frame is never modified
    df = df.copy()

    # Columns with missing values, grouped by feature type
    columns_by_type = {'Numerical': [], 'Categorical': [], 'Alphanumerical': []}
    for f, p in df_descr['features'].items():
        if p['eda']['missing_data']['missing_values'] > 0.0:
            bucket = columns_by_type.get(p['feature_type'])
            if bucket is not None:
                bucket.append(f)

    numerical_columns = columns_by_type['Numerical']
    categorical_columns = columns_by_type['Categorical']
    alphanumerical_columns = columns_by_type['Alphanumerical']

    # Mean | Numerical
    if numerical_columns:
        numerical_imputer = SimpleImputer(strategy='mean')
        df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

    # Mode | Categorical
    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

    # Empty String | Alphanumerical
    if alphanumerical_columns:
        df[alphanumerical_columns] = df[alphanumerical_columns].fillna('')

    # Catch-all for anything still missing
    return df.fillna('')

# Handle Outliers | {Dataframe, Description, IQR Threshold}
def df_handle_outliers(df : pd.DataFrame, df_descr : dict,  iqr_thres : float = 1.5):
    """Remove the rows holding outliers in any numerical feature (IQR rule).

    Features are processed one after the other; the quartiles of a feature are
    computed on the rows that survived the previous features. A value is an
    outlier when it lies below ``Q1 - iqr_thres * IQR`` or above
    ``Q3 + iqr_thres * IQR``.

    Args:
        df: DataFrame to filter.
        df_descr: Description dict; each feature exposes 'feature_type'.
        iqr_thres: Multiplier applied to the interquartile range.

    Returns:
        pandas.DataFrame: the filtered rows (``df`` itself when there is no
        numerical feature).
    """
    numeric_features = [
        name
        for name, prof in df_descr['features'].items()
        if prof['feature_type'] == 'Numerical'
    ]

    for name in numeric_features:
        values = df[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1

        too_low = values < (q1 - iqr_thres * spread)
        too_high = values > (q3 + iqr_thres * spread)
        df = df[~(too_low | too_high)]

    return df

# Dataframe Clean | {Dataframe, Description, Objective, MV Thres, Outlier Thres, Unique Value Thres}
def df_clean (df, df_descr, df_objective, mv_thres = 0.9, outlier_thres = 1.5, uv_thres = 0.1):
    """Clean a DataFrame: drop duplicates, drop sparse features, impute.

    Args:
        df: DataFrame to clean.
        df_descr: Description of ``df``, used to pick the features to drop.
        df_objective: Objective dict forwarded to ``df_describe``.
        mv_thres: Missing-values threshold forwarded to ``df_drop``.
        outlier_thres: IQR multiplier for outlier removal; not used at the
            moment because that step is commented out.
        uv_thres: Unique-values threshold forwarded to ``df_describe``.

    Returns:
        pandas.DataFrame: the imputed DataFrame.
    """
    # Drop Duplicates, then Features with too many missing values
    deduplicated = df_drop_duplicates(df)
    reduced = df_drop(deduplicated, df_descr, mv_thres)

    # Describe the reduced frame and impute from that description
    reduced_descr = df_describe(reduced, df_objective, uv_thres)
    imputed = df_impute(reduced, reduced_descr)

    # Describe the imputed frame | only consumed by the disabled outlier step,
    # call retained so the executed steps stay the same
    imputed_descr = df_describe(imputed, df_objective, uv_thres)

    # Drop Outliers | disabled
    # df_cleaned = df_handle_outliers(imputed, imputed_descr, outlier_thres)
    # return df_cleaned

    return imputed

## Feature Engineering

- Feature Extraction
- Feature Encoding 
- Feature Scaling
- Feature Selection | Either in data-preprocessing, or the classifier

In [13]:
## Feature Extraction | {Series, Extraction Type, Frequency Threshold}
def f_extract(f, type = 'TFID', freq_thres = 0.1):
    """Turn a text feature into TF-IDF columns, keeping only frequent terms.

    The returned frame reuses the index of ``f``. It used to carry a fresh
    0..n-1 index, so concatenating it column-wise to a frame whose index has
    gaps (for example after dropping duplicates) misaligned the rows.

    Args:
        f: Text feature (pandas Series); values are cast to unicode strings.
        type: Extraction type; currently unused, TF-IDF is always applied
            (the name shadows the builtin but is kept for callers).
        freq_thres: Minimum share of documents (relative to ``f.count()``) in
            which a term must appear to be kept.

    Returns:
        pandas.DataFrame: one column per kept term, named
        ``<feature name>_<term>``, indexed like ``f``.
    """
    tfidf_vectorizer = TfidfVectorizer()

    x_tfidf = tfidf_vectorizer.fit_transform(f.values.astype('U'))

    # Keep the source index so a later pd.concat(axis=1) lines the rows up
    f_extracted = pd.DataFrame(
        x_tfidf.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
        index=f.index,
    )

    # Keep specific columns | terms present in enough documents
    term_document_frequency = (f_extracted > 0).sum(axis=0)
    term_document_frequency_ratio = term_document_frequency / f.count()

    columns_to_keep = term_document_frequency[term_document_frequency_ratio >= freq_thres].index

    f_extracted_filtered = f_extracted[columns_to_keep]

    # Change Names of extracted columns | <feature>_<term>
    return f_extracted_filtered.add_prefix(str(f.name) + '_')

In [14]:
## Feature Encoding | {Feature, Feature Type, Data Type, Encoding Type}
def f_encode(f, f_type, f_data_type, encode_type = 'one-hot'):
    """Encode one feature into numeric columns.

    * A 'DateTime' data type is split into year / month / day columns.
    * Otherwise a 'Categorical' feature with ``encode_type == 'one-hot'`` is
      one-hot encoded (0/1 integers).

    The DateTime encoding takes precedence: it used to be computed and then
    silently overwritten by the one-hot encoding when the feature was also
    categorical. A feature matching neither case used to fail with an
    UnboundLocalError and now raises a descriptive ValueError.

    Args:
        f: Feature column (pandas Series).
        f_type: Feature type of the column.
        f_data_type: Data type name of the column.
        encode_type: Encoding applied to categorical features.

    Returns:
        pandas.DataFrame: encoded columns named ``<feature name>_<suffix>``,
        indexed like ``f``.

    Raises:
        ValueError: when no encoding applies to the given combination.
    """
    # DateTime
    if f_data_type == 'DateTime':
        f = pd.to_datetime(f)
        f_encoded = pd.DataFrame(
            {'year': f.dt.year, 'month': f.dt.month, 'day': f.dt.day},
            index=f.index,
        )

    # Categorical
    elif f_type == 'Categorical' and encode_type == 'one-hot':
        f_encoded = pd.get_dummies(f).astype(int)

    else:
        raise ValueError(
            'No encoding available for feature type ' + repr(f_type) +
            ', data type ' + repr(f_data_type) +
            ' and encode type ' + repr(encode_type)
        )

    # Change Names of encoded columns | <feature>_<suffix>
    return f_encoded.add_prefix(str(f.name) + '_')

In [15]:
## Feature Selection | Remove features with low or zero variance

In [16]:

## Feature Scaling
# def f_scaling(f, scaling_type):

In [17]:
# Feature Engineering | {Dataframe, Description}
def df_feature_eng(df, df_descr):
    """Replace text and categorical features by engineered numeric columns.

    1. Every 'Alphanumerical' feature that is not the target is replaced by
       the columns returned by ``f_extract``.
    2. Every remaining feature that is 'Categorical' or of data type
       'DateTime' (and is neither an id, the target, nor Integer/Float) is
       replaced by the columns returned by ``f_encode``.

    New columns are appended at the end of the frame. ``df_descr`` is only
    read: extracted features are skipped during encoding through a local set
    instead of being deleted from the caller's description. A leftover debug
    print for one specific column was removed.

    Args:
        df: DataFrame to transform.
        df_descr: Description dict; each feature exposes 'feature_type',
            'data_type' and 'role'.

    Returns:
        pandas.DataFrame: the engineered DataFrame.
    """
    features = df_descr['features']

    # Feature Extraction
    extracted = set()
    for f, details in features.items():

        # Extract Text Features
        if (details['feature_type'] == 'Alphanumerical' and
            details['role'] != 'target'):

            f_extracted = f_extract(df[f])

            # One row per source row: align positionally on df's index so the
            # column-wise concat cannot misalign rows
            f_extracted = f_extracted.set_axis(df.index, axis=0)

            # Concat to existing df
            df = df.drop(columns=[f])
            df = pd.concat([df, f_extracted], axis=1)

            extracted.add(f)

    # Feature Encoding
    for f, details in features.items():

        # Already replaced by extracted columns
        if f in extracted:
            continue

        if ((details['feature_type'] == 'Categorical' or details['data_type'] == 'DateTime') and
            details['role'] != 'id' and
            details['role'] != 'target' and
            details['data_type'] != 'Integer' and
            details['data_type'] != 'Float'):

            # Encode
            f_encoded = f_encode(df[f], details['feature_type'], details['data_type'], 'one-hot')

            # Concat to existing df
            df = df.drop(columns=[f])
            df = pd.concat([df, f_encoded], axis=1)

    # Feature Selection | TODO
    return df

## Automated Data Preprocessing

In [18]:
# Automated Data Preprocessing | {File Source, Problem Type, Target, Params}
def auto_preproc (file_source, problem_type, target, params = None):
    """Run the whole preprocessing pipeline on one file.

    Steps: gather -> describe -> clean -> describe -> feature engineering ->
    describe.

    Args:
        file_source: Path of the data file (.json or .csv).
        problem_type: Label of the problem, stored in the objective.
        target: Name of the target column.
        params: Optional dict overriding the pipeline thresholds. Recognised
            keys and their defaults (the values that were hard-coded before):
            'uv_thres' (0.1, unique-values ratio), 'mv_thres' (0.7, missing
            values ratio) and 'outlier_thres' (20, IQR multiplier). Other keys
            are ignored.

    Returns:
        dict with the raw frame and its description ('df', 'descr'), the
        cleaned frame and its description ('cleaned', 'descr_cleaned') and the
        engineered frame and its description ('f_eng', 'descr_f_eng').

    Raises:
        ValueError: when the file type is not supported by the converters.
    """
    # Defaults merged with the caller's overrides (no shared mutable default)
    settings = {'uv_thres': 0.1, 'mv_thres': 0.7, 'outlier_thres': 20}
    settings.update(params or {})

    # Gather
    df = df_gather(file_source)
    if not isinstance(df, pd.DataFrame):
        # The converters signal an unsupported file type with False
        raise ValueError('Unsupported file type: ' + str(file_source))

    # Objective
    df_object = df_objective(problem_type, target)

    # Describe
    df_descr = df_describe(df, df_object, settings['uv_thres'])

    # Clean
    df_cleaned = df_clean(df, df_descr, df_object,
                          settings['mv_thres'],
                          settings['outlier_thres'],
                          settings['uv_thres'])

    # Describe | Cleaned
    df_descr_cleaned = df_describe(df_cleaned, df_object, settings['uv_thres'])

    # Feature Engineer
    df_f_eng = df_feature_eng(df_cleaned, df_descr_cleaned)

    # Describe | Feature Engineered
    df_descr_f_eng = df_describe(df_f_eng, df_object, settings['uv_thres'])

    return {
        'df' : df,
        'descr' : df_descr,
        'cleaned' : df_cleaned,
        'descr_cleaned' : df_descr_cleaned,
        'f_eng' : df_f_eng,
        'descr_f_eng' : df_descr_f_eng
    }

## Real Data Testing

In [26]:
# Titanic | binary target
tn = auto_preproc('data/sample_data/titanic/train.csv', 'classification', 'Survived')

# House Prices | SalePrice is a continuous target, hence a regression problem
hp = auto_preproc('data/sample_data/house_price_data/train.csv', 'regression', 'SalePrice')

# Wasabi
# ws = auto_preproc('data/sample_data/Wasabi/batterytesters_dataset.json', 'classification', 'commonalarm')

# Credit Fraud
# cc = auto_preproc('data/sample_data/credit_card_fraud/creditcard.csv', 'classification', 'Class')

# Modcloth | TOCHECK
# file_path_md = 'data/sample_data/modcloth/modcloth_final_data.json'
# df_md = pd.read_json(file_path_md, lines=True)

# Rain Australia
# ra = auto_preproc('data/sample_data/rainAustralia/weatherAUS.csv', 'classification', 'RainTomorrow')

## Model Testing

<!-- S -->
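`train, test = split(...)` — split the engineered data into train and test sets, then fit a baseline model and report its validation accuracy.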

In [20]:
def logistic_regression(df, target, test_size = 0.2, random_state = 42):
    """Fit a baseline logistic regression and report its hold-out accuracy.

    The frame is split into train and test parts, a ``LogisticRegression`` is
    fitted on the train part and scored on the test part. The accuracy is
    printed (as before) and now also returned so callers can reuse it.

    Args:
        df: Fully numeric DataFrame holding the features and the target.
        target: Name of the target column.
        test_size: Share of the rows held out for evaluation.
        random_state: Seed used for both the split and the model.

    Returns:
        float: accuracy on the held-out rows.
    """
    # Split Train, Test
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Model
    model = LogisticRegression(random_state=random_state)
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Validation Accuracy: {accuracy:.2f}')

    return accuracy

In [39]:
# Baseline check: fit the logistic-regression helper on the engineered Titanic frame.
logistic_regression(tn['f_eng'], 'Survived')

Validation Accuracy: 0.79


In [38]:
# Display the engineered Titanic frame returned by auto_preproc.
# NOTE(review): consider showing only a preview (e.g. .head()) to keep the output small.
tn['f_eng']

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Name_miss,Name_mr,Name_mrs,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.000000,1,0,7.2500,0.000000,0.136235,0.000000,0,1,0,0,1
1,2,1,1,38.000000,1,0,71.2833,0.000000,0.000000,0.187172,1,0,1,0,0
2,3,1,3,26.000000,0,0,7.9250,0.249214,0.000000,0.000000,1,0,0,0,1
3,4,1,1,35.000000,1,0,53.1000,0.000000,0.000000,0.175073,1,0,0,0,1
4,5,0,3,35.000000,0,0,8.0500,0.000000,0.174136,0.000000,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,0.000000,0.000000,0.000000,0,1,0,0,1
887,888,1,1,19.000000,0,0,30.0000,0.247206,0.000000,0.000000,1,0,0,0,1
888,889,0,3,29.699118,1,2,23.4500,0.198911,0.000000,0.000000,1,0,0,0,1
889,890,1,1,26.000000,0,0,30.0000,0.000000,0.133492,0.000000,0,1,1,0,0
