In [14]:
import pandas as pd
import numpy as np

import os, sys
# Adding parent directory to python path
sys.path.append(os.path.dirname(os.getcwd()))

from ml_workflow import mlwf_rule, mlwf_data_source
import ml_workflow

import sqlite3

# Path of the SQLite file that the cells below delete, (re)create and query.
DB_FILE_NAME = 'data/extra_data.db'

In [3]:
# Start from a clean slate by deleting any previously generated database.
# A missing file is not an error here: on a fresh checkout there is simply
# nothing to delete, and the cell must still run under "Restart & Run All".
try:
    os.remove(DB_FILE_NAME)
except FileNotFoundError:
    pass

Run the following cell only if `data/extra_data.db` does not exist yet.

In [4]:
import random

# Build the auxiliary SQLite database: one table with a hair-colour code per
# passenger and one table flagging a single passenger id.
conn = sqlite3.connect(DB_FILE_NAME)

conn.execute("""
CREATE TABLE IF NOT EXISTS passenger_hair_color (
    PassengerId int PRIMARY KEY,
    HairColor int NOT NULL
);
""")

# One row per PassengerId in 1..891, each with a random colour code in [0, 3].
# The codes are unseeded, so they differ on every regeneration of the database.
# executemany sends all rows in one call instead of one execute+commit per row.
conn.executemany(
    "INSERT INTO passenger_hair_color(PassengerId, HairColor) VALUES(?, ?);",
    ((passenger_id, random.randint(0, 3)) for passenger_id in range(1, 892))
)

conn.execute("""
CREATE TABLE IF NOT EXISTS passenger_is_leonardo_di_caprio (
    PassengerId int PRIMARY KEY,
    IsLeonardoDiCaprio boolean NOT NULL
);
""")

# Derive the flag table from the hair-colour table: the SQL comparison
# `PassengerId = 852` is stored as the flag, so only passenger 852 is true.
conn.execute("INSERT INTO passenger_is_leonardo_di_caprio(PassengerId, IsLeonardoDiCaprio) \
              SELECT PassengerId,  PassengerId = 852 FROM passenger_hair_color;")

# Single commit for the whole seeding step (previously one commit per row).
conn.commit()

In [None]:
# Read back every (PassengerId, HairColor) row to check the seeding step.
# A dedicated connection is used and closed explicitly so that no handle on
# the database file is left open after the cell finishes.
preview_conn = sqlite3.connect(DB_FILE_NAME)
try:
    hair_color_rows = preview_conn.execute("SELECT PassengerId, HairColor FROM passenger_hair_color;").fetchall()
finally:
    preview_conn.close()
hair_color_rows

In [15]:
# Optional hook, currently disabled. NOTE(review): presumably patches pandas so
# that ml_workflow can trace DataFrame operations; confirm before enabling.
# ml_workflow.pandas_link.override_pandas_functions()

# Module-level connection read by read_hair_data() below.
conn = sqlite3.connect(DB_FILE_NAME)

@mlwf_data_source(source_type = 'file', source = 'data/train.csv')
def read_input_file():
    """Load 'data/train.csv' and return it as a pandas DataFrame."""
    training_frame = pd.read_csv('data/train.csv')
    return training_frame

@mlwf_data_source(source_type = 'db', source = 'extra_data')
def read_hair_data():
    """Return the PassengerId / HairColor table as a pandas DataFrame.

    The query runs on the module-level ``conn`` SQLite connection.
    """
    hair_query = "SELECT PassengerId, HairColor \
                        FROM passenger_hair_color;"
    return pd.read_sql(hair_query, conn)

@mlwf_rule
def remove_name(data_set):
    """Drop the 'Name' column from ``data_set`` in place; nothing is returned."""
    data_set.drop(columns='Name', inplace=True)

@mlwf_rule
def manage_age(data_set):
    """Flag missing ages and fill them with the mean age, in place.

    Adds a 0/1 'NoAge' column marking rows whose 'Age' was missing, then
    replaces those missing ages with the mean of the known ages.
    """
    age_missing = data_set['Age'].isna()
    data_set['NoAge'] = age_missing.astype(int)
    # Series.mean() skips NaN, so this is the mean of the known ages only.
    mean_age = data_set['Age'].mean()
    data_set.loc[age_missing, 'Age'] = mean_age

@mlwf_rule(name = "Est blonde")
def is_blond(data_set):
    """Add a 0/1 'IsBlond' column that is 1 where 'HairColor' equals 3."""
    blond_mask = data_set["HairColor"].eq(3)
    data_set["IsBlond"] = blond_mask.astype(int)
    
@mlwf_rule(name = "Appliquer une regression logistique")
def apply_logistic_regression(data_set):
    """Add a 'Proba' column in place, one value per row.

    Despite the name, no model is fitted here: each value is an independent
    draw from ``np.random.random``.
    """
    row_count = len(data_set)
    data_set['Proba'] = np.random.random(row_count)
    
# --- Run the pipeline -------------------------------------------------------
# Load the training set through the tracked data source.
data_set = read_input_file()

# These rules mutate data_set in place and return nothing.
remove_name(data_set)
manage_age(data_set)

# Plain pandas step, not wrapped in an ml_workflow rule: 0/1 flag for 'female'.
data_set['IsFemale'] = (data_set['Sex'] == 'female').astype(int)

# Load the hair-colour table from SQLite and derive IsBlond before joining.
hair_data = read_hair_data()
is_blond(hair_data)

# No `on=` is given, so pandas joins on the columns the two frames share
# (presumably only PassengerId; verify against the CSV header).
data_set = data_set.merge(hair_data)

apply_logistic_regression(data_set)

In [None]:
# Plot the workflow object attached to the final data set.
# NOTE(review): `current_workflow` is provided by ml_workflow; its semantics are not visible here.
data_set.current_workflow.plot()

In [None]:
# `workflow` was never bound anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Bind it to the workflow attached to the final
# data set (the same object plotted in the previous cell).
# NOTE(review): assumes that object exposes to_json / to_html; confirm against ml_workflow.
workflow = data_set.current_workflow
workflow.to_json('workflow.json')
workflow.to_html('workflow.html')

In [10]:
import pandas as pd
import numpy as np

# Benchmark fixture: the training CSV as a DataFrame ...
data = pd.read_csv('data/train.csv')

# ... and the same rows as a list of plain Python lists, one per row.
data_in_line = [
    list(record.values)
    for _row_label, record in data.iterrows()
]
# Show the first row so the positional column layout is visible.
print(data_in_line[0])
def process_pd_inline():
    """Collect the 'Sex' value of every row by iterating the global ``data`` with iterrows().

    Kept as an explicit append loop on purpose: this function is a timing
    subject compared against process_list_inline().
    """
    sexes = []
    for _row_label, record in data.iterrows():
        sexes.append(record['Sex'])
    return sexes

def process_list_inline():
    """Collect the element at position 4 of every row in the global ``data_in_line``.

    Position 4 holds the same values as the 'Sex' column; the notebook asserts
    equality with process_pd_inline(). Kept as an explicit append loop so the
    timing comparison stays like-for-like.
    """
    sexes = []
    for record in data_in_line:
        sexes.append(record[4])
    return sexes


# Both implementations must produce the same list before their timings are compared.
assert process_pd_inline() == process_list_inline()

[1, 0, 3, 'Braund, Mr. Owen Harris', 'male', 22.0, 1, 0, 'A/5 21171', 7.25, nan, 'S']


In [11]:
# Time the row-by-row DataFrame.iterrows() variant.
%timeit process_pd_inline()

55 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# Time the variant that walks the pre-built list of row lists.
%timeit process_list_inline()

47.6 µs ± 500 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
# Time the one-off cost of converting the DataFrame into a list of row lists.
# NOTE: %timeit runs the statement in its own scope, so this assignment does
# not rebind the notebook-level `data_in_line`.
%timeit data_in_line = [list(row.values) for _, row in data.iterrows()]

49.2 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Scratch check: `&` applied to two bools.
True & True

True

In [15]:
# Scratch check: `&` applied to two bools, one of them False.
True & False

False

In [16]:
# Scratch check: `&` applied to two False values.
False & False

False

In [21]:
# `&` binds tighter than `and`, so this parses as `(True & False) and True`.
True & False and True

False