In [8]:
#!/usr/bin/env python3
#-*- coding: utf8 -*-


"""
01-first-naive-model.py
"""


"""
in this second part, we will implement our first logistic regression model.
We will first implement by hand a naive classifier, then a dummy classifier 
(who does the same job), and finally a basic logistic regression model.
rather than looking at the results of a regression we will implement a 
function that will test the model x times and that will average the results
 obtained
we will then implement a results manager that will be a dataframe
"""



'\nfind here a first study of the dataset, in which we seek to understand and\ngive meaning to the dataset.\nwe are not trying to solve our problem but will focus on visualization,\nclenaning and feature engineering.\nat first we will just study the corelations, the links, the quality and the\nmeaning of our dataset.\nexternal research and more general considerations may be included in this work\n'

In [9]:
# import

# from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
# from sklearn.grid_search import *

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, log_loss


In [71]:
# pasting first_tour.ipynb if nedeed


###############################################################
###############################################################

# -------------------------------------------------------------

# please see first_tour.ipynb before

# -------------------------------------------------------------

###############################################################
###############################################################


# import
import os, sys, logging, random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# logging 
l = logging.INFO
logging.basicConfig(level=l, format="%(levelname)s : %(message)s")
info = logging.info

# graph
# %matplotlib
# sns.set()

# consts
FOLDER      = "Driven-Data-Blood-Donations"
TRAIN_FILE  = "training_data.csv"
TEST_FILE   = "test_data.csv"

# functions

def finding_master_path(folder="data") :
    path = os.getcwd()
    path = path.split("/")
    idx  = path.index(FOLDER)
    path = path[:idx+1]
    folder = str(folder) + "/"
    path.append(folder)  
    path = "/".join(path)
    if not os.path.isdir(path) : 
        raise NotADirectoryError
    return path
    
def return_datasets(path) : 
    li = [i for i in os.listdir(path) if ".csv" in i ]
    return li 

def build_df(path, file) : 
    df          = pd.read_csv(path+file, index_col=0)
    df.columns  = pd.Index( ["last_don", "num_don","vol_don", "first_don", 
                            "target"], dtype="object")
    return df

def print_df(df) : 
    print(df.ndim)
    print(df.shape)
    print(df.dtypes)
    print(df.index)
    print(df.columns)
    print(df.describe())
    print(df.head(3))
    print(df.tail(3))

def re_dtype(df) : 
    # li = [np.uint8, np.uint16]
    # [print(i,  np.iinfo(i).min, np.iinfo(i).max) for i in li]
    dtypes_dict = {     "last_don"  : np.uint8, 
                        "num_don"   : np.uint8,
                        "vol_don"   : np.uint16, 
                        "first_don" : np.uint8, 
                        "target"    : np.uint8       }
    df = df.astype(dtypes_dict)
    return df 

def graph_each_feature(df)  : 
    features = [i for i in df.columns if "target" not in i] 
    fig, _axes = plt.subplots(2, 2, figsize=(13,13))
    axes = _axes.flatten()
    info(fig)
    info(axes)
    info(len(axes))
    for i, feat in enumerate(features) :
        info(i, feat)
        # -----------------------------------------
        # sns.distplot --> (kde=True ) ???
        # -----------------------------------------
        axes[i].hist(df[feat], bins=30)
        axes[i].set_title(feat)
    plt.suptitle("features distribution")
    plt.show()

def graph_corr_matrix(df) : 
    corr_mat = df.corr()
    sns.heatmap(corr_mat, cmap="coolwarm", annot=True, fmt='.3g')
    plt.title("correlation matrix")
    plt.show()

def drop_corr_features(df) : 
    df = df.drop("vol_don", axis=1)
    return df 

def study_nas(df) : 
    print(df.isna().any())
    print(df.isna().any())

def study_outliers(df, k=1.5) : 
    fig, _axes = plt.subplots(1, 5, figsize=(13,13))
    axes = _axes.flatten()
    info(fig)
    info(axes)
    info(len(axes))
    for i, feat in enumerate(df.columns) :
        info(i, feat)
        axes[i].boxplot(df[feat], whis=k)
        axes[i].set_title(feat)
    plt.suptitle("features outliers, k of {}".format(whis))
    plt.show()

def return_outliers(ser, k) : 
    desc = ser.describe()
    q1, q3, q2 = desc["25%"], desc["75%"], desc["50%"]
    IQ = q3-q1
    range_min, range_max = q1 - k * IQ, q3 + k*IQ
    # outliers = ser[(ser > range_max) or (ser < range_min)] 
    return ser >= range_max

def delete_outliers(df, k) : 
    li = [i for i in df.columns if "target" not in i]
    for feat in li : 
        df = df[return_outliers(df[feat], k) == False]
    return df

def first_tour(folder="data", file=TRAIN_FILE) : 
    # build data path
    path = finding_master_path(folder)
    # info(path)							# UNCOMMENT IF NEEDED
    # just show dataset list
    # datasets = return_datasets(path)      # UNCOMMENT IF NEEDED
    # info(datasets)                        # UNCOMMENT IF NEEDED
    # build our df
    df = build_df(path, file)
    # print main info
    # print_df(df)                          # UNCOMMENT IF NEEDED
    # (overkilled) recast dataframe in a better dtype
    df = re_dtype(df)
    # graph features distr and correlation  # UNCOMMENT IF NEEDED
    # graph_each_feature(df)                  
    # graph_corr_matrix(df)                 # UNCOMMENT IF NEEDED
    # drop corr values
    df = drop_corr_features(df)
    # nas
    # study_nas(df)                         # UNCOMMENT IF NEEDED
    # for i in [1.5, 2, 2.5, 3] :           # UNCOMMENT IF NEEDED
    # study_outliers(df, i)                 # UNCOMMENT IF NEEDED
    # df = delete_outliers(df, 3)           # UNCOMMENT IF NEEDED
    return df

In [73]:
# consts 

# COLUMNS = ["naive", "dummy", "basic", "features eng."]
# MODELS = [naive_model, dummy_model, basic_model]

In [23]:
# split our features from our target

def return_X_y(df) : 
    
    X = df.drop("target", axis=1)
    y = df.target

    return X, y  


# X,y = return_X_y(df)


In [72]:
# build from scratch a naive/dummy model which make prediction regarding global target probabilities

def naive_model(df=None) : 
	""" """
	if not isinstance(df, pd.DataFrame): 
		df = first_tour()

	X,y = return_X_y(df)
	t = split(X,y)

	X_train, X_test, y_train, y_test = t 

	freq = y_test.value_counts() / len(y_test)
		
	y_pred = np.random.binomial(1, freq[1], len(y_test))
	y_pred = pd.Series(y_pred)

return accuracy_score(y_test, y_pred).round(3)

####

# naive_model()


ValueError: 'Driven-Data-Blood-Donations' is not in list

In [25]:
# split test and train df/target

def split(X,y) : 

	func = train_test_split
	tup = train_test_split(X, y)
	
	return tup

####

# tup = split(X,y)


In [47]:
# rather than conding a dummy model from scratch, use sk learn DummyClassifier (same job) 

def dummy_model(df=None) : 

	if not isinstance(df, pd.DataFrame): 
		df = first_tour()
	
	X,y = return_X_y(df)
	t = split(X,y)


	X_train, X_test, y_train, y_test = t 

	model = DummyClassifier()
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)

	return accuracy_score(y_test, y_pred).round(3)

####

# dummy_model()


Unnamed: 0,last_don,num_don,vol_don,first_don,target
619,2,50,12500,98,1
664,0,13,3250,28,1
441,1,16,4000,35,1
160,2,20,5000,45,1
358,1,24,6000,77,0
335,4,4,1000,4,0
47,2,7,1750,14,1
164,1,12,3000,35,0
736,5,46,11500,98,1
436,0,3,750,4,0


In [48]:
# just for fun trying to make predictions with a very basic model (no meta params, no features engineering)
# this one will be our model prediction base
# it is suposed to be better that our DummyClassifier. If not there is a major issue...

def basic_model(df=None) : 
	""" """

	if not isinstance(df, pd.DataFrame): 
		df = first_tour()
	
	X,y = return_X_y(df)
	t = split(X,y)


	X_train, X_test, y_train, y_test = t 

	model = LogisticRegression()
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)

	return accuracy_score(y_test, y_pred).round(3)

####

# basic_model()


2
(576, 5)
last_don     int64
num_don      int64
vol_don      int64
first_don    int64
target       int64
dtype: object
Int64Index([619, 664, 441, 160, 358, 335,  47, 164, 736, 436,
            ...
            361,  30, 337, 496, 169, 698, 433, 360, 541,  74],
           dtype='int64', length=576)
Index(['last_don', 'num_don', 'vol_don', 'first_don', 'target'], dtype='object')
         last_don     num_don       vol_don   first_don      target
count  576.000000  576.000000    576.000000  576.000000  576.000000
mean     9.439236    5.427083   1356.770833   34.050347    0.239583
std      8.175454    5.740010   1435.002556   24.227672    0.427200
min      0.000000    1.000000    250.000000    2.000000    0.000000
25%      2.000000    2.000000    500.000000   16.000000    0.000000
50%      7.000000    4.000000   1000.000000   28.000000    0.000000
75%     14.000000    7.000000   1750.000000   49.250000    0.000000
max     74.000000   50.000000  12500.000000   98.000000    1.000000
     las

In [39]:
# we now need to have a sort of decorator which will be charged to lunch n times our model and to give 
# us the accuracy mean of n trials

def model_accuracy_mean(model, nb=5, df=None) : 
	""" """

	scores = [model(df) for i in range(nb)]

	info(type(scores))
	info(type(range(nb)))

	score = sum(scores)/len(scores)

    return score.round(3)

####

# model_accuracy_mean(naive_model)
# model_accuracy_mean(dummy_model)
# model_accuracy_mean(baisc_model)

'/home/alex'

In [74]:
# we now will be able to build a specific dataframe to handle results of various tested models

COLUMNS = ["naive", "dummy", "basic", "features eng."]
MODELS =  [naive_model, dummy_model, basic_model]

results = pd.DataFrame(columns=COLUMNS)

####

# results


NameError: name 'naive_model' is not defined

In [56]:
# and for each feature engineering configuration, we will have a function charged to run every 
# listed models and to add properly the results in our specific dataframe


def add_new_results(feat_com=None,
                    results=None, 
                    n=5, 
					models=MODELS, 
                    columns= COLUMNS, 
					df=None) : 

	
	if not isinstance(results, pd.DataFrame) : 
		results = pd.DataFrame(columns=columns)

	new = [model_accuracy_mean(i, n, df) for i in models]
	info(new)

	if not feat_com : 
		feat_com = "No comment"

	new.append(feat_com)
	info(new)
	
	new = pd.Series(new, index=columns)
	info(new)
	
	results = results.append(new, ignore_index=True)
	info(results)

return results

####

# results = add_new_results("test")
# results


SyntaxError: 'return' outside function (<ipython-input-56-f10a46a9066a>, line 7)

In [58]:
# finally just to test this "meta" model we will test it with our first featue engineering 
# possibility : outilers threshold
# we do not care so much about the results but about the global process of our meta model
# of course if we can find a first way for our feature engineering work, it could be great! 

def first_approch_of_feat_eng(	drop_list,
								results=None,
								n=5, 
								models=MODELS, columns=COLUMNS, 
								df=None) : 
	
	if not isinstance(drop_list, list) : 
		raise TypeError

	if not isinstance(results, pd.DataFrame) : 
		results = pd.DataFrame(columns=columns)

	for i in drop_list : 

		df = first_tour()
		df = delete_outliers(df, i) 

		feat_com = "drop outliers with threshold of k > " + str(i)

		results = add_new_results(	results=results,
									feat_com=feat_com,
									n=n, 
									models=models, 
									columns=columns, 
									df=df)


	return results

####

# results = first_approch_of_feat_eng([1.5, 2, 2.5, 3])
# results


last_don     False
num_don      False
vol_don      False
first_don    False
target       False
dtype: bool
last_don     False
num_don      False
vol_don      False
first_don    False
target       False
dtype: bool


In [75]:
# just for ther record, let's build a function which resume all this work in 3 ligns

def first_naive_model() : 

	results = pd.DataFrame(columns=COLUMNS)
	results = add_new_results(results, "without_any_feat_eng")

	results = first_approch_of_feat_eng([1.5, 2.0, 2.5, 3.0, 3.5])
	
return results

####

# first_naive_model()


SyntaxError: 'return' outside function (<ipython-input-75-dc113ee76009>, line 10)