In [3]:
import logging

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

# Initialize logger.
# The functions below report progress and F1 scores through logger.info(); with
# Python's default logging setup (root level WARNING, no handler) those messages
# are silently dropped. basicConfig() only installs a handler when the root logger
# has none, so an existing logging configuration is left untouched.
logging.basicConfig()
logger = logging.getLogger(__name__)
# Only lower the threshold when INFO would otherwise be filtered out; a more
# verbose level configured elsewhere (e.g. DEBUG) is respected.
if logger.getEffectiveLevel() > logging.INFO:
    logger.setLevel(logging.INFO)

def get_X_y(df):
    """
    Split a DataFrame into features and label by column position.

    :param df: DataFrame to split; the label must be the last column,
               every column before it is treated as a feature
    :return: tuple ``(X, y)`` holding the ``.values`` of the feature columns
             and of the label column
    """
    feature_part, label_part = df.iloc[:, :-1], df.iloc[:, -1]
    return feature_part.values, label_part.values

def create_k_fold(X, y, n_splits=5, random_state=42):
    """
    Build the shuffled K-fold splitter used for cross-validation.

    :param X: Feature matrix (not used to build the splitter; kept so existing
              callers keep working)
    :param y: Label Vector (not used to build the splitter; kept so existing
              callers keep working)
    :param n_splits: Number of folds
    :param random_state: Seed for the shuffling, so the folds are the same on
                         every run
    :return: KFold Object
    """
    # The previous `kf.get_n_splits(X, y)` call only returned the fold count,
    # which was discarded, so it has been dropped.
    return KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

def execute_k_fold(kf, X, y, n_estimators=10, random_state=42):
    """
    Train a random forest on each split and log the weighted F1 score.

    :param kf: KFold Object; its ``split(X, y)`` must yield
               ``(train_index, test_index)`` pairs
    :param X: Feature matrix, indexable with an array of row indices
    :param y: Label vector, indexable with an array of row indices
    :param n_estimators: Number of trees in each forest
    :param random_state: Seed handed to every forest. Without it the scores
                         change from run to run even though the folds are fixed.
    :return: list with the weighted F1 score (in percent) of every split,
             in split order
    """
    scores = []

    for count_iter, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
        logger.info('Split Nr. {}'.format(count_iter))
        logger.debug("Train size: {}, Test size: {}".format(len(train_index), len(test_index)))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh model per split, so nothing learned on one fold leaks into the next.
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
        rf.fit(X_train, y_train)
        y_preds = rf.predict(X_test)

        score = f1_score(y_test, y_preds, average='weighted') * 100
        scores.append(score)
        logger.info("F1 Score at split {} is {:.2f} %".format(count_iter, score))
        logger.info("====================================================================")

    return scores

def main(target_col):
    """
    Load the data, attach the labels and cross-validate a random forest.

    Reads the example DataFrame and the label matrix, joins them on ``pid``,
    selects the stored feature columns plus ``target_col`` and runs the
    K-fold training.

    :param target_col: Which column to take as a label
    """
    logger.info("Start.")
    logger.info("Read in df")
    df = pd.read_pickle('../data/processed/multiclass/task2_df_example.pickle')

    label_matrix = pd.read_pickle('../data/interim/label_matrix.pickle')

    logger.info("Join label_matrix and df")
    df_w_labels = pd.merge(df, label_matrix, on='pid')

    logger.info("Read in features")
    features = pd.read_pickle('../data/processed/features/task_2.pickle')
    # get_X_y() takes the LAST column as the label, so the target goes at the end.
    # Drop it from the feature part first: if the stored list already contained
    # the target, it would otherwise be used as a feature as well as the label.
    # NOTE(review): the stored list displayed further down ends with 'label' —
    # confirm that column is meant to be a feature when another target is used.
    features = [col for col in features if col != target_col] + [target_col]

    logger.info("Get x and y")
    X, y = get_X_y(df_w_labels[features])

    logger.info("Create KFold object")
    kf = create_k_fold(X, y)

    logger.info("Execute and train model")
    execute_k_fold(kf, X, y)



In [32]:
# Scratch check: `+` on two lists concatenates them — the operation main() uses
# to append the target column to the feature list.
['1', '2']+ ['3']

['1', '2', '3']

In [26]:
# Load a pickled DataFrame for inspection.
# NOTE(review): this is a different file from the one main() reads — confirm which is intended.
df = pd.read_pickle('../data/interim/task2/df_pp.pickle')

In [27]:
# Show the (rows, columns) size of df.
df.shape

(336021, 195)

In [28]:
# Load the label matrix (the same file main() reads).
label_matrix = pd.read_pickle('../data/interim/label_matrix.pickle')

In [29]:
# Preview the first rows of label_matrix.
label_matrix.head()

Unnamed: 0,pid,kmeans_4
0,196356,2
1,204083,1
2,170667,1
3,115511,2
4,129719,0


Unnamed: 0,sid,click_time,click_mode,pid,req_time,o_long,o_lat,d_long,d_lat,distance_query,...,min_temp,weather,wind,weather_dy,weather_dyq,weather_q,weather_qdy,weather_xq,weather_xydy,kmeans_4
0,753082.0,2018-10-01 00:00:11,5.0,101467.0,2018-10-01 00:00:10,116.34,39.90,116.34,39.90,0.000000,...,12,q,45,0,0,1,0,0,0,2
1,722886.0,2018-10-01 01:46:09,1.0,101467.0,2018-10-01 01:46:06,116.39,39.87,116.39,39.91,0.040000,...,12,q,45,0,0,1,0,0,0,2
2,785617.0,2018-10-01 02:25:23,1.0,101467.0,2018-10-01 02:25:19,116.37,39.86,116.39,39.91,0.053852,...,12,q,45,0,0,1,0,0,0,2
3,785618.0,2018-10-01 02:37:48,3.0,101467.0,2018-10-01 02:37:46,116.37,39.89,116.39,39.91,0.028284,...,12,q,45,0,0,1,0,0,0,2
4,753079.0,2018-10-01 08:00:01,1.0,101467.0,2018-10-01 07:58:52,116.34,39.90,116.39,39.78,0.130000,...,12,q,45,0,0,1,0,0,0,2
5,785168.0,2018-10-01 09:30:57,9.0,101467.0,2018-10-01 09:30:52,116.39,39.98,116.40,39.91,0.070711,...,12,q,45,0,0,1,0,0,0,2
6,785167.0,2018-10-01 09:37:24,2.0,101467.0,2018-10-01 09:37:18,116.39,39.98,116.40,39.93,0.050990,...,12,q,45,0,0,1,0,0,0,2
7,785169.0,2018-10-01 09:40:30,2.0,101467.0,2018-10-01 09:40:28,116.39,39.98,116.40,39.93,0.050990,...,12,q,45,0,0,1,0,0,0,2
8,720532.0,2018-10-01 10:09:28,11.0,101467.0,2018-10-01 10:09:11,116.54,39.76,116.41,39.64,0.176918,...,12,q,45,0,0,1,0,0,0,2
9,729778.0,2018-10-01 10:11:19,1.0,101467.0,2018-10-01 10:10:49,116.19,39.93,116.32,40.05,0.176918,...,12,q,45,0,0,1,0,0,0,2


In [45]:
from datetime import datetime
# Scratch check: render the current local time as 'YYYY-MM-DD HH:MM:SS'.
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2019-07-13 11:40:26'

In [47]:
from datetime import datetime

# Scratch check of the score-report file: writes two placeholder lines to a
# file named after the target column and the current time.
# NOTE(review): `target_col` is not assigned in any visible cell (it is only a
# parameter of main()), so this cell needs it defined beforehand — confirm.
# Opened with "w" because the file is only written, never read back.
with open("../data/interim/task2/" + target_col + datetime.now().strftime('%m-%d-%H-%M-%S') + ".txt", "w") as fo:
    for i in range(2):
        # write() takes one string; writelines() expects an iterable of lines
        # and would iterate over this string character by character.
        fo.write("F1 Score at split {} is {:.2f} %\n".format(i, 2))

In [48]:
# Load the stored feature list (the same file main() reads).
features = pd.read_pickle('../data/processed/features/task_2.pickle')

In [51]:
# Display the loaded feature names.
features

['o_long',
 'o_lat',
 'd_long',
 'd_lat',
 'mode_0_available',
 'mode_1_available',
 'mode_2_available',
 'mode_3_available',
 'mode_4_available',
 'mode_5_available',
 'mode_6_available',
 'mode_7_available',
 'mode_8_available',
 'mode_9_available',
 'mode_10_available',
 'mode_11_available',
 'max_dist',
 'min_dist',
 'mean_dist',
 'std_dist',
 'max_price',
 'min_price',
 'mean_price',
 'std_price',
 'max_eta',
 'min_eta',
 'mean_eta',
 'std_eta',
 'max_dist_mode',
 'min_dist_mode',
 'max_price_mode',
 'min_price_mode',
 'max_eta_mode',
 'min_eta_mode',
 'first_mode',
 'weekday',
 'hour',
 'distance_query',
 'dist_nearest_sub',
 'weather_dyq',
 'weather_qdy',
 'weather_dy',
 'weather_q',
 'weather_xydy',
 'weather_xq',
 'max_temp',
 'min_temp',
 'wind',
 'req_weekend',
 'is_holiday',
 'label']

In [54]:
# Feature column names, in model-input order: the list displayed in the cell
# above, without its trailing 'label' entry.
features = (
    ['o_long', 'o_lat', 'd_long', 'd_lat']
    # mode_0_available ... mode_11_available
    + ['mode_{}_available'.format(mode_nr) for mode_nr in range(12)]
    # max / min / mean / std of dist, price and eta
    + ['max_dist', 'min_dist', 'mean_dist', 'std_dist']
    + ['max_price', 'min_price', 'mean_price', 'std_price']
    + ['max_eta', 'min_eta', 'mean_eta', 'std_eta']
    # *_mode columns
    + ['max_dist_mode', 'min_dist_mode']
    + ['max_price_mode', 'min_price_mode']
    + ['max_eta_mode', 'min_eta_mode']
    + ['first_mode', 'weekday', 'hour', 'distance_query', 'dist_nearest_sub']
    # weather_* columns
    + ['weather_dyq', 'weather_qdy', 'weather_dy']
    + ['weather_q', 'weather_xydy', 'weather_xq']
    + ['max_temp', 'min_temp', 'wind', 'req_weekend', 'is_holiday']
)

In [59]:
import pickle

# Serialize the current `features` object to the task2 interim folder.
with open('../data/interim/task2/features.pickle', 'wb') as fo:
    pickle.dump(features, fo)