# Running some tests

### Doc - Datetime functions

In [12]:
if False:
    # Description of all datetime operations
    d = datetime.datetime.now()
    print(d)
    # Jour dans le mois
    print(d.day)
    # Numéro du mois
    print(d.month)
    # Année
    print(d.year)
    # Jour travaillé (0/1)
    print(d.isoweekday())
    # Heure
    print(d.hour)
    # Minutes
    print(d.minute)
    # Secondes
    print(d.second)

# Feature Extraction

## Feature functions

In [1]:
# Few functions to add features to the dataframe.


def is_working_day(d):
    return d.isoweekday()<6

def trimester(d):
    return (d.month-1)//3

def is_day_shift(d):
    min_hour = {
        'hour':7,
        'minute':30
    }
    max_hour = {
        'hour':23,
        'minute':30
    }
    hour_day = d.hour*60+d.minute
    return (hour_day > min_hour['hour']*60+min_hour['minute']) & (hour_day < max_hour['hour']*60+max_hour['minute'])

def evolution_over_years(d):
    min_date = {
        "year": 2011,
        "month": 1,
        "day": 1
    }
    date_trimester = trimester(d)
    year = d.year
    return (year-min_date['year'])*4+date_trimester

## Libraries

In [33]:
# All library imports

import math
import pandas as pd
import numpy as np
import error_functions as ef
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [3]:
# Map to convert ass_assignments to integer values
assignments = ['CAT', 'CMS', 'Crises', 'Domicile',
       'Evenements', 'Gestion', 'Gestion - Accueil Telephonique',
       'Gestion Amex', 'Gestion Assurances', 'Gestion Clients', 'Gestion DZ',
       'Gestion Relation Clienteles', 'Gestion Renault', 'Japon', 'Manager',
       'Mécanicien', 'Médical', 'Nuit', 'Prestataires', 'RENAULT', 'RTC',
       'Regulation Medicale', 'SAP', 'Services', 'Tech. Axa', 'Tech. Inter',
       'Tech. Total', 'Téléphonie']

map_assignment = {}
for i in range(len(assignments)):
    map_assignment[assignments[i]]=i
    
map_assignment_inverse = {}
for i in range(len(assignments)):
    map_assignment_inverse[i] = assignments[i]

In [4]:
print("Importing train values...")
calls = pd.read_csv('data/train.csv', delimiter=';')
print("Reading train file ok.")
y = calls['CSPL_CALLS']
print("Extracting results ok.")
print("Importing submission values...")
submission = pd.read_csv('data/submission.txt', sep="\t")
print("Reading submission file ok.")
submission = submission.drop('prediction', axis=1)
print("Dropping prediction from submission ok.")

Importing train values...


  interactivity=interactivity, compiler=compiler, result=result)


Reading train file ok.
Extracting results ok.
Importing submission values...
Reading submission file ok.
Dropping prediction from submission ok.


In [5]:
calls['DATE'] = pd.to_datetime(calls['DATE'],infer_datetime_format=True)
print("Train date to timetable ok.")
submission['DATE'] = pd.to_datetime(submission['DATE'],infer_datetime_format=True)
print("Submission date to timetable ok.")

Train date to timetable ok.
Submission date to timetable ok.


In [6]:
calls = calls[["DATE", "ASS_ASSIGNMENT"]]
print("Extraction of columns date and ass_assignment ok.")
print("")
print("Adding new features to train...")
calls['year'] = [dd.year for dd in calls['DATE']]
calls['month'] = [dd.month for dd in calls['DATE']]
calls['day'] = [dd.day for dd in calls['DATE']]
print("Year, month, day ok.")
calls['hour'] = [dd.hour for dd in calls['DATE']]
calls['minute'] = [dd.minute for dd in calls['DATE']]
#calls['second'] = [dd.second for dd in calls['DATE']]
print("Hour, minute ok.")
calls['working'] = [is_working_day(dd) for dd in calls['DATE']]
print("Working day ok.")
calls['shift'] = [is_day_shift(dd) for dd in calls['DATE']]
print("Shift ok.")
calls['trimester'] = [trimester(dd) for dd in calls['DATE']]
print("Trimester ok.")
calls['evolution'] = [evolution_over_years(dd) for dd in calls['DATE']]
print("Evolution ok.")
calls['assignment'] = calls['ASS_ASSIGNMENT'].map(map_assignment)
print("Creating dummies ok.")
calls = calls.drop(['DATE','ASS_ASSIGNMENT'], axis=1)
print("Dropping string columns ok.")
print("")
print("Adding new features to submission...")
submission['year'] = [dd.year for dd in submission['DATE']]
submission['month'] = [dd.month for dd in submission['DATE']]
submission['day'] = [dd.day for dd in submission['DATE']]
print("Year, month, day ok.")
submission['hour'] = [dd.hour for dd in submission['DATE']]
submission['minute'] = [dd.minute for dd in submission['DATE']]
#submission['second'] = [dd.second for dd in submission['DATE']]
print("Hour, minute ok.")
submission['working'] = [is_working_day(dd) for dd in submission['DATE']]
print("Working day ok.")
submission['shift'] = [is_day_shift(dd) for dd in submission['DATE']]
print("Shift ok.")
submission['trimester'] = [trimester(dd) for dd in submission['DATE']]
print("Trimester ok.")
submission['evolution'] = [evolution_over_years(dd) for dd in submission['DATE']]
print("Evolution ok.")
submission['assignment'] = submission['ASS_ASSIGNMENT'].map(map_assignment)
print("Creating dummies ok.")
save_date = submission['DATE']
print("Saving date column ok.")
save_ass_assignment = submission['ASS_ASSIGNMENT']
print("Saving ass_assignment ok.")
submission = submission.drop(['DATE','ASS_ASSIGNMENT'], axis=1)
print("Dropping string columns ok.")

Extraction of columns date and ass_assignment ok.

Adding new features to train...
Year, month, day ok.
Hour, minute ok.
Working day ok.
Shift ok.
Trimester ok.
Evolution ok.
Creating dummies ok.
Dropping string columns ok.

Adding new features to submission...
Year, month, day ok.
Hour, minute ok.
Working day ok.
Shift ok.
Trimester ok.
Evolution ok.
Creating dummies ok.
Saving date column ok.
Saving ass_assignment ok.
Dropping string columns ok.


In [7]:
for i in range(len(assignments)):
    tmp = calls[calls['assignment']==i]
    count = len(tmp)
    print(map_assignment_inverse[i], ":", count)

CAT : 41206
CMS : 18803
Crises : 26652
Domicile : 193886
Evenements : 662
Gestion : 48093
Gestion - Accueil Telephonique : 126677
Gestion Amex : 13507
Gestion Assurances : 40879
Gestion Clients : 33614
Gestion DZ : 31931
Gestion Relation Clienteles : 32149
Gestion Renault : 46285
Japon : 92392
Manager : 91824
Mécanicien : 53768
Médical : 373492
Nuit : 192868
Prestataires : 15354
RENAULT : 359561
RTC : 43123
Regulation Medicale : 92274
SAP : 47661
Services : 410137
Tech. Axa : 416986
Tech. Inter : 427524
Tech. Total : 219987
Téléphonie : 7387175


# Implementing regressor

## Basic cross-validation with one regressor

In [8]:
print("------------------------")
print("--- CROSS VALIDATION ---")
print("")
print("Creating regressor...")
X_train, X_test, y_train, y_test = train_test_split(calls, y, test_size=82909)
print("Splitting train from test ok.")
S=Ridge()
print("Instanciating regressor ok.")
S.fit(X_train, y_train)
print("Training phase ok.")
y2=S.predict(X_test)
print("Prediction ok.")
y_test = np.array(y_test)
print("Pandas series to numpy array ok.")
loss = ef.linex_loss(y2, y_test)
print("Compute of loss ok : ", loss)

------------------------
--- CROSS VALIDATION ---

Creating regressor...
Splitting train from test ok.
Instanciating regressor ok.
Training phase ok.
Prediction ok.
Pandas series to numpy array ok.
Compute of loss ok :  1.48235997787e+13


## Multiple regressors over ASS_ASSIGNMENT category

In [None]:
map_regressor = {}
for i in range(len(assignments)):
    map_regressor[i] = RandomForestRegressor(n_estimators=300)
    
print("--- TRAINING ---")
X_train, X_test, y_train, y_test = train_test_split(calls, y, test_size=82909)
X_train_full = X_train
X_train_full['pred'] = y_train
X_test_full = X_test
X_test_full['pred'] = y_test
for i in range(len(assignments)):
    el_x = X_train_full[X_train_full['assignment']==i]
    el_y = el_x['pred']
    el_x = el_x.drop("pred", axis=1)
    map_regressor[i].fit(el_x, el_y)

--- TRAINING ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [32]:
error = 0
for i in range(len(assignments)):
    el_x = X_test_full[X_test_full['assignment']==i]
    size = len(el_x)
    el_y = el_x['pred']
    el_x = el_x.drop('pred', axis=1)
    y2 = map_regressor[i].predict(el_x)
    el_y = np.array(el_y)
    y2 = np.round(np.array(y2))

    error_tmp = ef.linex_loss(np.array(y2), np.array(el_y))
    print(map_assignment_inverse[i], " - ", (10 < np.abs(np.array(y2)-np.array(el_y))).sum(), "/", size)
    imax = np.argmax(np.abs(np.array(y2)-np.array(el_y)))
    print("\t\t\t\t", np.max(np.abs(np.array(y2)-np.array(el_y))), "\t", y2[imax], "\t", el_y[imax])
    error += error_tmp
    
print("size ", size)
print("loss ", error)

CAT  -  261 / 315
				 118.0 	 43.0 	 161
CMS  -  0 / 143
				 1.0 	 0.0 	 1
Crises  -  0 / 173
				 0.0 	 0.0 	 0
Domicile  -  59 / 1508
				 36.0 	 4.0 	 40
Evenements  -  6 / 8
				 248.0 	 208.0 	 456
Gestion  -  0 / 376
				 1.0 	 0.0 	 1
Gestion - Accueil Telephonique  -  12 / 954
				 38.0 	 2.0 	 40
Gestion Amex  -  0 / 81
				 2.0 	 0.0 	 2
Gestion Assurances  -  0 / 276
				 9.0 	 1.0 	 10
Gestion Clients  -  0 / 236
				 4.0 	 0.0 	 4
Gestion DZ  -  0 / 226
				 2.0 	 0.0 	 2
Gestion Relation Clienteles  -  0 / 262
				 7.0 	 0.0 	 7
Gestion Renault  -  0 / 364
				 2.0 	 0.0 	 2
Japon  -  0 / 653
				 5.0 	 0.0 	 5
Manager  -  0 / 703
				 1.0 	 0.0 	 1
Mécanicien  -  0 / 398
				 4.0 	 0.0 	 4
Médical  -  11 / 2850
				 18.0 	 2.0 	 20
Nuit  -  10 / 1404
				 54.0 	 1.0 	 55
Prestataires  -  0 / 118
				 1.0 	 0.0 	 1
RENAULT  -  11 / 2748
				 54.0 	 1.0 	 55
RTC  -  5 / 312
				 19.0 	 3.0 	 22
Regulation Medicale  -  0 / 749
				 7.0 	 0.0 	 7
SAP  -  0 / 364
				 10.0 

## Multiple regressors over SHIFT (Day/Night) category

In [61]:
print("------------------------")
print("--- CROSS VALIDATION ---")
print("")
separate_calls = calls
separate_calls['call'] = y
day_calls = separate_calls[separate_calls['shift']==1]
y_day = day_calls['call']
day_calls.drop('call', axis=1)
night_calls = separate_calls[separate_calls['shift']==0]
y_night = night_calls['call']
night_calls.drop('call', axis=1)

print("Creating regressor...")

X_day_train, X_day_test, y_day_train, y_day_test = train_test_split(day_calls, y_day, test_size=56590)
X_night_train, X_night_test, y_night_train, y_night_test = train_test_split(night_calls, y_night, test_size=26319)
print("Splitting train from test ok.")

day_regressor = Ridge()
night_regressor = Ridge()
print("Instanciating regressors (day/night) ok.")

day_regressor.fit(X_day_train, y_day_train)
night_regressor.fit(X_night_train, y_night_train)
print("Training phase ok.")

y2_night = night_regressor.predict(X_night_test)
X_night_test['pred']=y2_night
y2_day = day_regressor.predict(X_day_test)
X_day_test['pred']=y2_day
new_calls = pd.concat([X_night_test, X_day_test], axis=0)
y2 = new_calls['pred']
print("Prediction ok.")

y_test = np.array(y_test)
y2 = np.array(y2)
print("Pandas series to numpy array ok.")
loss = ef.linex_loss(y2, y_test)
print("Compute of loss ok : ", loss)

------------------------
--- CROSS VALIDATION ---

Creating regressor...
Splitting train from test ok.
Instanciating regressors (day/night) ok.
Training phase ok.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Prediction ok.
Pandas series to numpy array ok.
Compute of loss ok :  177141729136.0


## Regression over submission values

In [None]:
print("------------------")
print("--- SUBMISSION ---")
print("")
S=Ridge()
print("Instanciating regressor ok.")
S.fit(calls, y)
print("Training phase ok.")
y2=S.predict(submission)
print("Prediction ok.")
for i, el in np.ndenumerate(y2):
    if el < 0:
        y2[i] = 0
    else:
        y2[i] = int(math.floor(y2[i]))
print("Taking floor value of prediction")
submission['DATE'] = save_date
submission['ASS_ASSIGNMENT'] = save_ass_assignment
submission['prediction'] = y2
print("Adding date, ass_assignment, prediction to dataframe ok.")
submission = submission[['DATE', 'ASS_ASSIGNMENT', 'prediction']]
print("Dropping all other columns ok.")
submission.to_csv("submission_test.txt", sep="\t", index=False)
print("Writing to file ok.")