In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap
import pylab
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras import optimizers
from keras.layers import Activation
from keras.layers import Dropout, Dense, LSTM
from keras.optimizers import Adam

In [None]:
# Load the training data; both timestamp columns are parsed as datetimes.
data = pd.read_csv(
    'CL_megafon_Cup IT_2020_final_data.csv',
    sep=';',
    parse_dates=['time_start', 'time_end'],
)

In [None]:
# Load the test data; only the start timestamp is parsed as a datetime.
test = pd.read_csv(
    'test.csv',
    sep=';',
    parse_dates=['time_start'],
)

In [None]:
# Copy of the test frame with calendar features derived from `time_start`,
# re-indexed by `time_start`. The vectorised `.dt` accessors replace the
# per-row Python lambdas.
predictions = test.copy()
predictions['month_start'] = predictions.time_start.dt.month
# pandas counts Monday as 0; +1 shifts the range to 1..7
predictions['day_of_week_start'] = predictions.time_start.dt.dayofweek + 1
predictions['hour_start'] = predictions.time_start.dt.hour
predictions = predictions.set_index('time_start')

In [None]:
# Prepare the training data for time-series work: calendar features from
# `time_start` plus the target `gap` (duration between start and end, in seconds).
data['month_start'] = data.time_start.dt.month
# pandas counts Monday as 0; +1 shifts the range to 1..7
data['day_of_week_start'] = data.time_start.dt.dayofweek + 1
data['hour_start'] = data.time_start.dt.hour
# total_seconds() keeps whole days; Timedelta.seconds only holds the
# sub-day remainder and would silently wrap durations of 24h or more.
data['gap'] = (data.time_end - data.time_start).dt.total_seconds()
data = data.drop(['time_end'], axis = 1)
data = data.sort_values('time_start')

In [None]:
# Calendar features for the test frame, computed the same way as for the
# training data (vectorised `.dt` accessors instead of per-row lambdas).
test['month_start'] = test.time_start.dt.month
# pandas counts Monday as 0; +1 shifts the range to 1..7
test['day_of_week_start'] = test.time_start.dt.dayofweek + 1
test['hour_start'] = test.time_start.dt.hour

In [None]:
# Preview the first rows of the DataFrame with the columns that carry useful information
data.head()

In [None]:
# Work on a copy so that the prepared `data` frame is preserved
newdata = data.copy()

In [None]:
# Per-feature minimum and maximum over the training data; these bounds drive the
# min-max normalisation (and its inversion for `gap`) in later cells.
min_place_latitude, max_place_latitude = newdata['place_latitude'].min(), newdata['place_latitude'].max()
min_place_longitude, max_place_longitude = newdata['place_longitude'].min(), newdata['place_longitude'].max()
min_m, max_m = newdata['month_start'].min(), newdata['month_start'].max()
min_d, max_d = newdata['day_of_week_start'].min(), newdata['day_of_week_start'].max()
min_h, max_h = newdata['hour_start'].min(), newdata['hour_start'].max()
min_gap, max_gap = newdata['gap'].min(), newdata['gap'].max()

In [None]:
# Min-max normalisation helpers
def _min_max(value, low, high):
    """Min-max scale ``value`` from the range [low, high] onto [0, 1].

    A zero-width range (``low == high``) maps to 0.0 instead of dividing by zero.
    """
    span = high - low
    if span == 0:
        return 0.0
    return (value - low) / span


def scaled(x):
    """Return a min-max normalised copy of a training row.

    Scales the same five feature columns as ``scaled_test`` and additionally
    the target column ``gap``, using the module-level ``min_gap`` / ``max_gap``.

    Parameters
    ----------
    x : pandas.Series
        Row with the labels ``place_latitude``, ``place_longitude``,
        ``month_start``, ``day_of_week_start``, ``hour_start`` and ``gap``.

    Returns
    -------
    pandas.Series
        A new row; ``x`` itself is left unmodified.
    """
    r = scaled_test(x)
    r['gap'] = _min_max(r['gap'], min_gap, max_gap)
    return r


def scaled_test(x):
    """Return a min-max normalised copy of a row without a target column.

    Columns are addressed by label rather than by position, so the result
    does not depend on the column order of the frame the row comes from.
    The bounds are the module-level ``min_*`` / ``max_*`` values.

    Parameters
    ----------
    x : pandas.Series
        Row with the labels ``place_latitude``, ``place_longitude``,
        ``month_start``, ``day_of_week_start`` and ``hour_start``.

    Returns
    -------
    pandas.Series
        A new row; ``x`` itself is left unmodified.
    """
    # Copy first: writing into the row handed over by DataFrame.apply would
    # mutate the caller's data.
    r = x.copy()
    r['place_latitude'] = _min_max(r['place_latitude'], min_place_latitude, max_place_latitude)
    r['place_longitude'] = _min_max(r['place_longitude'], min_place_longitude, max_place_longitude)
    r['month_start'] = _min_max(r['month_start'], min_m, max_m)
    r['day_of_week_start'] = _min_max(r['day_of_week_start'], min_d, max_d)
    r['hour_start'] = _min_max(r['hour_start'], min_h, max_h)
    return r

In [None]:
# Build the training frame with normalised values by scaling every row of `newdata`
train_data = newdata.apply(scaled, axis='columns')

In [None]:
# Normalise every test row with the bounds taken from the training data
test_data = test.apply(scaled_test, axis='columns')

In [None]:
# Build the clustering input: one row per distinct (latitude, longitude) pair,
# then fit K-Means on those locations and attach the resulting label.
location_columns = ['place_latitude', 'place_longitude']
grouped_locations = train_data.groupby(location_columns).agg({'gap': 'mean'}).reset_index()
claster_data = grouped_locations[location_columns]
clf = KMeans(n_clusters=13, init='k-means++', random_state=241)
clf.fit(claster_data)
claster_data['claster'] = pd.Series(clf.labels_)

In [None]:
# Attach the cluster label of each location to the training rows,
# then move `time_start` from a column into the index.
train_data = (
    train_data
    .merge(claster_data, on=['place_latitude', 'place_longitude'])
    .set_index('time_start')
)

In [None]:
# Assign the predicted labels as a plain array (row order is preserved by predict).
# Wrapping them in pd.Series would align on a fresh 0..n-1 index and produce
# NaN labels whenever test_data is not indexed 0..n-1 (e.g. on a cell re-run
# after the index has been switched to time_start).
test_data['claster'] = clf.predict(test_data[['place_latitude', 'place_longitude']])

In [None]:
# Move `time_start` from a column into the index
test_data = test_data.set_index('time_start')

In [None]:
# Our prepared data
train_data

In [None]:
# Scatter plot of the (scaled) training locations, one colour per cluster.
categories = np.unique(train_data['claster'])
# Index tab20 by integer so each cluster gets its own palette entry. Sampling
# the 10-colour tab10 map with i/(n-1) gave several of the 13 clusters the same
# colour and divided by zero when only one cluster was present.
colors = [plt.cm.tab20(i % 20) for i in range(len(categories))]
plt.figure(figsize=(16, 10), dpi= 80, facecolor='w', edgecolor='k')
for i, category in enumerate(categories):
    # `color=` (not `c=`): a bare RGBA tuple passed as `c` is ambiguous and is
    # treated as four data values when a cluster happens to hold four points.
    plt.scatter('place_latitude', 'place_longitude',
                data=train_data.loc[train_data.claster==category].iloc[:,:2],
                s=20, color=colors[i], label=str(category))

plt.xlabel('place_latitude', fontsize=12); plt.ylabel('place_longitude', fontsize=12)
plt.xticks(fontsize=12); plt.yticks(fontsize=12)
plt.title("Scatterplot of Area Location", fontsize=22)
plt.legend(fontsize=12)
plt.show()

In [None]:
# Scatter plot of the (scaled) test locations, one colour per predicted cluster.
categories = np.unique(test_data['claster'])
# Index tab20 by integer so each cluster gets its own palette entry. Sampling
# the 10-colour tab10 map with i/(n-1) gave several of the 13 clusters the same
# colour and divided by zero when only one cluster was present.
colors = [plt.cm.tab20(i % 20) for i in range(len(categories))]
plt.figure(figsize=(16, 10), dpi= 80, facecolor='w', edgecolor='k')
for i, category in enumerate(categories):
    # `color=` (not `c=`): a bare RGBA tuple passed as `c` is ambiguous and is
    # treated as four data values when a cluster happens to hold four points.
    plt.scatter('place_latitude', 'place_longitude',
                data=test_data.loc[test_data.claster==category].iloc[:,:2],
                s=20, color=colors[i], label=str(category))

plt.xlabel('place_latitude', fontsize=12); plt.ylabel('place_longitude', fontsize=12)
plt.xticks(fontsize=12); plt.yticks(fontsize=12)
plt.title("Scatterplot of Area Location", fontsize=22)
plt.legend(fontsize=12)
plt.show()

In [None]:
!pip install tensorflow==1.14

In [None]:
# Container for the per-cluster models; filled by index in the training loop below
model = []

In [None]:
# Basic MODEL to predict time: one stacked-LSTM regressor is trained per
# location cluster and stored in `model` at the cluster's index.
for i in range(13):
    # Select this cluster's rows once; the features are every column except
    # the target (`gap`) and the cluster label.
    cluster_rows = train_data.query('claster==@i')
    X = cluster_rows.drop(['gap', 'claster'], axis=1)
    y = cluster_rows.gap
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values
    # Reshape to (samples, 1, features): every row is fed as a one-step sequence
    X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    net = Sequential()
    net.add(Activation('linear'))
    net.add(LSTM(128, return_sequences=True,
                 input_shape=(X_train.shape[1], X_train.shape[2])))
    net.add(LSTM(64, return_sequences=True))
    net.add(LSTM(64, return_sequences=True))
    net.add(LSTM(32))
    net.add(Activation('softsign'))
    net.add(Dropout(0.5))
    net.add(Dense(1, activation='tanh'))
    adam = optimizers.Adam(lr=0.0003, amsgrad=False)
    net.compile(loss='mean_absolute_percentage_error', optimizer=adam)
    # Store the model at index i. Overwriting an existing slot (instead of
    # appending a '' placeholder first) keeps `model` free of junk entries
    # when this cell is run more than once.
    if i < len(model):
        model[i] = net
    else:
        model.append(net)
    history = net.fit(X_train, y_train, epochs=20, batch_size=8,
                      validation_data=(X_test, y_test), verbose=1, shuffle=False)
    # Training vs. validation loss per epoch for this cluster
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()

In [None]:
# MAPE metric function
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both arguments are converted to NumPy arrays; the result is the mean of
    ``|y_true - y_pred| / |y_true|`` expressed in percent. Zeros in ``y_true``
    are not special-cased.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_errors = np.abs((actual - forecast) / actual)
    return relative_errors.mean() * 100

In [None]:
# Checking of score: per-cluster MAPE in the original `gap` units.
# NOTE(review): the metric is computed on the training split; the held-out
# split (X_test1, y_test) returned by train_test_split is not evaluated here.
for i in range(13):
    cluster_rows = train_data.query('claster==@i')
    X = cluster_rows.drop(['gap', 'claster'], axis=1)
    y = cluster_rows.gap
    X_train, X_test1, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Insert the single-timestep axis: (samples, features) -> (samples, 1, features)
    X_train = np.expand_dims(X_train.values, axis=1)
    # Undo the min-max scaling of predictions and targets
    y_pred = model[i].predict(X_train)[:, 0]
    y_pred = y_pred * (max_gap - min_gap) + min_gap
    y_train = y_train.values * (max_gap - min_gap) + min_gap
    print(y_pred[:10] - y_train[:10])
    print(i, mean_absolute_percentage_error(y_train, y_pred))

In [None]:
# Split the test rows by cluster and remember how many rows each cluster holds
X_data = []
Shapes = []
for i in range(13):
    cluster_frame = test_data.query('claster==@i')
    X_data.append(cluster_frame)
    Shapes.append(cluster_frame.shape[0])

In [None]:
# Pre-allocate one zero-filled model-input array per cluster with shape
# (rows, 1, 5). The row counts come from `Shapes` (computed from test_data)
# instead of hand-copied literals, so the arrays always fit the data at hand.
X_test = [np.zeros((n_rows, 1, 5)) for n_rows in Shapes]

In [None]:
# Per-cluster feature arrays of the test rows, shaped (rows, 1, features)
X_pred = []
for i in range(13):
    features = test_data.query('claster==@i').drop(['claster'], axis=1).values
    X_pred.append(features.reshape((features.shape[0], 1, features.shape[1])))

In [None]:
# Copy each cluster's features into its pre-allocated input array with a single
# slice assignment (replaces a four-level element-by-element Python loop).
# Rows of X_test[i] beyond len(X_pred[i]) keep their zeros, as before.
for i in range(13):
    n_rows = len(X_pred[i])
    X_test[i][:n_rows] = X_pred[i]

In [None]:
# Run each cluster's model on its input array; keep the single output value per
# row and only as many rows as the cluster actually has.
X_pred = []
for i in range(13):
    raw_output = model[i].predict(X_test[i])
    X_pred.append(raw_output[:, 0][:Shapes[i]])

In [None]:
# Undo the min-max scaling of the target, then divide by 60
# (seconds -> minutes, given that `gap` is measured in seconds).
gap_span = max_gap - min_gap
for i in range(13):
    X_pred[i] = (X_pred[i] * gap_span + min_gap) / 60

In [None]:
# Attach the predictions to each cluster's test rows. `assign` builds a new
# frame instead of setting a column on the result of `query`, which avoids
# pandas' SettingWithCopyWarning.
for i in range(13):
    X_data[i] = X_data[i].assign(pred=X_pred[i])

In [None]:
# Stack the per-cluster frames in one concat call. Growing a frame inside the
# loop from an empty, object-typed DataFrame was quadratic and degraded the
# column dtypes; the explicit selection keeps the documented column order.
pred_columns = ['time_start', 'place_latitude', 'place_longitude', 'month_start',
                'day_of_week_start', 'hour_start', 'claster', 'pred']
pred_data_frame = pd.concat([X_data[i].reset_index() for i in range(13)])[pred_columns]

In [None]:
# Bring the predictions back into the row order of test_data.
# Test rows that share all key values appear several times on both sides; an
# unrestricted merge would pair them many-to-many and return more rows than
# test_data has, breaking the index-aligned assignments in the following cells.
# Rows with identical keys have identical model inputs, so keeping one
# prediction per key loses nothing.
merge_keys = ['time_start', 'place_latitude', 'place_longitude', 'month_start',
              'day_of_week_start', 'hour_start', 'claster']
unique_preds = pred_data_frame.drop_duplicates(subset=merge_keys)
predictions = (
    test_data.reset_index()
    # how='left' keeps exactly one output row per test row, in the original order
    .merge(unique_preds, on=merge_keys, how='left', validate='many_to_one')
    .drop(['month_start', 'day_of_week_start', 'hour_start', 'claster'], axis=1)
)

In [None]:
# Restore the unscaled coordinates from `test` (aligned on the row index)
coordinate_columns = ['place_latitude', 'place_longitude']
predictions[coordinate_columns] = test[coordinate_columns]

In [None]:
# Output frame: the test data without the derived calendar features
# (`drop` returns a new frame, so `test` itself is not modified)
new = test.drop(columns=['month_start', 'day_of_week_start', 'hour_start'])

In [None]:
# Add the predictions as column `predict` (aligned on the row index)
new = new.assign(predict=predictions['pred'])

In [None]:
# Preview the first rows of the output frame
new.head()

In [None]:
# Write the result to CSV (default settings, so the row index is written as the first column)
new.to_csv(path_or_buf='CupIT2020-DS-SofaAnalyticssss.csv')