In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# System
import os

# Deep Learning
import tensorflow
from typing import Dict, List, Tuple, Sequence

from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, metrics
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.model_selection import train_test_split

In [None]:
# Location of the prepared dataset (CSV file under the user's home directory).
file_path="~/code/Agermita/intelligent_network_expansion/raw_data/data_finale_V4.csv"

# Read the comma-separated file into a DataFrame.
processed_data = pd.read_csv(file_path, sep=',')
# Last expression of the cell: display the column labels that were loaded.
processed_data.columns

In [None]:
# Remove the 'Unnamed: 0' column. The result is rebound instead of mutated in
# place, and errors='ignore' keeps the cell re-runnable: running it a second
# time (when the column is already gone) no longer raises a KeyError.
processed_data = processed_data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Parse the 'Date' column as datetimes using the explicit YYYY-MM-DD format.
processed_data['Date'] = pd.to_datetime(processed_data['Date'], format='%Y-%m-%d')
# Last expression of the cell: display the dtype of every column.
processed_data.dtypes

In [None]:
def create_X_y(df) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """Build per-cell feature and target arrays from the flat dataset.

    Every distinct ("eNodeB identity", "Cell ID", "eNodeB_identifier_int")
    combination is treated as one cell. For each cell the matching rows are
    ordered chronologically, the identity columns and 'Date' are removed, and
    'Trafic LTE.float' is separated out as the target.

    Args:
        df: DataFrame holding at least the columns "eNodeB identity",
            "Cell ID", "eNodeB_identifier_int", "Date" and "Trafic LTE.float".
            It is not modified.

    Returns:
        A tuple (cells, X, y):
        - cells: the distinct identity rows, ordered by 'eNodeB_identifier_int'
          (original index labels kept);
        - X: array stacking one feature block per cell, in the order of `cells`;
        - y: array stacking one target series per cell, with a trailing axis of
          size 1 added.

    Note:
        Stacking assumes every cell has the same number of rows; NumPy decides
        what happens otherwise.
    """
    id_columns = ["eNodeB identity", "Cell ID", "eNodeB_identifier_int"]
    target_column = "Trafic LTE.float"

    cells = df[id_columns].sort_values(by='eNodeB_identifier_int').drop_duplicates()

    data = []
    data_y = []
    for _, row in cells.iterrows():
        # Label-based lookup: positional access such as row[0] on a
        # label-indexed Series is deprecated in pandas 2.x.
        in_cell = (df["eNodeB identity"] == row["eNodeB identity"]) & (df["Cell ID"] == row["Cell ID"])
        # 'Date' is the secondary key so the time steps of each cell are in
        # chronological order; the identifier alone does not order them.
        df_cell = df[in_cell].sort_values(by=["eNodeB_identifier_int", "Date"])

        # For modeling, X and y must not contain the sequence identity
        # (eNodeB id and cell id) nor the date.
        data.append(df_cell.drop(columns=id_columns + ["Date", target_column]))
        data_y.append(df_cell[target_column])

    X = np.array(data)
    y = np.expand_dims(np.array(data_y), axis=2)
    return cells, X, y

In [None]:
# function to get data related to each cell in an array of 3 dimensions (nb cells, nb days, columns)
def get_cells_data(df) -> Tuple[pd.DataFrame, np.ndarray]:
    """Collect the rows of every cell into one stacked array.

    Every distinct ("eNodeB identity", "Cell ID", "eNodeB_identifier_int")
    combination is treated as one cell. For each cell the matching rows are
    ordered chronologically and the 'Trafic LTE.float' column is removed; all
    other columns (identity columns and 'Date' included) are kept.

    Args:
        df: DataFrame holding at least the columns "eNodeB identity",
            "Cell ID", "eNodeB_identifier_int", "Date" and "Trafic LTE.float".
            It is not modified.

    Returns:
        A tuple (cells, cells_data):
        - cells: the distinct identity rows, ordered by 'eNodeB_identifier_int'
          (original index labels kept);
        - cells_data: array stacking one block of rows per cell, in the order
          of `cells`.

    Note:
        Stacking assumes every cell has the same number of rows; NumPy decides
        what happens otherwise.
    """
    id_columns = ["eNodeB identity", "Cell ID", "eNodeB_identifier_int"]

    cells = df[id_columns].sort_values(by='eNodeB_identifier_int').drop_duplicates()

    data = []
    for _, row in cells.iterrows():
        # Label-based lookup: positional access such as row[0] on a
        # label-indexed Series is deprecated in pandas 2.x.
        in_cell = (df["eNodeB identity"] == row["eNodeB identity"]) & (df["Cell ID"] == row["Cell ID"])
        # 'Date' is the secondary key so the time steps of each cell are in
        # chronological order; the identifier alone does not order them.
        df_cell = (
            df[in_cell]
            .sort_values(by=["eNodeB_identifier_int", "Date"])
            .reset_index(drop=True)
            .drop(columns='Trafic LTE.float')
        )
        data.append(df_cell)

    cells_data = np.array(data)

    return cells, cells_data

In [None]:
# Gather the per-cell array for the whole dataset.
cells, cells_data=get_cells_data(processed_data)
# Wrap the first cell's block in a DataFrame. No column names are passed, so
# the columns of this frame are labelled by integer position.
cell_0_data=pd.DataFrame(cells_data[0])

In [None]:
def train_test_split(data_cell:pd.DataFrame,
                     train_test_ratio: float,
                     input_length: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split the time steps of one cell into a train and a test dataframe.

    - The train frame contains all the time steps before
      round(train_test_ratio * len(data_cell)).
    - The test frame starts `input_length` rows before that point, so the first
      test sequence can use the last train rows as its input window.

    Note: this definition rebinds the name `train_test_split` that the import
    cell brings in from sklearn.model_selection.

    Args:
        data_cell (pd.DataFrame): Time steps of a single cell, in order
        train_test_ratio (float): The ratio between train and test 0-1
        input_length (int): How long each X_i will be : 3 month 90 days

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (data_cell_train, data_cell_test)
    """
    # $CHALLENGIFY_BEGIN

    # TRAIN SET
    # ======================
    last_train_idx = round(train_test_ratio * len(data_cell))
    data_cell_train = data_cell.iloc[0:last_train_idx, :]

    # TEST SET
    # ======================
    # Clamp at 0: a negative start would be read by iloc as an offset from the
    # end of the frame and silently return only the last few rows.
    first_test_idx = max(0, last_train_idx - input_length)
    data_cell_test = data_cell.iloc[first_test_idx:, :]

    return (data_cell_train, data_cell_test)

    # $CHALLENGIFY_END

In [None]:
# Split the first cell's frame with a train ratio of 0.8 and an input length
# of 90 rows.
(data_cell0_train, data_cell0_test)=train_test_split(cell_0_data, 0.8, 90)
# 90 rows correspond to 90 days (one row per day is assumed — TODO confirm)

In [None]:
def get_Xi_yi(
    data_cell:pd.DataFrame, 
    input_length:int, 
    output_length:int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Draw one random (X_i, y_i) sequence from the time steps of a cell.

    The columns at positions 0, 1, 2 and 8 are discarded first. Among the
    remaining columns, the first one is the target: it is excluded from X_i and
    is the only column of y_i. The start of the sequence is drawn with
    np.random.randint, so seeding NumPy's global generator makes it repeatable.

    The input frame is left untouched, so the function can be called any number
    of times on the same frame.

    Args:
        data_cell (pd.DataFrame): Time steps of a single cell, in order
        input_length (int): How long each X_i should be --> 3 months
        output_length (int): How long each y_i should be --> 1 month

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (X_i, y_i), where y_i holds the
        `output_length` rows that directly follow the rows of X_i.

    Raises:
        ValueError: if the frame has fewer than input_length + output_length rows.
    """
    # Drop unnecessary columns "Date", "eNodeB identity", "Cell ID",
    # "eNodeB_identifier_int" (selected by position). A new frame is built
    # rather than dropping in place: mutating the caller's frame would remove
    # four more columns on every call.
    series = data_cell.drop(columns=data_cell.columns[[0, 1, 2, 8]])
    # $CHALLENGIFY_BEGIN
    first_possible_start = 0
    last_possible_start = len(series) - (input_length + output_length) + 1
    if last_possible_start <= first_possible_start:
        raise ValueError(
            f"data_cell has {len(series)} rows but at least "
            f"{input_length + output_length} are needed for one sequence"
        )
    random_start = np.random.randint(first_possible_start, last_possible_start)
    input_end = random_start + input_length

    # Every remaining column except the first (the target) forms the input.
    X_i = series.iloc[random_start:input_end, 1:]
    # Positional selection of the target column; the label-based `[[0]]` fails
    # because the column labelled 0 is one of those dropped above.
    y_i = series.iloc[input_end:input_end + output_length, [0]]

    return (X_i, y_i)
    # $CHALLENGIFY_END

In [None]:
def get_X_y(
    cell_data:pd.DataFrame,
    number_of_sequences:int,
    input_length:int,
    output_length:int) -> Tuple[np.array]:
    """Sample several (X_i, y_i) sequences from one cell and stack them.

    get_Xi_yi is called `number_of_sequences` times on `cell_data`; the inputs
    and the targets it returns are collected separately and converted to arrays.

    Args:
        cell_data (pd.DataFrame): Time steps of a single cell
        number_of_sequences (int): The number of X_i and y_i pairs to include
        input_length (int): Length of each X_i
        output_length (int): Length of each y_i

    Returns:
        Tuple[np.array]: A tuple of numpy arrays (X, y)
    """
    # $CHALLENGIFY_BEGIN
    sampled_pairs = [
        get_Xi_yi(cell_data, input_length, output_length)
        for _ in range(number_of_sequences)
    ]
    inputs = [pair[0] for pair in sampled_pairs]
    targets = [pair[1] for pair in sampled_pairs]

    return np.array(inputs), np.array(targets)
    # $CHALLENGIFY_END

In [None]:
# Number of random sequences to draw from the train and test frames.
N_TRAIN=9
# N_TEST was used below without ever being defined (NameError on a fresh
# kernel). NOTE(review): 3 is a placeholder value — adjust as needed.
N_TEST=3
# Length (in rows) of each input window and of each target window.
INPUT_LENGTH=90
OUTPUT_LENGTH=30
X_train, y_train = get_X_y(data_cell0_train, N_TRAIN, INPUT_LENGTH, OUTPUT_LENGTH)
X_test, y_test = get_X_y(data_cell0_test, N_TEST, INPUT_LENGTH, OUTPUT_LENGTH)

In [None]:
#Function to check if we have all days for each cell
def cells_dim(df) -> List[int]:
    """Count how many rows of `df` belong to each cell.

    Every distinct ("eNodeB identity", "Cell ID", "eNodeB_identifier_int")
    combination is treated as one cell; cells are visited in ascending
    'eNodeB_identifier_int' order.

    Args:
        df: DataFrame holding at least the three identity columns above.

    Returns:
        A list with one row count per cell, in visiting order.
    """
    cells = (
        df[["eNodeB identity", 'Cell ID', 'eNodeB_identifier_int']]
        .sort_values(by='eNodeB_identifier_int')
        .drop_duplicates()
    )
    cells_days = []
    for _, row in cells.iterrows():
        # Label-based lookup: positional access such as row[0] on a
        # label-indexed Series is deprecated in pandas 2.x.
        in_cell = (df["eNodeB identity"] == row["eNodeB identity"]) & (df["Cell ID"] == row["Cell ID"])
        cells_days.append(int(in_cell.sum()))
    return cells_days

In [None]:
# Number of rows available for every cell of the dataset.
cells_days=cells_dim(processed_data)

In [None]:
# Smallest and largest per-cell row count; equal values mean every cell has
# the same number of rows.
min(cells_days), max(cells_days)

In [None]:
# Build the cell list plus the stacked feature and target arrays for all cells.
cells,X_cells,y_cells=create_X_y(processed_data)

In [None]:
# Shapes of the stacked arrays returned by create_X_y. The arrays are named
# X_cells / y_cells; the bare names X / y are never assigned in this notebook.
X_cells.shape, y_cells.shape

In [None]:
def split_train_test(X, y, ratio=0.8) -> tuple:
    """Cut X and y in two along their first axis, without shuffling.

    Args:
        X: array-like of samples.
        y: array-like of targets.
        ratio: fraction of the leading entries that goes to the train part;
            the cut position is int(ratio * len(...)), computed separately for
            X and for y.

    Returns:
        (X_train, X_test, y_train, y_test)
    """
    def _head_tail(values):
        # np.split with a single cut position yields exactly two pieces.
        cut = int(ratio * len(values))
        head, tail = np.split(values, [cut])
        return head, tail

    X_train, X_test = _head_tail(X)
    y_train, y_test = _head_tail(y)
    return X_train, X_test, y_train, y_test

In [None]:
# Split the per-cell arrays 80/20 along the first axis. The arrays produced by
# create_X_y are named X_cells / y_cells; the bare names X / y are never
# assigned in this notebook.
X_train, X_test, y_train, y_test=split_train_test(X_cells, y_cells, ratio=0.8)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
def initialize_model(input_shape: tuple, output_length) -> "models.Sequential":
    """
    Initialize the Neural Network with random weights

    The network is a Sequential stack of three layers:
    a Masking layer (mask value -10), a GRU layer with 64 units that returns
    the full sequence, and a linear Dense layer with `output_length` units.

    Args:
        input_shape: shape of one sample, passed to the first layer
            (e.g. (X.shape[1], X.shape[2])).
        output_length: number of units of the final Dense layer
            (e.g. y_train.shape[1]).

    Returns:
        The uncompiled Sequential model.
    """
    model = models.Sequential()

    ## 1.1 - Recurrent Layer
    model.add(layers.Masking(mask_value=-10, input_shape=input_shape))
    # An LSTM variant with L1L2 kernel regularisation was tried here and
    # disabled; the GRU layer below is the one in use.
    model.add(layers.GRU(units=64,
                         activation='relu',
                         return_sequences=True))

    ## 1.2 - Hidden Dense Layers
    # A hidden Dense(32) + Dropout(0.5) stage was tried here and disabled.

    ## 1.3 - Predictive Dense Layers
    model.add(layers.Dense(output_length, activation='linear'))

    return model


def compile_model(model: "tensorflow.keras.Model") -> "tensorflow.keras.Model":
    """
    Compile the Neural Network

    The model is compiled with the MSE loss, the "adam" optimizer (given by
    name, so with the optimizer's default settings) and the MAE and MAPE
    metrics.

    Args:
        model: the model to compile; it is compiled in place.

    Returns:
        The same model object, compiled.
    """
    # 2 - Compiler
    # ======================
    # No custom learning rate is configured: the local `initial_learning_rate`
    # that used to be set here was never passed to the optimizer.
    model.compile(loss='mse', optimizer="adam", metrics=['mae', 'mape'])
    return model

def train_model(
        model: "tensorflow.keras.Model",
        X: np.ndarray,
        y: np.ndarray,
        patience=2,
        validation_data=None, # overrides validation_split
        validation_split_rate=0.3
    ) -> Tuple["tensorflow.keras.Model", "tensorflow.keras.callbacks.History"]:
    """
    Fit the model and return a tuple (fitted_model, history)

    Training runs for at most 50 epochs with batches of 32 samples and no
    shuffling. Early stopping watches `val_loss` and restores the best weights.

    Args:
        model: compiled model to fit (fitted in place).
        X: training inputs.
        y: training targets.
        patience: number of epochs without `val_loss` improvement tolerated by
            early stopping.
        validation_data: explicit validation set passed to `fit`. When given,
            it is used instead of `validation_split_rate`.
        validation_split_rate: fraction of (X, y) held out for validation when
            `validation_data` is None.

    Returns:
        (model, history) where history is the object returned by `model.fit`.
    """
    # $CHALLENGIFY_BEGIN
    es = EarlyStopping(monitor="val_loss",
                       patience=patience,
                       mode="min",
                       restore_best_weights=True)

    # Honour an explicit validation set; it used to be accepted but ignored.
    if validation_data is not None:
        validation_kwargs = {"validation_data": validation_data}
    else:
        validation_kwargs = {"validation_split": validation_split_rate}

    history = model.fit(X, y,
                        shuffle=False,
                        batch_size=32,
                        epochs=50,
                        callbacks=[es],
                        verbose=1,
                        **validation_kwargs)

    return model, history

In [None]:
# Shape of one training sample: axes 1 and 2 of X_train.
input_shape =(X_train.shape[1],X_train.shape[2])
# Size of axis 1 of y_train, used as the number of output units.
output_length = y_train.shape[1]
# Last expression of the cell: display both values.
input_shape, output_length

In [None]:
# Build the (uncompiled) network for the dimensions computed above.
model=initialize_model(input_shape, output_length)

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Attach loss, optimizer and metrics to the network.
model=compile_model(model)

In [None]:
# Fit on the training arrays. The positional arguments after y_train are, in
# order: patience, validation_data and validation_split_rate.
model, history=train_model(
        model,
        X_train,
        y_train,
        2,
        None, # don't use validation data, use validation split rate
        0.3
    )

In [None]:
def plot_history(history):
    """Plot the training and validation curves stored in a fit history.

    Two panels are drawn side by side: the loss ('loss' / 'val_loss', titled
    MSE) on the left and the mean absolute error ('mae' / 'val_mae') on the
    right.

    Args:
        history: object whose `.history` mapping holds the four series above.

    Returns:
        The pair of axes created for the figure.
    """
    fig, ax = plt.subplots(1, 2, figsize=(20, 7))

    # (axes, train key, validation key, title, y-axis label) for each panel.
    panels = (
        (ax[0], 'loss', 'val_loss', 'MSE', 'Loss'),   # --- LOSS: MSE ---
        (ax[1], 'mae', 'val_mae', 'MAE', 'MAE'),      # --- METRICS: MAE ---
    )

    for panel, train_key, val_key, title, y_label in panels:
        panel.plot(history.history[train_key])
        panel.plot(history.history[val_key])
        panel.set_title(title)
        panel.set_ylabel(y_label)
        panel.set_xlabel('Epoch')
        panel.legend(['Train', 'Validation'], loc='best')
        for direction in ("x", "y"):
            panel.grid(axis=direction, linewidth=0.5)

    return ax

In [None]:
# Draw the learning curves; the trailing semicolon hides the returned axes
# from the cell output.
plot_history(history);

In [None]:
# Evaluate the fitted model on the held-out arrays.
res = model.evaluate(X_test, y_test)

In [None]:
# Display the evaluation result (the model was compiled with an MSE loss and
# the MAE and MAPE metrics).
res

In [None]:
# Predict for every cell. The stacked feature array returned by create_X_y is
# named X_cells; the bare name X is never assigned in this notebook.
y_pred=model.predict(X_cells)

In [None]:
# Display the shape of the prediction array.
y_pred.shape

In [None]:
def data_hist_predicted(y, y_pred, cells, start_date, end_date):
    """Combine observed and predicted traffic into one long-format DataFrame.

    Observed values are labelled with the daily dates from `start_date` to
    `end_date`; predicted values with the days that follow `end_date`.

    Args:
        y: observed traffic, one row per cell. Any trailing axes are flattened
            so that the result has y.shape[1] columns (e.g. (cells, days, 1)).
        y_pred: predicted traffic, shaped (cells, horizon) or
            (cells, horizon, 1).
        cells: DataFrame with the columns 'eNodeB identity', 'Cell ID' and
            'eNodeB_identifier_int', one row per cell, in the same order as the
            rows of `y` and `y_pred`.
        start_date: date of the first observed column.
        end_date: date of the last observed column.

    Returns:
        DataFrame with the three identity columns plus:
        - 'Date': the day the value refers to;
        - 'Trafic': the traffic value;
        - 'period_trafic': "real trafic" or "predicted trafic".
        Rows are ordered by date, then by cell order.

    Raises:
        ValueError: if `y_pred` cannot be read as a 2-D array, or if the number
            of observed columns differs from the number of days between
            `start_date` and `end_date` (inclusive).
    """
    id_columns = ['eNodeB identity', 'Cell ID', 'eNodeB_identifier_int']

    # Reshape y to 2-D: (rows, time steps).
    y_real_2d = np.asarray(y)
    y_real_2d = y_real_2d.reshape(-1, y_real_2d.shape[1])

    y_pred_2d = np.asarray(y_pred)
    if y_pred_2d.ndim == 3 and y_pred_2d.shape[2] == 1:
        y_pred_2d = y_pred_2d[:, :, 0]
    if y_pred_2d.ndim != 2:
        raise ValueError(
            f"y_pred must be shaped (cells, horizon) or (cells, horizon, 1), got {y_pred_2d.shape}"
        )

    history_dates = pd.date_range(start=start_date, end=end_date)
    if len(history_dates) != y_real_2d.shape[1]:
        raise ValueError(
            f"y has {y_real_2d.shape[1]} time steps but {len(history_dates)} days "
            "lie between start_date and end_date"
        )
    # pandas arithmetic replaces datetime.timedelta: `datetime` is not imported
    # by this notebook.
    future_dates = pd.date_range(
        start=pd.Timestamp(end_date) + pd.Timedelta(days=1),
        periods=y_pred_2d.shape[1],
    )

    # Positional index so the identity rows line up with the value rows.
    cell_ids = cells[id_columns].reset_index(drop=True)

    def _to_long(values, dates, label):
        # One wide frame (a column per date) melted to one row per (cell, date).
        wide = pd.concat([cell_ids, pd.DataFrame(values, columns=dates)], axis=1, sort=False)
        long = wide.melt(id_vars=id_columns, var_name="Date", value_name="Trafic")
        long["Date"] = pd.to_datetime(long["Date"])
        # Column distinguishing real traffic from predicted traffic. It is
        # added after the melt: adding it to the wide frames made the number of
        # columns differ from the number of date labels.
        long["period_trafic"] = label
        return long

    return pd.concat(
        [
            _to_long(y_real_2d, history_dates, "real trafic"),
            _to_long(y_pred_2d, future_dates, "predicted trafic"),
        ],
        ignore_index=True,
    )

In [None]:
# First and last dates present in the dataset.
start_date=processed_data['Date'].min()
end_date=processed_data['Date'].max()
# The stacked target array returned by create_X_y is named y_cells; the bare
# name y is never assigned in this notebook.
df_trafic_predicted=data_hist_predicted(y_cells, y_pred, cells, start_date, end_date)

In [None]:
# Destination of the exported table; this rebinds `file_path`, which earlier
# held the input file location.
file_path="~/code/Agermita/intelligent_network_expansion/raw_data/data_finale_prediction.csv"

# Write the table as a semicolon-separated file (the input file was read with
# a comma separator).
df_trafic_predicted.to_csv(file_path, sep=';')