In [None]:
#importing modules

import warnings 
warnings.filterwarnings('ignore')

import time
t = time.time()

print('Importing startred...')

# base libraries
import os
import numpy as np
import pandas as pd
import re
from scipy import stats
from random import randint
from datetime import datetime


# visualization libraries
import matplotlib.pyplot as plt
import matplotlib 
%matplotlib inline
import seaborn as sns
import missingno as msno
import plotly.express as px


# preprocessing libraries


from sklearn.model_selection import (TimeSeriesSplit,
                                     GridSearchCV,
                                     RandomizedSearchCV,
                                     train_test_split, 
                                     KFold, 
                                     StratifiedKFold,
                                    cross_val_score)

from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler, 
                                   MinMaxScaler, 
                                   OrdinalEncoder)

from sklearn.feature_selection import SelectFromModel


# metrics
from sklearn.metrics import (mean_squared_error, 
                             r2_score, 
                             mean_absolute_error)
from sklearn.metrics import make_scorer


# modeling algos
from sklearn.linear_model import (LogisticRegression,
                                  Lasso, 
                                  ridge_regression,
                                  LinearRegression)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (AdaBoostRegressor, 
                              RandomForestRegressor,
                              VotingRegressor, 
                              GradientBoostingRegressor)
from xgboost import XGBRegressor
from lightgbm import (LGBMRegressor,
                      early_stopping)

from sklearn.base import clone ## sklearn base models for stacked ensemble model


# Interpretability of the model
# import shap
# import eli5
# from eli5.sklearn import PermutationImportance


## misc
from sklearn.pipeline import make_pipeline
print('Done, All the required modules are imported. Time elapsed: {}sec'.format(time.time()-t))


# Data Loading and Initial Insights

In [None]:
# Read the customers, pings and test CSV files with the same parser options.
csv_options = {'delimiter': ',', 'encoding': 'utf-8'}
customer = pd.read_csv('../input/mobile-usage-time-prediction/customers.csv', **csv_options)
pings = pd.read_csv('../input/mobile-usage-time-prediction/pings.csv', **csv_options)
test_df = pd.read_csv('../input/mobile-usage-time-prediction/test.csv', **csv_options)

In [None]:
# Preview the first rows of the customer table.
customer.head()

In [None]:
# Column names, dtypes and non-null counts of the customer table.
customer.info()

In [None]:
# Preview the first rows of the raw pings table.
pings.head()

In [None]:
# Plain-text overview: the shape of each table, then the first rows of each.
rule = '\n' + '*' * 50 + '\n'
tables = (
    ('Shape of the Customer database: {}', ' Head of Customer database ' + '\n', customer),
    ('Shape of the Pings dataset: {}', 'Head of Ping dataset' + '\n', pings),
    ('Shape of the Test dataset: {}', 'Head of Test dataset' + '\n', test_df),
)

print('$' * 10 + ' Data Summary and Inital peaks ' + '$' * 10)
print('\n' + '$' * 15 + 'Shapes of Data ' + '$' * 15 + '\n')

# Shapes first ...
for shape_template, _, table in tables:
    print(shape_template.format(table.shape))

print(rule)

# ... then a titled preview of every table, each followed by a separator.
for _, head_title, table in tables:
    print(head_title)
    print(table.head())
    print(rule)

**value sorting with respect to id and timestamp**

# EXTRACTING USEFUL INFORMATION FROM HIDDEN DATA

In [None]:
# Drop exact duplicate rows, then order the pings chronologically within each id.
# Built as a new frame (no inplace mutation) so the cell can be re-run safely.
pings = (
    pings
    .drop_duplicates()
    .sort_values(by=['id', 'timestamp'])
    .reset_index(drop=True)
)

# Decode the epoch-second timestamps in one vectorized call. The result is
# timezone-naive UTC, so the calendar dates derived from it do not depend on the
# timezone of the machine running the kernel (datetime.fromtimestamp would use
# the local timezone and was applied row by row).
pings['timestamp_decode'] = pd.to_datetime(pings['timestamp'], unit='s')
pings.head()

In [None]:
## EXTRACTING DATE
# Keep only the calendar-date part of every decoded timestamp.
decoded_times = pings['timestamp_decode']
pings['date'] = decoded_times.dt.date
pings.head()

In [None]:
# EXTRACTING HOURS SPENT ONLINE
# Hours elapsed since the previous ping with the same id on the same date.
gap_hours = (pings.groupby(by=['id', 'date'])['timestamp'].diff()) / (60 * 60)

# Cap every gap at 2 minutes (2/60 h). Any gap that is not strictly below the
# cap becomes exactly the cap -- this includes the NaN that diff() yields for the
# first ping of each (id, date) group, because NaN < cap evaluates to False.
max_gap_hours = 2 / 60
pings['online_hours'] = gap_hours.where(gap_hours < max_gap_hours, max_gap_hours)
pings.head()

In [None]:
# Replace every remaining missing value in the pings table with 0.
pings = pings.fillna(0)
# MAKING DATA CAPABLE FOR MERGING

# **Creating our training data**

In [None]:
# Rows sharing the same "id" and "date" are grouped together and their
# online_hours are summed, giving one row per id per calendar date.
train_df = pings.groupby(by=['id', 'date'])['online_hours'].sum().reset_index()

train_df['online_hours'] = train_df['online_hours'].round(1)  # round hours to one decimal

# The message now describes this step (it previously repeated the import cell's text).
print('Done, training data is prepared. Time elapsed: {}sec'.format(time.time() - t))

# Kept as the cell's last expression so the notebook actually renders the preview.
train_df.head()

# Installing AutoViz, a library for automated visualization

In [None]:
# # installing autoviz for simple insights
!pip install autoviz #insatlling autoviz
!pip install xlrd
from autoviz.AutoViz_Class import AutoViz_Class

In [None]:
# Run AutoViz directly on the customers CSV file and keep whatever it returns.
autoviz_runner = AutoViz_Class()
autoviz = autoviz_runner.AutoViz('../input/mobile-usage-time-prediction/customers.csv')


In [None]:
# Compare the date ranges covered by the train and test sets before the two
# frames are combined.
divider = '\n' + '*' * 50 + '\n'
train_first, train_last = train_df.date.min(), train_df.date.max()
test_first, test_last = test_df.date.min(), test_df.date.max()

print(divider)
print('Minimum date and Maximum date for train dataset: {},{}'.format(train_first, train_last))
print('Minimum date and Maximum date for test dataset: {},{}'.format(test_first, test_last))
print(divider)
print('this step took: {}sec'.format(time.time()-t))

# more work on data


In [None]:
# MERGING THE CUSTOMER DATABASE WITH THE PINGS DATABASE
# Outer-join the customer attributes with the per-day totals on "id", then drop
# every row that has a missing value in any column.
temp_df = pd.merge(left=customer, right=train_df, on='id', how='outer').dropna()
temp_df['gender'] = temp_df['gender'].replace({'MALE': 1, 'FEMALE': 0})  # encode gender as 1/0

# CONVERTING DATE TO DATETIME TYPE
temp_df['date'] = pd.to_datetime(temp_df['date'])

## date and time related basic features
date_parts = temp_df['date'].dt
temp_df['day_name'] = date_parts.day_name()
temp_df['day'] = date_parts.day
temp_df['month'] = date_parts.month
temp_df['month_name'] = date_parts.month_name()
temp_df['year'] = date_parts.year
temp_df['dayofweek'] = date_parts.dayofweek
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week gives the same ISO week number (cast back to a plain int).
temp_df['week'] = date_parts.isocalendar().week.astype(int)

# Integer codes for the weekday / month names. Every code must be an int:
# 'Monday' used to map to the string '1', which made the mapped column a mix of
# ints and a string instead of a numeric feature.
week_names = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
              'Thursday': 4, 'Friday': 5, 'Saturday': 6}
month_names = {'January': 0, 'February': 1, 'March': 2, 'April': 3, 'May': 4, 'June': 5, 'July': 6,
               'August': 7, 'September': 8, 'October': 9, 'November': 10, 'December': 11}

temp_df['day_name'] = temp_df['day_name'].map(week_names)
temp_df['month_name'] = temp_df['month_name'].map(month_names)

In [None]:
# Preview the merged frame with the new date features.
temp_df.head()

In [None]:
# Histogram of online_hours over age, coloured by gender, with a marginal box plot.
age_hist_options = dict(x='age', y='online_hours', color='gender', marginal='box')
fig = px.histogram(temp_df, **age_hist_options)
fig.show()

It can easily be seen that males tend to have more screen time than females.

In [None]:
# Histogram of online_hours over number_of_kids, coloured by gender, with a marginal box plot.
kids_hist_options = dict(x='number_of_kids', y='online_hours', color='gender', marginal='box')
fig = px.histogram(temp_df, **kids_hist_options)
fig.show()

In [None]:
# Online hours by day of week (top panel) and by day of month (bottom panel).
# seaborn >= 0.12 no longer accepts x/y positionally, so the columns are passed
# by keyword together with data=.
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(10, 13))
sns.barplot(x="dayofweek", y="online_hours", data=temp_df, ax=ax0)
sns.barplot(x="day", y="online_hours", data=temp_df, ax=ax1);


In [None]:
# Online hours against age, one line per gender.
sns.lineplot(data=temp_df, x='age', y='online_hours', hue='gender')

In [None]:
# plt.plot(x='time', y='online_hours', data=temp_df)#, hue='gender'
# sns.stripplot(temp_df.time, temp_df.online_hours, jitter=0.25, size=8, linewidth=.5)

# GOING WITH REGULAR APPROACH OF REGRESSORS

In [None]:
### solving model like a typical regression problem without any considerations and feature engineering
# Target is online_hours; identifiers, the raw date and the target itself are
# removed from the feature matrix.
non_feature_columns = ['date', 'id', 'online_hours']
y = temp_df['online_hours']
X = temp_df.drop(columns=non_feature_columns)

# train, testing data spliting (shuffled, 80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2021,
)

In [None]:
# Preview of the full frame the feature matrix was taken from.
temp_df.head()

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so the
# shape is printed explicitly; the head() preview stays as the cell output.
print(X_train.shape)
X_train.head()

In [None]:
# scaling
# Learn the min/max ranges from the training split only, then apply the same
# ranges to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Small feed-forward regressor on the scaled baseline features.
input_shape = [X_train.shape[1]]
model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(20, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(8, activation='relu'),
    # One linear output unit: the target is a single number per row
    # (the previous 10-unit output layer did not match the scalar target).
    layers.Dense(1),
])

model.compile(
    loss='mse',
    optimizer='adam',
    # The loss already is the MSE; tracking MAE as the metric makes
    # model.evaluate() return (mse, mae) instead of the MSE twice.
    metrics=['mean_absolute_error'],
)

# Stop once the monitored validation loss has not improved by at least 0.01 for
# 5 epochs, and restore the best weights. Named *_cb so it does not shadow the
# `early_stopping` function imported from lightgbm at the top of the notebook.
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.01,
    restore_best_weights=True,
)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,  # last 20% of the training data is held out for validation
    batch_size=512,
    epochs=50,
    callbacks=[early_stopping_cb],
)

In [None]:
# Training vs. validation loss per epoch.
history_df = pd.DataFrame(history.history)
loss_curves = history_df[['loss', 'val_loss']]
loss_curves.plot();

In [None]:
# Report every test score under the name Keras itself assigns to it. Positional
# unpacking printed the second value as "absolute error" no matter which metric
# the model had actually been compiled with.
test_scores = model.evaluate(X_test, y_test, return_dict=True)
for score_name, score_value in test_scores.items():
    print(score_name, "is :", score_value)


# Adding time series features (lag and date)

In [None]:
## preparing the training data
# Sort by id and date first so that shift/diff inside each id group really look
# at that id's previous recorded day(s).
data2 = temp_df.sort_values(['id', 'date']).copy()

# Lag features: the previous recorded day's hours, and how much that value
# changed compared with the day before it.
data2['last_day_onlinehours'] = data2.groupby(['id'])['online_hours'].shift(1)
data2['last_day_diff'] = data2.groupby(['id'])['last_day_onlinehours'].diff()
# data2['day']= temp_df['date'].dt.dayofyear

# online_hours is the target, so it must not remain among the features;
# leaving it in lets the model read the answer straight from its inputs.
X2 = data2.drop(['id', 'date', 'day_name', 'online_hours'], axis=1)
y2 = data2['online_hours']

# shuffled 80/20 split, seeded like the baseline split for reproducibility
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=2021)

# The lag columns are NaN for the first rows of every id. Impute BOTH splits,
# using medians computed from the training split only, so the test rows carry
# no NaNs and no test information reaches the training data.
train_medians = X2_train.median().round(1)
X2_train = X2_train.fillna(train_medians)
X2_test = X2_test.fillna(train_medians)


In [None]:
# Column names, dtypes and non-null counts of the lag-augmented feature frame.
X2.info()

In [None]:
# Preview the first rows of the lag-augmented feature frame.
X2.head()

In [None]:
# scaling
# A fresh scaler for the lag-augmented features: ranges come from the training
# split and are then applied to both splits.
scaler = MinMaxScaler().fit(X2_train)
X2_train = scaler.transform(X2_train)
X2_test = scaler.transform(X2_test)

In [None]:
# LightGBM regressor on the lag-augmented features; the value printed below is
# the mean squared error on the test split.
lgbm_options = dict(n_estimators=1000, objective='regression', importance_type='gain')
model = LGBMRegressor(**lgbm_options)
model.fit(X2_train, y2_train)

preds2 = model.predict(X2_test)
error = mean_squared_error(y2_test, preds2)
print('score for normal regression model this im using time lag feature = ' , error)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Same small feed-forward regressor as before, now on the lag-augmented features.
input_shape = [X2_train.shape[1]]
model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(20, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(8, activation='relu'),
    # One linear output unit: the target is a single number per row
    # (the previous 10-unit output layer did not match the scalar target).
    layers.Dense(1),
])

model.compile(
    loss='mse',
    optimizer='adam',
    # The loss already is the MSE; tracking MAE as the metric makes
    # model.evaluate() return (mse, mae) instead of the MSE twice.
    metrics=['mean_absolute_error'],
)

# Stop once the monitored validation loss has not improved by at least 0.01 for
# 5 epochs, and restore the best weights. Named *_cb so it does not shadow the
# `early_stopping` function imported from lightgbm at the top of the notebook.
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.01,
    restore_best_weights=True,
)
history = model.fit(
    X2_train, y2_train,
    validation_split=0.2,  # last 20% of the training data is held out for validation
    batch_size=512,
    epochs=50,
    callbacks=[early_stopping_cb],
)

In [None]:
# Training vs. validation loss per epoch for the lag-feature network.
history_df = pd.DataFrame(history.history)
loss_curves = history_df[['loss', 'val_loss']]
loss_curves.plot();

In [None]:
# Report every test score under the name Keras itself assigns to it. Positional
# unpacking printed the second value as "absolute error" no matter which metric
# the model had actually been compiled with.
test_scores = model.evaluate(X2_test, y2_test, return_dict=True)
for score_name, score_value in test_scores.items():
    print(score_name, "is :", score_value)

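# **The error when the same model was used, this time with the help of time-series features**
> *The error fell from 6.349302291870117 to 0.407988, which is a tremendous change.*
>
> *Caveat: `X2` above was built without dropping `online_hours`, so the target itself was among the inputs when these numbers were produced. Much of the drop may therefore come from target leakage rather than from the lag features; re-measure with the target removed from the inputs.*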

# ADDING MORE TIMESERIES

In [None]:
# Average online hours across all customers for each calendar date, so the
# rolling window slides over days instead of over individual customer rows
# (and over a single numeric series instead of every column of temp_df).
daily_hours = temp_df.groupby('date')['online_hours'].mean()

moving_average = daily_hours.rolling(
    window=7,       # 7-row window, i.e. one week when every date is present
    center=True,    # puts the average at the center of the window
    min_periods=2,  # emit a value as soon as 2 observations fall inside the window
).mean()            # compute the mean (could also do median, std, min, max, ...)

ax = daily_hours.plot(style=".", color="0.5")
moving_average.plot(
    ax=ax, linewidth=3, title="Online Hours - 7-Day Moving Average");

In [None]:
# Preview of the frame used for the seasonal plots below.
temp_df.head()

In [None]:
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'


def seasonal_plot(X, y, period, freq, ax=None):
    """Plot ``y`` against ``freq`` with one coloured line per value of ``period``.

    Parameters
    ----------
    X : DataFrame holding the ``y``, ``period`` and ``freq`` columns.
    y : name of the column drawn on the y axis.
    period : name of the column whose distinct values each get their own line.
    freq : name of the column drawn on the x axis.
    ax : matplotlib Axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The Axes the lines were drawn on. Each line is labelled at the right edge of
    the axes with a value of ``period``, pairing lines and values in the order
    ``ax.lines`` and ``X[period].unique()`` provide them.
    """
    if ax is None:
        ax = plt.subplots()[1]

    # One distinct hue per period value.
    n_periods = X[period].nunique()
    colors = sns.color_palette("husl", n_colors=n_periods,)

    ax = sns.lineplot(
        data=X,
        x=freq,
        y=y,
        hue=period,
        palette=colors,
        ci=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")

    # Instead of a legend, write each label next to the end of its line:
    # x = 1 in axes coordinates (right edge), y = the line's last data value.
    period_labels = X[period].unique()
    for curve, label in zip(ax.lines, period_labels):
        last_y = curve.get_ydata()[-1]
        ax.annotate(
            label,
            xy=(1, last_y),
            xytext=(6, 0),
            color=curve.get_color(),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            size=14,
            va="center",
        )
    return ax

# Two stacked seasonal plots of online_hours, one line per week:
# against day of month (top) and against day of week (bottom).
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(11, 6))

weekly_lines = dict(y="online_hours", period="week")
seasonal_plot(temp_df, freq="day", ax=ax0, **weekly_lines)
seasonal_plot(temp_df, freq="dayofweek", ax=ax1, **weekly_lines)