In [0]:
from pyspark.sql.functions import *
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import pmdarima as pm
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
import statsmodels.api as sm
from scipy import stats
from datetime import date, timedelta, datetime
from pyspark.sql.types import  StructType, StructField, StringType, DateType, FloatType,DoubleType 

#### data

In [0]:
# Load the per-token table from the metastore.
# Uses `spark` (the SparkSession that this notebook already relies on when it
# builds the output DataFrame) instead of the legacy `sqlContext` entry point.
data = spark.table("users_db.info_token")

#### Les fonctions

In [0]:
# #mise à jour les données pour tenir en compte l'aggregation que j'ai fait initialement 
# def updating_data(data):
#   data=data.groupby("token").agg(min("date_premiere_cap").alias("date_premiere_cap"),
#                                 max("date_derniere_cap").alias("date_derniere_cap"),
#                                 sum("Age_token_mois").alias("Age_token_mois"),
#                                  sum("nombre_trajet").alias("nombre_trajet"),
#                                  sum("nombre_km").alias("nombre_km"),
#                                  sum("nombre_nodes").alias("nombre_nodes"),
#                                  max("temps_inactivite_max_entre_2trajets").alias("temps_inactivite_max_entre_2trajets"),
#                                  sum("temps_inactivite_mois").alias("temps_inactivite_mois"),
#                                  first("dpt").alias("dpt"))

#   return data

def add_columns(data):
  """Add the profil_utilisateur, confinement and recrutement columns.

  Args:
    data: Spark DataFrame exposing `temps_inactivite_mois` and
      `date_premiere_cap`.

  Returns:
    A new Spark DataFrame with three extra columns:
      * profil_utilisateur: 0 (active) when temps_inactivite_mois < 5, else 1 (inactive).
      * confinement: True when date_premiere_cap falls inside one of the three
        lockdown windows below (bounds inclusive), else False.
      * recrutement: 1 when date_premiere_cap contains one of the listed
        "YYYY-MM" prefixes, else 0.
  """
  # NOTE(review): the comparisons and `contains` below are made against string
  # literals; presumably date_premiere_cap is an ISO date or date string - confirm.
  first_seen = data.date_premiere_cap

  lockdown_windows = [
      ("2020-03-17", "2020-05-10"),
      ("2020-10-30", "2020-12-15"),
      ("2021-04-03", "2021-05-03"),
  ]
  in_lockdown = None
  for window_start, window_end in lockdown_windows:
    inside = (first_seen >= window_start) & (first_seen <= window_end)
    # Column objects have no truth value, hence the explicit `is None` test.
    in_lockdown = inside if in_lockdown is None else (in_lockdown | inside)

  recruitment_prefixes = [
      "2019-04", "2019-05", "2020-07", "2020-01", "2020-09",
      "2021-09", "2022-05", "2022-08", "2022-09", "2022-10",
  ]
  in_recruitment = None
  for prefix in recruitment_prefixes:
    matches = first_seen.contains(prefix)
    in_recruitment = matches if in_recruitment is None else (in_recruitment | matches)

  with_profile = data.withColumn(
      "profil_utilisateur",
      when(col("temps_inactivite_mois") < 5, 0).otherwise(1))
  with_lockdown = with_profile.withColumn(
      "confinement",
      when(in_lockdown, True).otherwise(False))
  data_processing = with_lockdown.withColumn(
      "recrutement",
      when(in_recruitment, 1).otherwise(0))
  return data_processing

def group_data(df,resample):
  """Aggregate the per-token frame over regular time buckets.

  Args:
    df: pandas DataFrame with a `date_premiere_cap` column (anything
      `pd.to_datetime` can parse) plus the columns aggregated below.
      It is NOT modified: the datetime conversion is done on a derived frame.
    resample: pandas offset alias selecting the bucket size
      (e.g. "D" for day, "W" for week, "M" for month).

  Returns:
    DataFrame indexed by the resampled `date_premiere_cap`, one row per
    bucket. Every column is summed except `token`, which is counted
    (number of rows in the bucket).
  """
  # assign() builds a new frame, so the caller's date column keeps its
  # original dtype instead of being overwritten in place.
  indexed = (df.assign(date_premiere_cap=pd.to_datetime(df['date_premiere_cap']))
               .set_index("date_premiere_cap"))
  aggregations = {
      'nombre_trajet': 'sum',
      # 'nombre_device': 'sum',
      'nombre_km': 'sum',
      'temps_inactivite_max_entre_2trajets': 'sum',
      'Age_token_mois': 'sum',
      'temps_inactivite_mois': 'sum',
      'token': 'count',
      'profil_utilisateur': 'sum',
      'confinement': 'sum',
      'recrutement': 'sum',
  }
  return indexed.resample(resample).agg(aggregations)

def add_token_tendance(df):
  """Add the period-over-period change in token count and a 0/1 trend flag.

  The frame is modified in place and the same object is returned.

  Added columns:
    * token_tend: difference between each row's `token` and the previous row's
      (NaN on the first row).
    * token_tendance: 0 when token_tend is negative, otherwise 1. The first
      row gets 1 because NaN < 0 is False.
  """
  change = df["token"].diff()
  df["token_tend"] = change
  decreased = change < 0
  df['token_tendance'] = np.where(decreased, 0, 1)
  return df

def round_data(data):
  """Round `Age_token_mois` and `nombre_km` to 2 decimals.

  The frame is modified in place and the same object is returned.

  Raises:
    KeyError: if either column is missing.
  """
  # Bracket indexing instead of attribute-style assignment: it always targets
  # a column, and a missing column fails with a KeyError that names it.
  for column in ("Age_token_mois", "nombre_km"):
    data[column] = np.round(data[column], decimals=2)
  return data

In [0]:
# updating_data is disabled (its definition is commented out), so the table is used as loaded.
#data_update=updating_data(data)
# Derive profil_utilisateur / confinement / recrutement on the Spark DataFrame.
data_add_columns=add_columns(data)

In [0]:
# data_add_columns comes from the previous cell; repeating the identical
# add_columns(data) call here was redundant.
# Collect the Spark DataFrame to the driver as a pandas DataFrame.
df_pandas = data_add_columns.toPandas()
# Aggregate to one row per day of date_premiere_cap.
daily_data = group_data(df_pandas, resample="D")

#### add columns

In [0]:
# add_token_tendance returns the very frame it receives, so daily_data also
# carries the new columns after this call.
data_token_tendance = add_token_tendance(daily_data)
# After the daily aggregation these two columns hold sums; collapse them back
# to indicators: 1 if the day's sum is non-zero, 0 otherwise
# (confinement = lockdown, recrutement = recruitment).
for flag_column in ('confinement', 'recrutement'):
  data_token_tendance[flag_column] = np.where(data_token_tendance[flag_column] != 0, 1, 0)

#### data model

In [0]:
# Round Age_token_mois and nombre_km, drop the token count and its difference
# column (token_tendance is kept), and turn the date index back into a column.
data_round = round_data(daily_data)
data_model = data_round.drop(columns=["token", "token_tend"]).reset_index()


#### training data

In [0]:
# Most recent date present in the modelling frame.
dernier_date = data_model["date_premiere_cap"].max()
# Start of the training window: 2 x 365 + 30 days before that date.
two_years_ago = dernier_date - timedelta(days=2 * 365 + 30)

In [0]:
# Keep only the rows on or after the start of the training window.
train = data_model.loc[data_model["date_premiere_cap"] >= two_years_ago]

In [0]:
# Move the date column to the index. Not re-runnable on its own: a second
# run raises KeyError because the column is no longer there - re-run the
# cell that builds `train` first.
train=train.set_index("date_premiere_cap")

In [0]:
#train=data_model.set_index("date_premiere_cap")

In [0]:
# Robust-scale the explanatory columns. X_train is not consumed by the
# univariate model fitted in the following cells.
scaler = RobustScaler()
X_train = scaler.fit_transform(train.drop('profil_utilisateur',axis=1))
# The target is left in its original units. Passing it through
# scaler.fit_transform as well would (a) refit `scaler` on the target and
# discard the feature fit, and (b) make every forecast a scaled value that is
# never inverse-transformed before being rounded, summed and saved as a count.
y_train = train["profil_utilisateur"].values.astype(float)


In [0]:
# Stepwise auto-ARIMA order search on the target series alone (no exogenous regressors).
# NOTE(review): seasonal=True is passed without `m`; with pmdarima's default
# m=1 no seasonal terms are searched. Set `m` (e.g. 7 for a weekly cycle on
# daily data) if seasonality is intended - TODO confirm.
model = pm.auto_arima(
    y_train,
    start_p=0, max_p=5,          # search range for p
    start_q=0, max_q=5,          # search range for q
    max_d=5,                     # upper bound for d
    # test='adf',                # use adftest to find optimal 'd'
    seasonal=True,
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

#### Predictions

In [0]:
# Explanatory columns; only referenced by the commented-out exogenous forecast.
X = train.drop(columns="profil_utilisateur")
# Fit the selected model on the training target; the forecasts in the
# following cells are made from model_fit.
model_fit = model.fit(y_train)

In [0]:
# Forecast one step ahead. The series is daily (resample="D"), so this is the
# day after the last observed date.
prediction_tomorrow = model_fit.predict(n_periods=1)
# Print the prediction, rounded for display only (the stored value is not rounded here).
print("Predicted Value for the Next Day: ", prediction_tomorrow[0].round())

In [0]:
# Forecast 7 daily steps ahead and report their total.
prediction_next_week = model_fit.predict(n_periods=7)
print("Predicted Value for the Next week: ", prediction_next_week.sum())

In [0]:
# Forecast 30 daily steps ahead (a month is taken as 30 days) and report their total.
prediction_next_month = model_fit.predict(n_periods=30)
print("Predicted Value for the Next month: ", prediction_next_month.sum())

In [0]:
# Forecast 150 daily steps ahead (5 x 30 days) and report their total.
prediction_next_5_month = model_fit.predict(n_periods=30*5)
print("Predicted Value for the Next  5 Months: ", prediction_next_5_month.sum())

In [0]:
# Forecast 210 daily steps ahead (7 x 30 days) and report their total.
prediction_next_7_month = model_fit.predict(n_periods=30*7)
# The label now matches the 7-month horizon actually forecast (it read "6 Months").
print("Predicted Value for the Next 7 Months: ", prediction_next_7_month.sum())

In [0]:
# prediction_next_month = model_fit.predict(n_periods=30, exogenous=np.repeat(X.iloc[-1,:].values,30).reshape(30, -1))
# print("Predicted Value for the Next month with exog data: ", prediction_next_month.sum())


#### Create dataFrame

In [0]:
# One-row Spark DataFrame holding today's date in a nullable `Date` column;
# the forecast columns are attached to it afterwards.
schema = StructType([StructField("Date", DateType(), True)])
tmp = [{'Date': date.today()}]
df_tmp = spark.createDataFrame(tmp, schema=schema)

Date
2023-02-08


In [0]:
# Convert the NumPy scalars to built-in Python floats before they are handed to lit().
value1 = float(prediction_tomorrow[0])
value7 = float(prediction_next_week.sum())
value30 = float(prediction_next_month.sum())
value_5_month = float(prediction_next_5_month.sum())
value_7_month = float(prediction_next_7_month.sum())

In [0]:
# Attach each forecast horizon as a constant column rounded to 2 decimals.
# NOTE: `round` here is pyspark.sql.functions.round (brought in by the star
# import at the top of the notebook), not the Python builtin.
# Each column is added exactly once; 'prediction_Next_week' used to be added twice.
df_tmp=(df_tmp.withColumn('prediction_tomorrow' , round(lit(value1),2))
              .withColumn('prediction_Next_week' , round(lit(value7),2))
              .withColumn('prediction_Next_month' , round(lit(value30),2))
              .withColumn('prediction_Next_5_month' , round(lit(value_5_month),2))
              .withColumn('prediction_Next_7_month' , round(lit(value_7_month),2))
         )
                         

In [0]:
# Append this run's single row of forecasts to the Delta table
# users_db.prediction_token; rows already in the table are kept (mode "append").
df_tmp.write.format("delta").mode("append").saveAsTable("users_db.prediction_token")