# Training

In [1]:
from pyspark.sql import SparkSession

    # build our own SparkSession
spark = (SparkSession
        .builder
        .appName("BigData")
        .config("spark.sql.shuffle.partitions",6)
        .config("spark.sql.repl.eagereval.enabled",True)
        .getOrCreate()
        )

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/07 11:01:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/07 11:01:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/04/07 11:01:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [33]:
df_hotels = spark.read.parquet('small-hotels')

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import os
import sys

import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
# from ydata_profiling import ProfileReport
import warnings
warnings.filterwarnings("ignore")

In [6]:
df_hotels.show(1, vertical=True)

-RECORD 0---------------------------------------
 date_time                | 2014-07-14 17:37:39 
 site_name                | 2                   
 posa_continent           | 3                   
 user_location_country    | 19                  
 user_location_region     | 58                  
 user_location_city       | 1454                
 user_id                  | 336350              
 is_mobile                | 0                   
 is_package               | 0                   
 channel                  | 9                   
 srch_ci                  | 2014-08-11 00:00:00 
 srch_co                  | 2014-08-16 00:00:00 
 srch_adults_cnt          | 2                   
 srch_children_cnt        | 4                   
 srch_rm_cnt              | 2                   
 srch_destination_id      | 8279                
 srch_destination_type_id | 1                   
 is_booking               | 0                   
 cnt                      | 2                   
 hotel_continent    

In [28]:
df_hotel = df_hotels.select("num_nights", "cnt", "srch_rm_cnt", "srch_adults_cnt", "srch_children_cnt", "channel", "is_mobile", "is_booking", "is_package", "user_location_country", "user_location_city", "user_location_region", "Id_hotel", "user_id")

# Encoding

In [29]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Creating StringIndexer objects for each categorical variable
indexer_country = StringIndexer(inputCol="user_location_country", outputCol="country_index")
indexer_city = StringIndexer(inputCol="user_location_city", outputCol="city_index")
indexer_region = StringIndexer(inputCol="user_location_region", outputCol="region_index")
indexer_channel = StringIndexer(inputCol="channel", outputCol="channel_index")

# Fitting the StringIndexer objects on the data
indexer_model_country = indexer_country.fit(df_hotel)
indexer_model_city = indexer_city.fit(df_hotel)
indexer_model_region = indexer_region.fit(df_hotel)
indexer_model_channel = indexer_channel.fit(df_hotel)

# Transforming the data using the StringIndexer objects
df_hotel_indexed = indexer_model_country.transform(df_hotel)
df_hotel_indexed = indexer_model_city.transform(df_hotel_indexed)
df_hotel_indexed = indexer_model_region.transform(df_hotel_indexed)
df_hotel_indexed = indexer_model_channel.transform(df_hotel_indexed)

# Creating OneHotEncoder objects for each categorical variable
encoder_country = OneHotEncoder(inputCol="country_index", outputCol="country_encoded")
encoder_city = OneHotEncoder(inputCol="city_index", outputCol="city_encoded")
encoder_region = OneHotEncoder(inputCol="region_index", outputCol="region_encoded")
encoder_channel = OneHotEncoder(inputCol="channel_index", outputCol="channel_encoded")

# Fitting the OneHotEncoder objects on the data
encoder_model_country = encoder_country.fit(df_hotel_indexed)
encoder_model_city = encoder_city.fit(df_hotel_indexed)
encoder_model_region = encoder_region.fit(df_hotel_indexed)
encoder_model_channel = encoder_channel.fit(df_hotel_indexed)

# Transforming the data using the OneHotEncoder objects
df_hotel_encoded = encoder_model_country.transform(df_hotel_indexed)
df_hotel_encoded = encoder_model_city.transform(df_hotel_encoded)
df_hotel_encoded = encoder_model_region.transform(df_hotel_encoded)
df_hotel_encoded = encoder_model_channel.transform(df_hotel_encoded)

# Displaying the encoded data
df_hotel_encoded.select("user_location_country", "country_encoded", "user_location_city", "city_encoded", "user_location_region", "region_encoded", "channel", "channel_encoded").show()

+---------------------+----------------+------------------+--------------------+--------------------+-----------------+-------+---------------+
|user_location_country| country_encoded|user_location_city|        city_encoded|user_location_region|   region_encoded|channel|channel_encoded|
+---------------------+----------------+------------------+--------------------+--------------------+-----------------+-------+---------------+
|                   19|(217,[43],[1.0])|              1454|(18793,[5945],[1.0])|                  58|(858,[153],[1.0])|      9| (10,[0],[1.0])|
|                   66| (217,[0],[1.0])|             38899| (18793,[599],[1.0])|                 174|  (858,[0],[1.0])|      9| (10,[0],[1.0])|
|                   66| (217,[0],[1.0])|             31674| (18793,[365],[1.0])|                 442|  (858,[3],[1.0])|      0| (10,[1],[1.0])|
|                   66| (217,[0],[1.0])|             35390|   (18793,[8],[1.0])|                 442|  (858,[3],[1.0])|      9| (10,[0],

# Standard Scaler

In [30]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Creating a VectorAssembler to assemble the features into a single vector
vectorAssembler = VectorAssembler(inputCols=["num_nights", "cnt", "srch_rm_cnt", "srch_adults_cnt", "srch_children_cnt"], outputCol="features")
df_hotel = vectorAssembler.transform(df_hotel)

# Creating a StandardScaler object to scale the features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Fitting the StandardScaler on the data
scaler_model = scaler.fit(df_hotel)

# Transforming the data using the StandardScaler
df_hotel_scaled = scaler_model.transform(df_hotel)

# Displaying the scaled data
df_hotel_scaled.select("features", "scaled_features").show()

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[5.0,2.0,2.0,2.0,...|[1.58748856634699...|
|[8.0,1.0,1.0,2.0,...|[2.53998170615518...|
|[2.0,1.0,1.0,2.0,...|[0.63499542653879...|
|[4.0,1.0,1.0,2.0,...|[1.26999085307759...|
|[1.0,1.0,1.0,2.0,...|[0.31749771326939...|
|[2.0,1.0,1.0,2.0,...|[0.63499542653879...|
|[8.0,1.0,1.0,2.0,...|[2.53998170615518...|
|[1.0,1.0,4.0,4.0,...|[0.31749771326939...|
|[1.0,2.0,1.0,2.0,...|[0.31749771326939...|
|[4.0,1.0,1.0,1.0,...|[1.26999085307759...|
|[4.0,2.0,1.0,1.0,...|[1.26999085307759...|
|[3.0,1.0,1.0,1.0,...|[0.95249313980819...|
|[2.0,1.0,1.0,1.0,...|[0.63499542653879...|
|[2.0,1.0,1.0,1.0,...|[0.63499542653879...|
|[1.0,1.0,1.0,1.0,...|[0.31749771326939...|
|[5.0,1.0,2.0,2.0,...|[1.58748856634699...|
|[5.0,1.0,1.0,2.0,...|[1.58748856634699...|
|[4.0,4.0,1.0,3.0,...|[1.26999085307759...|
|[4.0,1.0,1.0,1.0,...|[1.26999085307759...|
|[4.0,1.0,1.0,1.0,...|[1.2699908

## Doing Everything

In [35]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

# Creating StringIndexer objects for each categorical variable
indexer_country = StringIndexer(inputCol="user_location_country", outputCol="country_index")
indexer_city = StringIndexer(inputCol="user_location_city", outputCol="city_index")
indexer_region = StringIndexer(inputCol="user_location_region", outputCol="region_index")
indexer_channel = StringIndexer(inputCol="channel", outputCol="channel_index")

# Creating OneHotEncoder objects for each categorical variable
encoder_country = OneHotEncoder(inputCol="country_index", outputCol="country_encoded")
encoder_city = OneHotEncoder(inputCol="city_index", outputCol="city_encoded")
encoder_region = OneHotEncoder(inputCol="region_index", outputCol="region_encoded")
encoder_channel = OneHotEncoder(inputCol="channel_index", outputCol="channel_encoded")

# Creating a VectorAssembler to assemble the features into a single vector
vectorAssembler = VectorAssembler(inputCols=["num_nights", "cnt", "srch_rm_cnt", "srch_adults_cnt", "srch_children_cnt"], outputCol="assembled_features")
df_hotel_assembled = vectorAssembler.transform(df_hotel_encoded)

# Creating a StandardScaler object to scale the features
scaler = StandardScaler(inputCol="assembled_features", outputCol="scaled_features")

# Fitting the StandardScaler on the data
scaler_model = scaler.fit(df_hotel_assembled)

# Transforming the data using the StandardScaler
df_hotel_scaled = scaler_model.transform(df_hotel_assembled)

# Displaying the scaled data
df_hotel_scaled.select("assembled_features", "scaled_features").show()

+--------------------+--------------------+
|  assembled_features|     scaled_features|
+--------------------+--------------------+
|[5.0,2.0,2.0,2.0,...|[1.58748856634699...|
|[8.0,1.0,1.0,2.0,...|[2.53998170615518...|
|[2.0,1.0,1.0,2.0,...|[0.63499542653879...|
|[4.0,1.0,1.0,2.0,...|[1.26999085307759...|
|[1.0,1.0,1.0,2.0,...|[0.31749771326939...|
|[2.0,1.0,1.0,2.0,...|[0.63499542653879...|
|[8.0,1.0,1.0,2.0,...|[2.53998170615518...|
|[1.0,1.0,4.0,4.0,...|[0.31749771326939...|
|[1.0,2.0,1.0,2.0,...|[0.31749771326939...|
|[4.0,1.0,1.0,1.0,...|[1.26999085307759...|
|[4.0,2.0,1.0,1.0,...|[1.26999085307759...|
|[3.0,1.0,1.0,1.0,...|[0.95249313980819...|
|[2.0,1.0,1.0,1.0,...|[0.63499542653879...|
|[2.0,1.0,1.0,1.0,...|[0.63499542653879...|
|[1.0,1.0,1.0,1.0,...|[0.31749771326939...|
|[5.0,1.0,2.0,2.0,...|[1.58748856634699...|
|[5.0,1.0,1.0,2.0,...|[1.58748856634699...|
|[4.0,4.0,1.0,3.0,...|[1.26999085307759...|
|[4.0,1.0,1.0,1.0,...|[1.26999085307759...|
|[4.0,1.0,1.0,1.0,...|[1.2699908

# Training 

In [40]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.recommendation import ALS

# Criando um vetor com os atributos que serão usados para fazer as recomendações
vectorAssembler = VectorAssembler(inputCols=["num_nights", "cnt", "srch_rm_cnt", "srch_adults_cnt", "srch_children_cnt", "channel", "is_mobile", "is_booking", "is_package", "user_location_country", "user_location_city", "user_location_region", "scaled_features"], outputCol="features")
df_hotels = vectorAssembler.transform(df_hotel_scaled)

# Separando os dados em conjuntos de treinamento e teste
#(training, test) = df_hotels.randomSplit([0.8, 0.2])

training = df_hotels.sampleBy("is_booking", fractions={0: 0.8, 1: 0.8}, seed=42)
test = df_hotels.subtract(training)


# Configurando o modelo ALS (Alternating Least Squares) para fazer as recomendações
als = ALS(userCol="user_id", itemCol="Id_hotel", ratingCol="is_booking", coldStartStrategy="drop")
model = als.fit(training)

recommendations = model.recommendForAllUsers(10)

In [46]:
recommendations.filter(recommendations.user_id == 336709).show(truncate=False)

+-------+--------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                               |
+-------+--------------------------------------------------------------------------------------------------------------+
|336709 |[{0, 0.0}, {10, 0.0}, {20, 0.0}, {30, 0.0}, {40, 0.0}, {50, 0.0}, {60, 0.0}, {-319, 0.0}, {1, 0.0}, {11, 0.0}]|
+-------+--------------------------------------------------------------------------------------------------------------+



                                                                                

In [59]:
df_hotels.show(2,vertical=True)

-RECORD 0-------------------------------------
 num_nights            | 5                    
 cnt                   | 2                    
 srch_rm_cnt           | 2                    
 srch_adults_cnt       | 2                    
 srch_children_cnt     | 4                    
 channel               | 9                    
 is_mobile             | 0                    
 is_booking            | 0                    
 is_package            | 0                    
 user_location_country | 19                   
 user_location_city    | 1454                 
 user_location_region  | 58                   
 Id_hotel              | 5                    
 user_id               | 336350               
 country_index         | 43.0                 
 city_index            | 5945.0               
 region_index          | 153.0                
 channel_index         | 0.0                  
 country_encoded       | (217,[43],[1.0])     
 city_encoded          | (18793,[5945],[1.0]) 
 region_encod

----------------------------

# Interpretando Output

# Using the model with new user

In [47]:
from pyspark.sql import Row

# criar um novo DataFrame para um novo usuário
new_user = Row(user_id=999, num_nights=3, cnt=2, srch_rm_cnt=1, srch_adults_cnt=2, srch_children_cnt=0,
               channel=1, is_mobile=0, is_booking=0, is_package=1, user_location_country=50, user_location_city=1234,
               user_location_region=123)

new_user_df = spark.createDataFrame([new_user])

# gerar recomendações para o novo usuário
new_user_recs = model.recommendForUserSubset(new_user_df, 10)
new_user_recs.show()

+-------+---------------+
|user_id|recommendations|
+-------+---------------+
+-------+---------------+



# Validar o modelo

## Accuracy 

In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col


predictions = model.transform(test)
predictions = predictions.withColumn("prediction", col("prediction").cast("double"))
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="is_booking", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2%}".format(accuracy))


23/04/07 12:33:00 WARN DAGScheduler: Broadcasting large task binary with size 1586.6 KiB
23/04/07 12:33:00 WARN DAGScheduler: Broadcasting large task binary with size 1608.1 KiB


                                                                                

23/04/07 12:33:02 WARN DAGScheduler: Broadcasting large task binary with size 1781.2 KiB


[Stage 696:>                                                        (0 + 6) / 6]

23/04/07 12:33:04 WARN DAGScheduler: Broadcasting large task binary with size 1657.6 KiB
23/04/07 12:33:04 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB
Accuracy: 81.62%


                                                                                

## Precision Recall F1

In [54]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Avalia as previsões do modelo nos dados de teste
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="is_booking")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

23/04/07 12:35:28 WARN DAGScheduler: Broadcasting large task binary with size 1586.6 KiB
23/04/07 12:35:28 WARN DAGScheduler: Broadcasting large task binary with size 1608.1 KiB


                                                                                

23/04/07 12:35:30 WARN DAGScheduler: Broadcasting large task binary with size 1781.2 KiB




23/04/07 12:35:32 WARN DAGScheduler: Broadcasting large task binary with size 1657.6 KiB
23/04/07 12:35:32 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB


                                                                                

23/04/07 12:35:32 WARN DAGScheduler: Broadcasting large task binary with size 1586.6 KiB
23/04/07 12:35:32 WARN DAGScheduler: Broadcasting large task binary with size 1608.1 KiB


                                                                                

23/04/07 12:35:33 WARN DAGScheduler: Broadcasting large task binary with size 1781.2 KiB


[Stage 872:>                                                        (0 + 6) / 6]

23/04/07 12:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1657.6 KiB
23/04/07 12:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB


                                                                                

23/04/07 12:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1586.6 KiB
23/04/07 12:35:35 WARN DAGScheduler: Broadcasting large task binary with size 1608.1 KiB
23/04/07 12:35:36 WARN DAGScheduler: Broadcasting large task binary with size 1781.2 KiB


[Stage 960:>                                                        (0 + 6) / 6]

23/04/07 12:35:37 WARN DAGScheduler: Broadcasting large task binary with size 1657.6 KiB
23/04/07 12:35:38 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB


                                                                                

23/04/07 12:35:38 WARN DAGScheduler: Broadcasting large task binary with size 1586.6 KiB
23/04/07 12:35:38 WARN DAGScheduler: Broadcasting large task binary with size 1608.1 KiB


                                                                                

23/04/07 12:35:39 WARN DAGScheduler: Broadcasting large task binary with size 1781.2 KiB


[Stage 1048:>                                                       (0 + 6) / 6]

23/04/07 12:35:40 WARN DAGScheduler: Broadcasting large task binary with size 1657.6 KiB
23/04/07 12:35:40 WARN DAGScheduler: Broadcasting large task binary with size 1722.2 KiB


                                                                                

In [56]:
print("Precision: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("f1_score", f1_score)

Precision:  0.8161745233050848
Precision:  0.85947772305961
Recall:  0.8161745233050848
f1_score 0.8372665896893304


In [17]:
# Importando as bibliotecas necessárias
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.recommendation import ALS


# Criando um vetor com os atributos que serão usados para fazer as recomendações
vectorAssembler = VectorAssembler(inputCols=["num_nights", "cnt", "srch_rm_cnt", "srch_adults_cnt", "srch_children_cnt", "channel","is_mobile", "is_booking", "is_package", "user_location_country", "user_location_city", "user_location_region"], outputCol="features")
df_hotels = vectorAssembler.transform(df_hotel)

In [10]:
# Separando os dados em conjuntos de treinamento e teste
(training, test) = df_hotels.randomSplit([0.8, 0.2])


# Configurando o modelo ALS (Alternating Least Squares) para fazer as recomendações
als = ALS(userCol="user_id", itemCol="Id_hotel", ratingCol="is_booking", coldStartStrategy="drop")
model = als.fit(training)

                                                                                

23/04/07 11:11:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/07 11:11:54 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/04/07 11:11:54 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [11]:
recommendations = model.recommendForAllUsers(10) #Não usem esta

In [15]:
recommendations.filter(recommendations.user_id == 336350).show(truncate=False)



+-------+----------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                 |
+-------+----------------------------------------------------------------------------------------------------------------+
|336350 |[{-260, 0.0}, {0, 0.0}, {10, 0.0}, {20, 0.0}, {30, 0.0}, {40, 0.0}, {50, 0.0}, {60, 0.0}, {-309, 0.0}, {1, 0.0}]|
+-------+----------------------------------------------------------------------------------------------------------------+



                                                                                

In [14]:
df_hotels.show(5,vertical=True)

-RECORD 0-------------------------------------
 num_nights            | 5                    
 cnt                   | 2                    
 srch_rm_cnt           | 2                    
 srch_adults_cnt       | 2                    
 srch_children_cnt     | 4                    
 channel               | 9                    
 is_mobile             | 0                    
 is_booking            | 0                    
 is_package            | 0                    
 user_location_country | 19                   
 user_location_city    | 1454                 
 Id_hotel              | 5                    
 user_location_region  | 58                   
 user_id               | 336350               
 features              | [5.0,2.0,2.0,2.0,... 
-RECORD 1-------------------------------------
 num_nights            | 8                    
 cnt                   | 1                    
 srch_rm_cnt           | 1                    
 srch_adults_cnt       | 2                    
 srch_childre

                                                                                

- Como Normailzar os Dados
- Como validar o modelo, calcular performance
- Como ter a lista de hoteis recomendados
- Tirar cnt 