# Hotel Recommendation System

In [None]:
def get_shape(my_df):
    """Return ``(row_count, column_count)`` for ``my_df``.

    The row count comes from ``my_df.count()`` and the column count from the
    number of entries in ``my_df.dtypes``.
    """
    return (my_df.count(), len(my_df.dtypes))

In [None]:
# Locations of the three input CSV files.
destinations_file = "/mnt/hoteldatamount/destinations.csv"
train_file = "/mnt/hoteldatamount/train.csv"
test_file = "/mnt/hoteldatamount/test.csv"

# Read each file with its first line treated as the header row. No schema is
# supplied or inferred here, so every column is loaded as a string.
destinations = spark.read.csv(destinations_file, header=True)
train = spark.read.csv(train_file, header=True)
test = spark.read.csv(test_file, header=True)

In [None]:
# Print the column names and types of the freshly loaded training data.
train.printSchema()

In [None]:
# Discard every row that has a null in at least one column.
train = train.dropna()

In [None]:
# Names of the raw date/time columns. Later cells read this list to skip these
# columns when casting and to drop them once their parts have been extracted.
# NOTE(review): this name is the same as the standard-library `datetime`
# module; an `import datetime` anywhere in the notebook would clash with it.
datetime = ['date_time', 'srch_ci', 'srch_co']

In [None]:
from pyspark.sql.types import DateType, IntegerType, DatetimeConverter
from pyspark.sql.functions import *

# Date parts to extract, listed in the order the derived columns must appear.
# Each derived column is named "<part>_<source column>", e.g. "month_srch_ci".
calendar_parts = [("month", month), ("year", year), ("day", dayofmonth)]
clock_parts = [("minute", minute), ("hour", hour), ("second", second)]
parts_by_source = [
    ("date_time", calendar_parts + clock_parts),
    ("srch_ci", calendar_parts),
    ("srch_co", calendar_parts),
]

# Build the whole projection up front and apply it with a single select().
# Every withColumn() call adds another projection to the query plan, so adding
# or casting dozens of columns one at a time produces a needlessly large plan.
# The loop names are chosen so they do not overwrite `col` (or any other
# function) brought in by `from pyspark.sql.functions import *`.
#
# Every column that is not a raw date column is cast to an integer.
# NOTE(review): any column holding fractional values loses its fractional
# part in this cast (or may fail to cast under ANSI mode) - confirm that is
# intended for all input columns.
projected = [
    train[column_name].cast(IntegerType()).alias(column_name)
    for column_name in train.columns
    if column_name not in datetime
]
for source_name, parts in parts_by_source:
    for part_label, part_fn in parts:
        projected.append(
            part_fn(train[source_name])
            .cast(IntegerType())
            .alias(part_label + "_" + source_name)
        )

# The raw date columns are not part of the projection, so they are dropped
# here implicitly; only the integer columns and the derived parts remain.
train = train.select(*projected)

In [None]:
# Every column except the prediction target is used as a model input.
features = [name for name in train.columns if name != 'hotel_cluster']
features

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Transformer: pack every input column named in `features` into one vector
# column called "features".
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")
df_temp = vector_assembler.transform(train)
# Drop the scalar input columns now that their values live in the vector.
# Reusing `features` keeps the dropped columns identical to the assembler's
# inputCols, instead of a hand-maintained list of names that can drift from it.
train = df_temp.drop(*features)
from pyspark.ml.feature import StringIndexer
# Estimator: map each distinct hotel_cluster value to a numeric label index
# stored in "labelIndex".
l_indexer = StringIndexer(inputCol="hotel_cluster", outputCol="labelIndex")
train = l_indexer.fit(train).transform(train)
# 70/30 train/test split. The seed is fixed so the same input yields the same
# split on every run, which makes the reported metrics comparable across runs.
(training, testing) = train.randomSplit([0.7, 0.3], seed=42)

## Decision Tree Classifier

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Fit a decision tree on the training split.
dt = DecisionTreeClassifier(labelCol="labelIndex", featuresCol="features")
model = dt.fit(training)
# Predict on the held-out split and preview a few predictions next to the
# true label indices.
predictions = model.transform(testing)
predictions.select("prediction", "labelIndex").show(5)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Accuracy = fraction of held-out rows whose prediction equals labelIndex.
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
# The multiplication must be parenthesised: `%` and `*` have equal precedence
# and associate left to right, so `"..." % accuracy * 100` formats first and
# then repeats the resulting string 100 times. This line reports the accuracy
# as a percentage.
print("Accuracy = %g " % (accuracy * 100))

In [None]:
# Render the training split in the notebook.
# NOTE(review): `display` is presumably supplied by the hosting notebook
# runtime rather than by the DataFrame API itself, and the meaning of the
# argument 2 is not evident from this file - confirm.
training.display(2)

## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# Accuracy metric comparing the predicted index with the true label index.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="labelIndex",
)
# Ten-tree random forest fitted on the training split.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="features",
    labelCol="labelIndex",
)
model = rf.fit(training)
# Score the held-out split and preview the first few predictions.
predictions = model.transform(testing)
predictions.select("prediction", "labelIndex").show(5)
# Report the error rate and the accuracy on the held-out split.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)