In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('clustering_spark').getOrCreate()

## Read Hotels Data

In [194]:
hotels_data = spark.read.csv("hotels_data.csv", header = True, inferSchema = True)

## Converts String to Dates 

In [199]:
from pyspark.sql.functions import udf, col
from datetime import datetime
from pyspark.sql.types import DateType, IntegerType

# Converts string to date
def str_to_date(str):    
    return datetime.strptime(str, '%m/%d/%Y %H:%M')

# convert a regular function to pyspark function
udf_strToDate = udf(str_to_date, DateType())

# convert "checking_date" and "snapshot_date" to date types
hotels_data_with_dates = hotels_data.withColumn("checkin_date", udf_strToDate(col("Checkin Date")))
hotels_data_with_dates = hotels_data_with_dates.withColumn("snapshot_date", udf_strToDate(col("Snapshot Date")))

datetime.date(2015, 8, 12)

## Section 6

In [222]:
from pyspark.sql.functions import desc

# group by hotel name and count, take the first 150 hotels with the biggest count 
count_by_hotel_names = hotels_data_with_dates.groupBy('Hotel Name').count().sort(desc('count')).limit(150)

# get a list of the first 150 hotel names 
first_150_hotel_names = count_by_hotel_names.toPandas()['Hotel Name'].tolist()

# filter hotels_data to include records from the 150 hotel names
hotels_150_data = hotels_data_with_dates.filter(col('Hotel Name').isin(first_150_hotel_names))

origin 187848
new 169340


## Section 7

In [227]:
# group by checkin and count, take the first 40 with biggest count
count_by_checkin = hotels_150_data.groupBy('checkin_date').count().sort(desc('count')).limit(40)

# get a list of the most common 40 checkin dates
first_40_checkin = count_by_checkin.toPandas()['checkin_date'].tolist()

# filter hotels data by the 40 most common dates
hotels_by_40_checkin = hotels_150_data.filter(col('checkin_date').isin(first_40_checkin))

## Section 8

In [232]:
unique_hotels_names = hotels_by_40_checkin.select("Hotel Name").distinct()
unique_checkins = hotels_by_40_checkin.select("Checkin Date").distinct()
unique_discount_code = [1, 2, 3, 4]

# creating default data - all combination : checking -hotel - discount code
import sys

combs = []
for x in unique_hotels_names:
    for y in unique_checkins:
        for z in unique_discount_code:
            combs.append([x, y, z, sys.maxsize])
            
new_df = spark.createDataFrame(combs, ["Hotel Name", "Checkin Date", "Discount Code", "Discount Price"])
new_df.head()

[[Column<b'Hotel Name'>, Column<b'Checkin Date'>, 1, 9223372036854775807],
 [Column<b'Hotel Name'>, Column<b'Checkin Date'>, 2, 9223372036854775807],
 [Column<b'Hotel Name'>, Column<b'Checkin Date'>, 3, 9223372036854775807],
 [Column<b'Hotel Name'>, Column<b'Checkin Date'>, 4, 9223372036854775807]]