In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('clustering_spark').getOrCreate()

## Read Hotels Data

In [6]:
hotels_data = spark.read.csv("hotels_data.csv", header = True, inferSchema = True)

## Converts String to Dates 

In [7]:
from pyspark.sql.functions import udf, col
from datetime import datetime
from pyspark.sql.types import DateType, IntegerType

# Converts string to date
def str_to_date(str):    
    return datetime.strptime(str, '%m/%d/%Y %H:%M')

# convert a regular function to pyspark function
udf_strToDate = udf(str_to_date, DateType())

# convert "checking_date" and "snapshot_date" to date types
hotels_data_with_dates = hotels_data.withColumn("checkin_date", udf_strToDate(col("Checkin Date")))
hotels_data_with_dates = hotels_data_with_dates.withColumn("snapshot_date", udf_strToDate(col("Snapshot Date")))

## Section 6

In [9]:
from pyspark.sql.functions import desc

# group by hotel name and count, take the first 150 hotels with the biggest count 
count_by_hotel_names = hotels_data_with_dates.groupBy('Hotel Name').count().sort(desc('count')).limit(150)

# get a list of the first 150 hotel names 
first_150_hotel_names = count_by_hotel_names.toPandas()['Hotel Name'].tolist()

# filter hotels_data to include records from the 150 hotel names
hotels_150_data = hotels_data_with_dates.filter(col('Hotel Name').isin(first_150_hotel_names))

## Section 7

In [10]:
# group by checkin and count, take the first 40 with biggest count
count_by_checkin = hotels_150_data.groupBy('checkin_date').count().sort(desc('count')).limit(40)

# get a list of the most common 40 checkin dates
first_40_checkin = count_by_checkin.toPandas()['checkin_date'].tolist()

# filter hotels data by the 40 most common dates
hotels_by_40_checkin = hotels_150_data.filter(col('checkin_date').isin(first_40_checkin))

## Section 8

In [31]:
#creating unique list for Hotel Name - Checkin Date - Discount code dummy combination generating 
unique_hotels_names = hotels_by_40_checkin.select('Hotel Name').distinct().collect()
unique_hotels_names_list = [(row['Hotel Name']) for row in unique_hotels_names]

unique_checkins =  hotels_by_40_checkin.select("checkin_date").distinct().collect()
unique_checkins_list = [(row['checkin_date']) for row in unique_checkins]

unique_discount_code =  [1,2,3,4]

# Create records with max price for each discount code for each date
synth_data = []
import sys
for x in unique_hotels_names_list:
    for y in unique_checkins_list:
        for z in unique_discount_code:
            synth_data.append([x, y ,z, sys.maxsize])

#Making the schema of synth_data
from pyspark.sql.types import *
cSchema = StructType([StructField("Hotel Name", StringType()),StructField("checkin_date", DateType()),StructField("Discount Code",  IntegerType()),StructField("min(Discount Price)", LongType())])

#Creating dummy df
dummy_df = spark.createDataFrame(synth_data, schema=cSchema)

sliced_df = hotels_by_40_checkin.select('Hotel Name', 'checkin_date','Discount Code', 'Discount Price')

# joining dummy data with grouped data 
hotel_chekin_discountCode = sliced_df.union(dummy_df)

# group by Checkin - Hotel - Discount Code
hotel_chekin_discountCode = hotel_chekin_discountCode.groupBy('Hotel name','checkin_date', 'Discount Code').min('Discount Price')

#replacing sys.max with -1 
# hotel_chekin_discountCode = hotel_chekin_discountCode.replace(sys.maxsize, -1)

#sorting date
# hotel_chekin_discountCode = hotel_chekin_discountCode.orderBy(['Hotel name','checkin_date','Discount Code'])




In [42]:
hotel_chekin_discountCode_ToMinus1 = hotel_chekin_discountCode.replace(sys.maxsize, -1)
#Partition data by hotel name
hotelsPartitioned =  hotel_chekin_discountCode.repartition("Hotel name")




+--------------------+------------+-------------+-------------------+
|          Hotel name|checkin_date|Discount Code|min(Discount Price)|
+--------------------+------------+-------------+-------------------+
|Super 8 Brooklyn ...|  2015-10-30|            4|                 -1|
|Four Seasons Hote...|  2015-11-18|            1|                 -1|
|Dumont NYC-an Aff...|  2015-11-28|            2|                 -1|
|Dumont NYC-an Aff...|  2015-08-12|            3|                 -1|
|Hampton Inn Manha...|  2015-09-11|            4|                 -1|
|Roxy Hotel Tribec...|  2015-08-12|            3|                 -1|
| The Kitano New York|  2015-11-06|            2|                 -1|
|Courtyard by Marr...|  2015-11-28|            4|                 -1|
|Hilton Garden Inn...|  2015-08-27|            2|                 -1|
|Baccarat Hotel an...|  2015-10-28|            1|                 -1|
|Baccarat Hotel an...|  2015-10-21|            4|                 -1|
|Baccarat Hotel an..

## Normalizing data


In [12]:
#split to two groups: one with price of -1, one with greater than -1
groupMinus = hotel_chekin_discountCode.filter(col('min(Discount Price)') == -1)
groupGreater = hotel_chekin_discountCode.filter(col('min(Discount Price)') > -1)

In [47]:
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func

groupGreater.createOrReplaceTempView("normalization")

#Creating SQLContext for SQL converting
sqlContext = SQLContext(spark)
    
#Preparing data for normalization
dataFrame = sqlContext.table("normalization")
#Partioning data for groups so we can applay function on groups
windowSpec = Window.partitionBy(groupGreater['Hotel name'])  


#Normalize function
normalize =  ((dataFrame['min(Discount Price)'] - func.min(dataFrame['min(Discount Price)']).over(windowSpec)) / (func.max(dataFrame['min(Discount Price)']).over(windowSpec) - func.min(dataFrame['min(Discount Price)']).over(windowSpec)) * 100)

normalized_df = dataFrame.select(
  dataFrame['Hotel Name'],
  dataFrame['checkin_date'],
  dataFrame['Discount Code'],
  normalize.alias("Normal"))


+------------+------------+-------------+------+
|  Hotel Name|checkin_date|Discount Code|Normal|
+------------+------------+-------------+------+
|Aloft Harlem|  2015-08-12|            1| 100.0|
|Aloft Harlem|  2015-08-12|            2| 100.0|
|Aloft Harlem|  2015-08-12|            3| 100.0|
|Aloft Harlem|  2015-08-12|            4| 100.0|
|Aloft Harlem|  2015-08-13|            1| 100.0|
|Aloft Harlem|  2015-08-13|            2| 100.0|
|Aloft Harlem|  2015-08-13|            3| 100.0|
|Aloft Harlem|  2015-08-13|            4| 100.0|
|Aloft Harlem|  2015-08-19|            1| 100.0|
|Aloft Harlem|  2015-08-19|            2| 100.0|
|Aloft Harlem|  2015-08-19|            3| 100.0|
|Aloft Harlem|  2015-08-19|            4| 100.0|
|Aloft Harlem|  2015-08-26|            1| 100.0|
|Aloft Harlem|  2015-08-26|            2| 100.0|
|Aloft Harlem|  2015-08-26|            3| 100.0|
|Aloft Harlem|  2015-08-26|            4| 100.0|
|Aloft Harlem|  2015-08-27|            1| 100.0|
|Aloft Harlem|  2015

### Joining back -1 values

In [48]:

#Changing column name for same schema
groupMinus = groupMinus.withColumnRenamed('min(Discount Price)','Normal')

#Union all data frames with sorting
normalized_df = normalized_df.union(groupMinus)
normalized_df = normalized_df.orderBy(['Hotel name','checkin_date','Discount Code'])


In [95]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

normal_toList = normalized_df.groupBy('Hotel name').agg(F.collect_list('Normal').alias("Normal"))
#normal_toList.select("Normal").flatMap(lambda x : x)

#parsing to df 
all_hotels_df =  normal_toList.select([normal_toList["Hotel name"]] +  [normal_toList.Normal[i] for i in range(160)])

In [97]:
from pyspark.ml.clustering import BisectingKMeans
dataset = spark.read.format("libsvm").load(all_hotels_df)

AttributeError: 'DataFrame' object has no attribute '_get_object_id'

In [103]:
l = [('Alice', 1)]
df = spark.createDataFrame(l, ['name', 'features']).collect()


In [104]:
# Trains a bisecting k-means model.
from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(2).setSeed(1)


model = bkm.fit(df)

# Evaluate clustering.
cost = model.computeCost(df)
print("Within Set Sum of Squared Errors = " + str(cost))

# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

AttributeError: 'list' object has no attribute '_jdf'

In [None]:
from numpy import array

from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
parsedData = normal_toList.select("Normal").rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

model = BisectingKMeans.train(parsedData, 2, maxIterations=5)
cost = model.computeCost(parsedData)
print("Bisecting K-means Cost = " + str(cost))

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------