# Connect to Hive

In [1]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = "team23"

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

sc = spark.sparkContext

In [2]:
spark

# list Hive databases

In [2]:
print(spark.catalog.listDatabases())
spark.sql("SHOW DATABASES;").show()

[Database(name='default', description='Default Hive database', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/apps/hive/warehouse'), Database(name='root_db', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/root/root_db'), Database(name='team0_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team0/project/hive/warehouse'), Database(name='team12_hive_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team12/project/hive/warehouse'), Database(name='team13_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team13/project/hive/warehouse'), Database(name='team14_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team14/project/hive/warehouse'), Database(name='team15_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team15/project/hive/warehouse'), Database(name='team16_projectdb', description

In [3]:
spark.sql("USE team23_projectdb;")
spark.sql("SHOW TABLES;").show()

+----------------+----------------+-----------+
|       namespace|       tableName|isTemporary|
+----------------+----------------+-----------+
|team23_projectdb|airbnb_part_buck|      false|
|team23_projectdb|      q1_results|      false|
|team23_projectdb|      q2_results|      false|
|team23_projectdb|      q3_results|      false|
|team23_projectdb|      q4_results|      false|
|team23_projectdb|      q5_results|      false|
+----------------+----------------+-----------+



# Specify the input and output features

In [4]:
# We will use the following features
# Excluded 'thumbnail_url' and 'id' because it has no valuable information to extract
# Exclude? host-related attributes since they contain little information about listing itself
# Excluded 'first_review', 'host_response_rate', 'last_review' 'review_scores_rating'  because of large amount of Null values
features = ['property_type', 'room_type', 'amenities', 'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',\
            'cleaning_fee', 'city', 'description', 'host_has_profile_pic', 'host_identity_verified','host_since',\
            'instant_bookable', 'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews', 'zipcode', 'beds', 'bedrooms']

# The output/target of our model
label = 'log_price'

# Read hive tables

In [5]:
# make display fancy
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [6]:
airbnb = spark.read.format("avro").table('team23_projectdb.airbnb_part_buck')
airbnb

id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,description,first_review,host_has_profile_pic,host_identity_verified,host_response_rate,host_since,instant_bookable,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,beds,bedrooms
17761581,5.159055299214528,Apartment,Entire home/apt,"{""Cable TV"",""Wire...",6,1.0,Real Bed,strict,False,Boston,Located in the he...,2016-01-01,True,False,54.0,2015-12-18,False,2017-07-25,42.35481432026883,-71.0746523547689,New Beacon Back B...,Back Bay,35,91.0,https://a0.muscac...,2116.0,2.0,2
11106074,5.1647859739235145,Apartment,Entire home/apt,"{Internet,""Wirele...",5,1.0,Real Bed,strict,True,NYC,My place is close...,2017-04-30,True,True,90.0,2012-09-22,False,2017-09-11,40.68727263224503,-73.985738203789,Stylish centrally...,Boerum Hill,16,93.0,https://a0.muscac...,11201.0,2.0,2
17173284,5.1929568508902095,Apartment,Entire home/apt,"{Internet,""Wirele...",4,1.0,Real Bed,moderate,False,LA,two bedrooms with...,2013-03-15,True,False,100.0,2013-01-29,False,2017-04-15,34.09230262454207,-118.2722475650489,Amazing View in S...,Silver Lake,84,97.0,https://a0.muscac...,90026.0,2.0,2
7294049,5.703782474656202,Townhouse,Entire home/apt,"{""Wireless Intern...",7,1.0,Real Bed,strict,True,NYC,Clean and simple ...,,True,True,100.0,2012-12-13,False,,40.74020108385536,-73.99899971053048,Clean + Simple Ch...,Chelsea,0,,https://a0.muscac...,10011.0,6.0,2
16738143,5.5174528964647065,Apartment,Entire home/apt,"{""Wireless Intern...",5,1.0,Real Bed,moderate,True,SF,Large living/play...,2017-07-31,True,True,100.0,2013-08-15,False,2017-08-11,37.77213223870029,-122.43379710422008,Family and kid fr...,Lower Haight,4,90.0,https://a0.muscac...,94117.0,3.0,2
15421244,5.796057750765373,House,Entire home/apt,"{TV,""Cable TV"",In...",4,2.0,Real Bed,strict,True,LA,"Built in 1925, th...",,True,True,100.0,2011-06-11,False,,34.09847643495662,-118.25521367750434,Silver Lake Hillt...,Silver Lake,0,,https://a0.muscac...,90039.0,2.0,2
15266605,5.5606816310155285,Apartment,Entire home/apt,"{TV,Internet,""Wir...",3,1.0,Real Bed,strict,True,NYC,Ideal for a coupl...,2015-07-24,True,True,60.0,2013-09-18,False,2017-07-31,40.78985910942261,-73.9739339272807,Bright/Large 1BR ...,Upper West Side,12,89.0,,10024.0,2.0,2
19608383,5.416100402204419,House,Entire home/apt,"{TV,Internet,""Wir...",6,1.0,Real Bed,strict,True,LA,You’ll love my pl...,2016-07-25,True,True,100.0,2015-01-04,False,2016-12-31,33.84448064632771,-118.38276225549444,"Charming Home, Pr...",Redondo Beach,4,95.0,https://a0.muscac...,90277.0,6.0,2
15805757,5.247024072160486,Apartment,Entire home/apt,"{TV,Internet,""Wir...",5,2.0,Real Bed,strict,True,LA,Spacious West Hol...,2015-08-01,True,True,100.0,2015-07-22,True,2017-04-16,34.08837249233463,-118.37471799030315,Cali Character in...,West Hollywood,47,90.0,https://a0.muscac...,90069.0,2.0,2
6603860,5.700443573390688,Apartment,Entire home/apt,"{TV,""Cable TV"",In...",2,1.5,Real Bed,moderate,True,SF,Come visit San Fr...,,True,True,,2012-07-13,False,,37.79223214047678,-122.41878698332764,Spacious 2BR apar...,Nob Hill,0,,https://a0.muscac...,94109.0,2.0,2


In [8]:
# calculate uniques for a column (unused)
def uniques(df, col):
    return list(map(lambda x: x[col], airbnb.select(col).distinct().collect()))

# count null values across columns
def cnull(df):
    return {col:df.filter(df[col].isNull()).count() for col in df.columns}

display(cnull(airbnb))
airbnb.printSchema()

Py4JJavaError: An error occurred while calling o117.count.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
	at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
	at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:389)
	at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult(Tasks.scala:423)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult$(Tasks.scala:416)
	at scala.collection.parallel.ForkJoinTaskSupport.executeAndWaitResult(TaskSupport.scala:60)
	at scala.collection.parallel.ParIterableLike$ResultMapping.leaf(ParIterableLike.scala:968)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$ResultMapping.tryLeaf(ParIterableLike.scala:963)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
			at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
			at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
			at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
			at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
			at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
			at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
			at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
			at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
			at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
			at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
			at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
			at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
			at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
			at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
			at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
			at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
			at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
			at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
			at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
			at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
			at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
			at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
			at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
			at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
			at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
			at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
			at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:389)
			at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
			at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
			at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
			at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
			at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult(Tasks.scala:423)
			at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult$(Tasks.scala:416)
			at scala.collection.parallel.ForkJoinTaskSupport.executeAndWaitResult(TaskSupport.scala:60)
			at scala.collection.parallel.ParIterableLike$ResultMapping.leaf(ParIterableLike.scala:968)
			at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
			at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
			at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
			at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
			at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
			at scala.collection.parallel.ParIterableLike$ResultMapping.tryLeaf(ParIterableLike.scala:963)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
			... 7 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 70 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=__HIVE_DEFAULT_PARTITION__
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
				at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
				at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
				at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
				at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
				at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
				at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
				at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
				at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
				at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
				at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
				at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
				at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
				at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
				at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
				at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
				at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
				at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
				at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
				... 10 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 50 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=__HIVE_DEFAULT_PARTITION__
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
				at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
				at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
				at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
				at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
				at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
				at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
				at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
				at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
				at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
				at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
				at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
				at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
				at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
				at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
				at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
				at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
				at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
				at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
				... 10 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 50 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
				... 39 more
				Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
					at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
					at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
					at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
					at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
					at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
					at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
					at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
					at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
					at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
					at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
					at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
					at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
					at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
					at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
					at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
					at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
					at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
					at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
					at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
					at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
					at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
					at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
					... 10 more
				Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
					... 50 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 38 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 58 more


# Feature selection

In [9]:
import pyspark.sql.functions as F
# drop rows with missing values
df2 = airbnb.select(features + [label]).na.drop()
df2 = df2.withColumnRenamed('log_price', 'label')
df2

Py4JJavaError: An error occurred while calling o151.showString.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
	at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
	at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:389)
	at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult(Tasks.scala:423)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult$(Tasks.scala:416)
	at scala.collection.parallel.ForkJoinTaskSupport.executeAndWaitResult(TaskSupport.scala:60)
	at scala.collection.parallel.ParIterableLike$ResultMapping.leaf(ParIterableLike.scala:968)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$ResultMapping.tryLeaf(ParIterableLike.scala:963)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
			at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
			at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
			at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
			at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
			at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
			at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
			at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
			at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
			at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
			at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
			at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
			at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
			... 7 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 35 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
			at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
			at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
			at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
			at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
			at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
			at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
			at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
			at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
			at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
			at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
			at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
			at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:170)
			... 10 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
				at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
				at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
				at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
				at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
				at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
				at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
				at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
				at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
				at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
				at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
				at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
				at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:170)
				... 10 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 38 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
				... 39 more
				Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
					at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
					at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
					at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
					at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
					at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
					at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
					at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
					at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
					at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
					at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
					at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
					at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
					at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
					at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
					at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
					at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
					at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
					at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
					at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
					at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
					at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
					at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
					... 10 more
				Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
					... 50 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 38 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 58 more


Py4JJavaError: An error occurred while calling o151.getRowsToPython.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
	at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
	at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
	at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
	at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:389)
	at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
	at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult(Tasks.scala:423)
	at scala.collection.parallel.ForkJoinTasks.executeAndWaitResult$(Tasks.scala:416)
	at scala.collection.parallel.ForkJoinTaskSupport.executeAndWaitResult(TaskSupport.scala:60)
	at scala.collection.parallel.ParIterableLike$ResultMapping.leaf(ParIterableLike.scala:968)
	at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
	at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
	at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
	at scala.collection.parallel.ParIterableLike$ResultMapping.tryLeaf(ParIterableLike.scala:963)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
	at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
	at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
	at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
	at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
	at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
	at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
	at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=__HIVE_DEFAULT_PARTITION__
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
			at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
			at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
			at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
			at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
			at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
			at scala.Option.getOrElse(Option.scala:189)
			at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
			at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
			at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
			at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
			at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
			at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
			at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
			at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
			at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
			at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
			at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
			at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
			... 7 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=__HIVE_DEFAULT_PARTITION__
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 35 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=6.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
				at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
				at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
				at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
				at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
				at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
				at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
				at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
				at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
				at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
				at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
				at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
				at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute$(Tasks.scala:149)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.compute(Tasks.scala:440)
				at java.util.concurrent.RecursiveAction.exec(RecursiveAction.java:189)
				at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
				at java.util.concurrent.ForkJoinPool.helpStealer(ForkJoinPool.java:1958)
				at java.util.concurrent.ForkJoinPool.awaitJoin(ForkJoinPool.java:2047)
				at java.util.concurrent.ForkJoinTask.doJoin(ForkJoinTask.java:390)
				at java.util.concurrent.ForkJoinTask.join(ForkJoinTask.java:719)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync(Tasks.scala:379)
				at scala.collection.parallel.ForkJoinTasks$WrappedTask.sync$(Tasks.scala:379)
				at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.sync(Tasks.scala:440)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:174)
				... 10 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=2.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 50 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=3.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=10.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
	Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
		at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
		at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
		at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
		at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
		at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
		at scala.Option.getOrElse(Option.scala:189)
		at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
		at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
		at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
		at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
		at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
		at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
		at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
		at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
		at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
		at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
		at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal(Tasks.scala:160)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.internal$(Tasks.scala:157)
		at scala.collection.parallel.AdaptiveWorkStealingForkJoinTasks$WrappedTask.internal(Tasks.scala:440)
		at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:150)
		... 7 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
				at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
				at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
				at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
				at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
				at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
				at scala.Option.getOrElse(Option.scala:189)
				at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
				at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
				at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
				at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
				at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
				at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
				at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
				at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
				at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
				at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
				at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
				at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
				... 7 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=9.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 35 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=4.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
		Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
			... 39 more
			Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
				... 39 more
				Suppressed: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
					at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
					at org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.listStatus(AvroContainerInputFormat.java:42)
					at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
					at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
					at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
					at scala.Option.getOrElse(Option.scala:189)
					at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1(UnionRDD.scala:85)
					at org.apache.spark.rdd.UnionRDD.$anonfun$getPartitions$1$adapted(UnionRDD.scala:85)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner(RemainsIterator.scala:116)
					at scala.collection.parallel.AugmentedIterableIterator.map2combiner$(RemainsIterator.scala:113)
					at scala.collection.parallel.immutable.ParVector$ParVectorIterator.map2combiner(ParVector.scala:66)
					at scala.collection.parallel.ParIterableLike$Map.leaf(ParIterableLike.scala:1064)
					at scala.collection.parallel.Task.$anonfun$tryLeaf$1(Tasks.scala:53)
					at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
					at scala.util.control.Breaks$$anon$1.catchBreak(Breaks.scala:67)
					at scala.collection.parallel.Task.tryLeaf(Tasks.scala:56)
					at scala.collection.parallel.Task.tryLeaf$(Tasks.scala:50)
					at scala.collection.parallel.ParIterableLike$Map.tryLeaf(ParIterableLike.scala:1061)
					at scala.collection.parallel.AdaptiveWorkStealingTasks$WrappedTask.compute(Tasks.scala:153)
					... 7 more
				Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=0.0
					at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
					... 35 more
			Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=7.0
				at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
				... 38 more
		Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=1.0
			at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
			... 38 more
	Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=8.0
		at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
		... 38 more
Caused by: java.io.IOException: Input path does not exist: hdfs://hadoop-02.uni.innopolis.ru:8020/user/team23/project/warehouse/airbnb_houses_part_buck/bedrooms=5.0
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 58 more


In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Tokenizer, RegexTokenizer
from pyspark.sql.functions import col

categoricalCols = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city', 'neighbourhood', 'zipcode']
textCols = ['name', 'description']
dateCols = ['host_since']
booleanCols = ['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
numericalCols = ['accommodates', 'bathrooms', 'number_of_reviews', 'beds', 'bedrooms']
geoCols = [['latitude', 'longitude']]
jsonCols = ['amenities']

# cast all boolean and numerical columns to the same type
numericalCols += booleanCols
for c in numericalCols:
    df2 = df2.withColumn(c, df2[c].cast('float'))
df2

property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,city,description,host_has_profile_pic,host_identity_verified,host_since,instant_bookable,latitude,longitude,name,neighbourhood,number_of_reviews,zipcode,beds,bedrooms,label
House,Entire home/apt,"{TV,""Wireless Int...",7.0,2.0,Real Bed,strict,1.0,LA,这间四房的PUD位于和亚凯迪亚的交...,1.0,1.0,2017-02-01,1.0,34.130149873646424,-118.01142964459407,洛杉矶之家 Home by Arc...,Arcadia,0.0,91016.0,4.0,4.0,5.594711379601837
House,Entire home/apt,"{TV,""Cable TV"",""W...",6.0,3.5,Real Bed,strict,1.0,DC,We live in a newl...,1.0,1.0,2015-10-23,0.0,38.971519809471495,-77.0109846035962,New Built Home in...,Takoma,5.0,20012.0,4.0,4.0,5.1929568508902095
Apartment,Entire home/apt,"{TV,""Cable TV"",In...",12.0,1.0,Real Bed,strict,1.0,Chicago,I renovated this ...,1.0,1.0,2016-01-31,0.0,41.886205398796,-87.70363964577317,Urban Oasis @ Ful...,Garfield Park,50.0,60612.0,4.0,4.0,5.075173815233828
House,Entire home/apt,"{TV,""Cable TV"",In...",10.0,2.5,Real Bed,strict,1.0,DC,SPECTACULAR VICTO...,1.0,1.0,2012-11-29,1.0,38.90697613794983,-77.02699276815332,SPECTACULAR HOME ...,Logan Circle,81.0,20005.0,5.0,4.0,6.214608098422191
House,Entire home/apt,"{TV,""Cable TV"",In...",8.0,4.5,Real Bed,strict,1.0,LA,Located centrally...,1.0,1.0,2014-09-08,0.0,34.113917348336905,-118.3864051817073,Hollywood Hills C...,Laurel Canyon,6.0,90046.0,4.0,4.0,7.170119543449628
Townhouse,Entire home/apt,"{TV,Internet,""Wir...",8.0,3.0,Real Bed,strict,1.0,NYC,My quaint & histo...,1.0,0.0,2016-07-12,0.0,40.692738449610665,-73.96507684327614,Spacious & Chic 4...,Clinton Hill,7.0,11205.0,5.0,4.0,6.003887067106539
Apartment,Entire home/apt,"{TV,""Wireless Int...",8.0,2.0,Real Bed,strict,1.0,NYC,Our luxury loft o...,1.0,1.0,2009-12-26,0.0,40.80164249642196,-73.93922139409575,SpaHa Loft: Enorm...,East Harlem,138.0,10035.0,6.0,4.0,5.416100402204419
House,Entire home/apt,"{TV,""Cable TV"",In...",8.0,2.0,Real Bed,strict,1.0,LA,Charming Spanish ...,1.0,1.0,2008-10-16,0.0,34.132253038035465,-118.38372996051108,Laurel Canyon - T...,Studio City,0.0,91604.0,5.0,4.0,5.814130531825067
House,Entire home/apt,"{TV,""Wireless Int...",7.0,1.5,Real Bed,moderate,1.0,LA,My place is close...,1.0,1.0,2016-04-21,0.0,34.111469945695035,-118.1155548058026,Cute House 10 min...,San Gabriel,4.0,91775.0,5.0,4.0,5.272999558563747
House,Entire home/apt,"{TV,Internet,""Wir...",9.0,2.5,Real Bed,strict,0.0,LA,A Vacation Home F...,1.0,1.0,2012-07-17,0.0,34.18856610064488,-118.12231125876929,Beautiful & Tranq...,Altadena,0.0,91001.0,7.0,4.0,6.551080335043403


# Feature extraction

In [11]:
# Custom transformer for date features in YYYY-MM-DD format
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType
from pyspark.ml.linalg import Vectors, VectorUDT
import pyspark.sql.functions as F
import math
    
class YMDTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):  
    @keyword_only
    def __init__(self, inputCol: str = "input", outputCol: str = "output"):
        super(YMDTransformer, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
  
    def _transform(self, df: DataFrame):
        input_col = self.getInputCol()
        output_col = self.getOutputCol()
        
        DAY_PERIOD = 31
        MONTH_PERIOD = 12
        
        # split the data and cast to float
        input_col = F.split(df[input_col], '-').cast("array<float>")
        
        # apply transform to day
        d_sin = F.sin(2 * math.pi * F.element_at(input_col, 3) / DAY_PERIOD)
        d_cos = F.cos(2 * math.pi * F.element_at(input_col, 3) / DAY_PERIOD)
        
        # apply transform to month
        m_sin = F.sin(2 * math.pi * F.element_at(input_col, 2) / MONTH_PERIOD)
        m_cos = F.cos(2 * math.pi * F.element_at(input_col, 2) / MONTH_PERIOD)
        
        # year remains as is
        y = F.element_at(input_col, 1)
        
        # pack everything into a vector for VectorAssembler
        atov = F.udf(lambda l: Vectors.dense(l), VectorUDT())
        res = F.array(d_sin, d_cos, m_sin, m_cos, y)
        res = atov(res)
        
        return df.withColumn(output_col, res)

a = YMDTransformer(inputCol='123', outputCol="{0}_transformed".format('rar'))

ymd_transformers = [ YMDTransformer(inputCol=c, outputCol="{0}_transformed".format(c)) for c in dateCols ]

In [12]:
# Custom transformer for date features in YYYY-MM-DD format
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType
from pyspark.ml.linalg import Vectors, VectorUDT
import pyspark.sql.functions as F
import math
    
class ECEFTransformer(Transformer, HasInputCols, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):  
    @keyword_only
    def __init__(self, inputCols: str = "input", outputCol: str = "output"):
        super(ECEFTransformer, self).__init__()
        self.inputCols = inputCols
        self.outputCol = outputCol
  
    def _transform(self, df: DataFrame):
        input_cols = self.getInputCols()
        output_col = self.getOutputCol()
        
        a = 6378137.0;               # WGS-84 semi-major axis
        e2 = 6.6943799901377997e-3;  # WGS-84 first eccentricity squared
        
        lat = df[input_cols[0]]
        lon = df[input_cols[1]]
        
        n = a / F.sqrt(1 - e2 * F.sin(lat) * F.sin(lat));
        x = n * F.cos(lat) * F.cos(lon);    # ECEF x
        y = n * F.cos(lat) * F.sin(lon);    # ECEF y
        z = (n * (1 - e2 ))* F.sin(lat);    # ECEF z
        
        # pack everything into a vector for VectorAssembler
        atov = F.udf(lambda l: Vectors.dense(l), VectorUDT())
        res = F.array(x, y, z)
        res = atov(res)
        
        return df.withColumn(output_col, res)

ecef_transformers = [ ECEFTransformer(inputCols=c, outputCol="{0}_transformed".format(c)) for c in geoCols ]

In [13]:
from pyspark.ml.feature import PCA

# Tokenize textual features by words
tokenizers = [ RegexTokenizer(inputCol=c, outputCol="{0}_tokens".format(c), pattern=" ") for c in textCols ]
tokenizers += [ RegexTokenizer(inputCol=c, outputCol="{0}_tokens".format(c), pattern="[\",{}]+") for c in jsonCols ]

# Vectorize them
vectorizers = [ Word2Vec(vectorSize=50, seed=42, minCount=1, inputCol=tokenizer.getOutputCol(), outputCol="{0}_vectorized".format(tokenizer.getOutputCol())) for tokenizer in tokenizers ]

# Create String indexer to assign index for the string fields where each unique string will get a unique index
# String Indexer is required as an input for One-Hot Encoder 
# We set the case as `skip` for any string out of the input strings
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categoricalCols ]

# Encode the strings using One Hot encoding
# default setting: dropLast=True ==> For example with 5 categories, an input value of 2.0 would map to an output vector of [0.0, 0.0, 1.0, 0.0]. The last category is not included by default (configurable via dropLast), because it makes the vector entries sum up to one, and hence linearly dependent. So an input value of 4.0 maps to [0.0, 0.0, 0.0, 0.0].
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

# This will concatenate the input cols into a single column.
assembler = VectorAssembler(inputCols= \
                            [encoder.getOutputCol() for encoder in encoders] +\
                            [vectorizer.getOutputCol() for vectorizer in vectorizers] +\
                            [ymd.getOutputCol() for ymd in ymd_transformers] +\
                            [ecef.getOutputCol() for ecef in ecef_transformers] +\
                            numericalCols, outputCol= "features")

# Apply PCA to reduce dimetionalty and reduce computation time
pca = PCA(k=200, inputCol='features', outputCol='components')

# You can create a pipeline to use only a single fit and transform on the data.
pipeline = Pipeline(stages=ymd_transformers + ecef_transformers + tokenizers + vectorizers + indexers + encoders + [assembler] + [pca])


# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model = pipeline.fit(df2)
# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(df2)

display(data)

# We delete all features and keep only the features and label columns
transformed = data.select(["components", "label"])
transformed = transformed.withColumnRenamed('components', 'features')


from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4
# distinct values are treated as continuous.
#featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=1000).fit(data)
#transformed = featureIndexer.transform(data)

# Display the output Spark DataFrame
display(transformed)

AttributeError: 'YMDTransformer' object has no attribute 'host_since_transformed'

# Split the dataset

In [None]:
#  split the data into 60% training and 40% test (it is not stratified)
(train_data, test_data) = transformed.randomSplit([0.6, 0.4], seed = 10)

In [None]:
# replaced coalesce(1) with repartition(1) to fix OoM issue
def run(command):
    import os
    return os.popen(command).read()

train_data.select("features", "label")\
    .repartition(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > data/train.json")

test_data.select("features", "label")\
    .repartition(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > data/test.json")

# First model

## Build a model

In [None]:
from pyspark.ml.regression import LinearRegression
# Create Linear Regression Model
lr = LinearRegression()

# Fit the data to the pipeline stages
model_lr = lr.fit(train_data)

## Predict for test data

In [None]:
predictions = model_lr.transform(test_data)
predictions

## Evaluate the model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator1_rmse.evaluate(predictions)
r2 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))
print("R^2 on test data = {}".format(r2))

## Hyperparameter optimization

In [None]:
model_lr.params

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np


grid = ParamGridBuilder()
grid = grid.addGrid(
                    model_lr.aggregationDepth, [2, 3, 4])\
                    .addGrid(model_lr.regParam, np.logspace(1e-3,1e-1)
                    )\
                    .build()

cv = CrossValidator(estimator = lr, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator1_rmse,
                    parallelism = 5,
                    numFolds=2)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

## Best model 1


In [None]:
from pprint import pprint
model1 = bestModel
pprint(model1.extractParamMap())

## Save the model to HDFS

In [None]:
model1.write().overwrite().save("project/models/model1")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model1 models/model1")

## Predict for test data using best model1

In [None]:
predictions = model1.transform(test_data)
predictions.show()

In [None]:
predictions.select("label", "prediction")\
    .repartition(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model1_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv > output/model1_predictions.csv")

## Evaluate the best model1

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r21 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R^2 on test data = {}".format(r21))

# Second model

## Build a model

In [None]:
from pyspark.ml.regression import GBTRegressor

# Create Linear Regression Model
gbt = GBTRegressor()

# Fit the data to the pipeline stages
model_gbt = gbt.fit(train_data)

## Predict for test data

In [None]:
predictions = model_gbt.transform(test_data)
predictions.show()

## Evaluate the model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

## Hyperparameter optimization

In [None]:
model_gbt.params

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np


grid = ParamGridBuilder()
grid = grid.addGrid(model_gbt.maxDepth, [2, 5]).addGrid(model_gbt.lossType, ['squared', 'absolute']).build()

cv = CrossValidator(estimator = gbt, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator2_rmse,
                    parallelism = 5,
                    numFolds=2)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

## Best model 2


In [None]:
from pprint import pprint
model2 = bestModel
pprint(model2.extractParamMap())

## Save the model to HDFS

In [None]:
model2.write().overwrite().save("project/models/model2")

# Run it from root directory of the repository
run("hdfs dfs -get project/models/model2 models/model2")

## Predict for test data using best model2

In [None]:
predictions = model2.transform(test_data)
predictions.show()

In [None]:
predictions.select("label", "prediction")\
    .repartition(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model2_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv > output/model2_predictions.csv")

## Evaluate the best model2

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

# Compare best models

In [None]:
models = [[str(model1),rmse1, r21], [str(model2),rmse2, r22]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)

In [None]:
df.repartition(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/evaluation.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/output/evaluation.csv/*.csv > output/evaluation.csv")