# Assignment

## About Dataset:

The dataset is frome **Kaggle**(https://www.kaggle.com/datasets/jessemostipak/hotel-booking-demand).

## Install:

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): still running...
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845518 sha256=91713f5be2c8981469506de6d6d61e39c77062ec02d87584b16912cdcd056a81
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\43\dc\11\ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.3.1


## Import Libraries:

In [3]:
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import pyspark
from pyspark.sql import functions as F
from pyspark.sql import types

from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

warnings.filterwarnings('ignore')

## Connect to the Spark server:

In [4]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Obtain the Data:

In [5]:
fullpath = 'hotel_bookings.csv'

data = spark.read.csv(fullpath)

In [6]:
data

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string, _c27: string, _c28: string, _c29: string, _c30: string, _c31: string]

In [7]:
# read.csv is very similar to the Pandas version
data = spark.read.csv(fullpath,
                     sep=',',
                     inferSchema=True,
                     header=True,
                     multiLine=True)

data.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullab

## Data Cleaning:

### Drop unneeded columns:

In [8]:
# these columns are useless to us, drop them
drop_cols = ['agent', 'company']

data = data.drop(*drop_cols)

### Check for duplicates:

In [9]:
print('Count of rows: {0}'.format(data.count()))
print('Count of distinct rows: {0}'.format(data.distinct().count()))

Count of rows: 119390
Count of distinct rows: 87370


In [14]:
# Check for missing values
data.select(*[
    (
        F.count(F.when((F.isnan(c) | F.col(c).isNull()), c)) if t not in ("timestamp", "date")
        else F.count(F.when(F.col(c).isNull(), c))
    ).alias(c)
    for c, t in data.dtypes if c in data.columns
    ]).show()

+-----+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+--------------------+-------------+---+---------------------------+-------------------------+------------------+-----------------------+
|hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|days_in_waiting_list|customer_type|adr|required_car_parking_spaces|total_of_special_requests|reservation_status|reser

In [15]:
# Drop the null values from humidity and barometer columns
data.na.drop(how="any").show(truncate=False)

+------------+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+--------------------+-------------+------+---------------------------+-------------------------+------------------+-----------------------+
|hotel       |is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|days_in_waiting_list|customer_type|adr   |required_car_parking_spaces|total_of_special_requests|rese

In [19]:
# Latitude should be between (-90, 90), otherwise set null
data = data.withColumn('is_canceled', F.when(F.col('is_canceled').rlike('^-?\d+\.?\d+$'), F.col('is_canceled')))
data = data.withColumn('is_canceled', F.col('is_canceled').cast(types.FloatType()))
data = data.withColumn('is_canceled', F.when((-90 <= F.col('is_canceled')) & (F.col('is_canceled') <= 90), F.col('is_canceled')))

In [20]:
# Longitude should be between (-180, 180), otherwise set null
data = data.withColumn('is_repeated_guest', F.when(F.col('is_repeated_guest').rlike('^-?\d+\.?\d+$'), F.col('is_repeated_guest')))
data = data.withColumn('is_repeated_guest', F.col('is_repeated_guest').cast(types.FloatType()))
data = data.withColumn('is_repeated_guest', F.when((-180 <= F.col('is_repeated_guest')) & (F.col('is_repeated_guest') <= 180), F.col('is_repeated_guest')))

In [21]:
# Posting date should be in format yyyy-MM-dd HH:mm:ss.SSSS
data = data.withColumn('arrival_date_year', F.when(F.col('arrival_date_year').rlike('^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{4}$'), F.col('arrival_date_year')))
data = data.withColumn('arrival_date_year', F.col('arrival_date_year').cast(types.TimestampType()))

### Descriptive Statistics:

In [22]:
data.select('is_canceled','is_repeated_guest','arrival_date_year').summary().show()

+-------+-----------+-----------------+
|summary|is_canceled|is_repeated_guest|
+-------+-----------+-----------------+
|  count|          0|                0|
|   mean|       null|             null|
| stddev|       null|             null|
|    min|       null|             null|
|    25%|       null|             null|
|    50%|       null|             null|
|    75%|       null|             null|
|    max|       null|             null|
+-------+-----------+-----------------+



### Data Wrangling:

In [23]:
# both columns starts with 0, log(0) is undefined so we should put +1
data = data.withColumn('is_canceled', F.log10(F.col('is_canceled') + 1))
data = data.withColumn('is_repeated_guest', F.log10(F.col('is_repeated_guest') + 1))

In [24]:
data.select('is_canceled','is_repeated_guest','arrival_date_year').summary().show()

+-------+-----------+-----------------+
|summary|is_canceled|is_repeated_guest|
+-------+-----------+-----------------+
|  count|          0|                0|
|   mean|       null|             null|
| stddev|       null|             null|
|    min|       null|             null|
|    25%|       null|             null|
|    50%|       null|             null|
|    75%|       null|             null|
|    max|       null|             null|
+-------+-----------+-----------------+



### Data Exploration:

In [27]:
data.registerTempTable('data.hotel')

state_counts = spark.sql(r"""SELECT state, COUNT(state) AS total 
                                     FROM data.hotel 
                                     GROUP BY state 
                                     ORDER BY total desc """)
state_counts.show()

Py4JJavaError: An error occurred while calling o23.sql.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:123)
	at org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.createDatabase(InMemoryCatalog.scala:120)
	at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:153)
	at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:140)
	at org.apache.spark.sql.internal.SharedState.globalTempViewManager$lzycompute(SharedState.scala:170)
	at org.apache.spark.sql.internal.SharedState.globalTempViewManager(SharedState.scala:168)
	at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$catalog$2(BaseSessionStateBuilder.scala:154)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.globalTempViewManager$lzycompute(SessionCatalog.scala:122)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.globalTempViewManager(SessionCatalog.scala:122)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.lookupGlobalTempView(SessionCatalog.scala:949)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.lookupTempView(Analyzer.scala:1103)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveRelations$$lookupRelation(Analyzer.scala:1187)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$13.applyOrElse(Analyzer.scala:1059)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$$anonfun$apply$13.applyOrElse(Analyzer.scala:1023)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$3(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:138)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:134)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$2(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1228)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1227)
	at org.apache.spark.sql.catalyst.plans.logical.Aggregate.mapChildren(basicLogicalOperators.scala:976)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:134)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$2(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1228)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1227)
	at org.apache.spark.sql.catalyst.plans.logical.Sort.mapChildren(basicLogicalOperators.scala:755)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsUpWithPruning$1(AnalysisHelper.scala:135)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning(AnalysisHelper.scala:134)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsUpWithPruning$(AnalysisHelper.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsUpWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:1023)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.apply(Analyzer.scala:982)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:211)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:208)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:227)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:223)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:172)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:223)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:187)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:208)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:207)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:76)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:185)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:185)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:184)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:76)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:622)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:617)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:104)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 22 more


## Histogram:

In [29]:
#top three manufacturers
df_result = spark.sql('SELECT manufacturer, price FROM data WHERE manufacturer IN ("is_canceled", "is_repeated_guest", "arrival_date_year")')

#convert to pandas prior to visualization
df_result = df_result.toPandas()

AnalysisException: Column 'manufacturer' does not exist. Did you mean one of the following? [data.adr, data.adults, data.country, data.hotel, data.babies, data.children, data.meal, data.lead_time, data.is_canceled, data.customer_type, data.deposit_type, data.market_segment, data.arrival_date_year, data.booking_changes, data.assigned_room_type, data.is_repeated_guest, data.reserved_room_type, data.arrival_date_month, data.reservation_status, data.days_in_waiting_list, data.distribution_channel, data.previous_cancellations, data.stays_in_week_nights, data.arrival_date_week_number, data.reservation_status_date, data.stays_in_weekend_nights, data.total_of_special_requests, data.arrival_date_day_of_month, data.required_car_parking_spaces, data.previous_bookings_not_canceled]; line 1 pos 43;
'Project ['manufacturer, 'price]
+- 'Filter 'manufacturer IN (is_canceled,is_repeated_guest,arrival_date_year)
   +- SubqueryAlias data
      +- View (`data`, [hotel#81,is_canceled#1422,lead_time#83,arrival_date_year#1127,arrival_date_month#85,arrival_date_week_number#86,arrival_date_day_of_month#87,stays_in_weekend_nights#88,stays_in_week_nights#89,adults#90,children#91,babies#92,meal#93,country#94,market_segment#95,distribution_channel#96,is_repeated_guest#1453,previous_cancellations#98,previous_bookings_not_canceled#99,reserved_room_type#100,assigned_room_type#101,booking_changes#102,deposit_type#103,days_in_waiting_list#106,customer_type#107,adr#108,required_car_parking_spaces#109,total_of_special_requests#110,reservation_status#111,reservation_status_date#112])
         +- Project [hotel#81, is_canceled#1422, lead_time#83, arrival_date_year#1127, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, LOG10(cast((is_repeated_guest#1065 + cast(1 as float)) as double)) AS is_repeated_guest#1453, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
            +- Project [hotel#81, LOG10(cast((is_canceled#972 + cast(1 as float)) as double)) AS is_canceled#1422, lead_time#83, arrival_date_year#1127, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#1065, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
               +- Project [hotel#81, is_canceled#972, lead_time#83, cast(arrival_date_year#1096 as timestamp) AS arrival_date_year#1127, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#1065, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                  +- Project [hotel#81, is_canceled#972, lead_time#83, CASE WHEN RLIKE(cast(arrival_date_year#84 as string), ^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{4}$) THEN arrival_date_year#84 END AS arrival_date_year#1096, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#1065, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                     +- Project [hotel#81, is_canceled#972, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, CASE WHEN ((is_repeated_guest#1034 >= cast(-180 as float)) AND (is_repeated_guest#1034 <= cast(180 as float))) THEN is_repeated_guest#1034 END AS is_repeated_guest#1065, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                        +- Project [hotel#81, is_canceled#972, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, cast(is_repeated_guest#1003 as float) AS is_repeated_guest#1034, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                           +- Project [hotel#81, is_canceled#972, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, CASE WHEN RLIKE(cast(is_repeated_guest#97 as string), ^-?\d+\.?\d+$) THEN is_repeated_guest#97 END AS is_repeated_guest#1003, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                              +- Project [hotel#81, CASE WHEN ((is_canceled#941 >= cast(-90 as float)) AND (is_canceled#941 <= cast(90 as float))) THEN is_canceled#941 END AS is_canceled#972, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#97, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                                 +- Project [hotel#81, cast(is_canceled#910 as float) AS is_canceled#941, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#97, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                                    +- Project [hotel#81, CASE WHEN RLIKE(cast(is_canceled#82 as string), ^-?\d+\.?\d+$) THEN is_canceled#82 END AS is_canceled#910, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#97, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                                       +- Project [hotel#81, is_canceled#82, lead_time#83, arrival_date_year#84, arrival_date_month#85, arrival_date_week_number#86, arrival_date_day_of_month#87, stays_in_weekend_nights#88, stays_in_week_nights#89, adults#90, children#91, babies#92, meal#93, country#94, market_segment#95, distribution_channel#96, is_repeated_guest#97, previous_cancellations#98, previous_bookings_not_canceled#99, reserved_room_type#100, assigned_room_type#101, booking_changes#102, deposit_type#103, days_in_waiting_list#106, ... 6 more fields]
                                          +- Relation [hotel#81,is_canceled#82,lead_time#83,arrival_date_year#84,arrival_date_month#85,arrival_date_week_number#86,arrival_date_day_of_month#87,stays_in_weekend_nights#88,stays_in_week_nights#89,adults#90,children#91,babies#92,meal#93,country#94,market_segment#95,distribution_channel#96,is_repeated_guest#97,previous_cancellations#98,previous_bookings_not_canceled#99,reserved_room_type#100,assigned_room_type#101,booking_changes#102,deposit_type#103,agent#104,... 8 more fields] csv


In [30]:
sns.set_context("notebook", font_scale=1.25)
sns.histplot(data=df_result, x='is_canceled', hue='is_repeated_guest', binwidth=0.3)

NameError: name 'df_result' is not defined