Import libraries and set parentPath to the parent directory of sys.path[0]

In [1]:
import os
import sys
import time
import inspect
import numpy as np
import pandas as pd
from scipy.stats import skew
from scipy.stats import kurtosis

# Everything before the last '/' of sys.path[0]; the data paths below are built from this.
parentPath = sys.path[0].rpartition('/')[0]

Create Spark Context & SQLContext

In [2]:
# Creating Spark Context
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

# Build (or reuse) a local-mode session. getOrCreate() makes this cell safe to
# re-run: constructing SparkContext("local") a second time in the same kernel
# raises "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local").getOrCreate()

# The session owns the SparkContext; expose it under the name used elsewhere.
sc = spark.sparkContext

# Legacy SQL entry point, kept so any code that still references sqlContext works.
sqlContext = SQLContext(sc)

Load dataset from csv, using the "Expedia Hotel Recommendations" Kaggle dataset: https://www.kaggle.com/c/expedia-hotel-recommendations/overview

In [3]:
# Read the training CSV: comma-separated, first row is the header, and column
# types are inferred by Spark rather than declared.
dfExpedia = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema=True, header=True)
    .load(parentPath + '/data/train.csv')
)

# Register the DataFrame as a temp view so it can be queried with Spark SQL
dfExpedia.createOrReplaceTempView('dfExpedia')

How many records are in the training dataset?

In [4]:
# Evaluate the DataFrame and return its total row count.
dfExpedia.count()

2528243

How many columns are in the training dataset?

In [5]:
# Number of columns in the schema inferred at load time.
len(dfExpedia.columns)

22

Take a 1% random sample of the training dataset

In [6]:
# Draw roughly 1% of the rows without replacement. The fraction is approximate,
# not an exact row quota, so the sample size will be close to but not exactly
# 1% of the input. The seed is fixed so the draw can be reproduced.
dfExpediaSample = dfExpedia.sample(
    withReplacement=False,
    fraction=0.01,
    seed=8,
)

How many records are in this sample?

In [7]:
# Row count of the sample; expect about (not exactly) 1% of the full dataset,
# because sample() treats the fraction as approximate.
dfExpediaSample.count()

25372

Preview the schema inferred when the dataset was read in

In [8]:
# Print each column's name, inferred type and nullability as a tree.
dfExpediaSample.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date_time: timestamp (nullable = true)
 |-- site_name: integer (nullable = true)
 |-- posa_continent: integer (nullable = true)
 |-- user_location_country: integer (nullable = true)
 |-- user_location_region: integer (nullable = true)
 |-- user_location_city: integer (nullable = true)
 |-- orig_destination_distance: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- is_mobile: integer (nullable = true)
 |-- is_package: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- srch_ci: string (nullable = true)
 |-- srch_co: timestamp (nullable = true)
 |-- srch_adults_cnt: integer (nullable = true)
 |-- srch_children_cnt: integer (nullable = true)
 |-- srch_rm_cnt: integer (nullable = true)
 |-- srch_destination_id: integer (nullable = true)
 |-- srch_destination_type_id: integer (nullable = true)
 |-- hotel_continent: integer (nullable = true)
 |-- hotel_country: integer (nullable = true)
 |-- hotel_market: integer (nullable = true)

Convert the Spark DataFrame to a pandas DataFrame

In [9]:
# Collect the sampled rows out of Spark into an in-memory pandas DataFrame.
pd_dfExpediaSample = dfExpediaSample.toPandas()

Write the sample pandas DataFrame to disk as a CSV file

In [10]:
# Persist the pandas sample as CSV; index=False keeps the row index out of the file.
pd_dfExpediaSample.to_csv(
    path_or_buf=parentPath + '/data/pd_dfExpediaSample.csv',
    index=False,
)

In [None]:
import sys
import os
import inspect
import numpy as np
import pandas as pd
import unittest
import time

# Put the parent of sys.path[0] at the front of the import path, once per kernel
# session. Without the guard, every re-run of this cell would prepend a directory
# one level higher, and the later cell that reads sys.path[0] into parentPath
# would then build its data paths from the wrong root.
if 'projectParentDir' not in globals():
    projectParentDir = '/'.join(sys.path[0].split('/')[:-1])
    sys.path.insert(0, projectParentDir)

import src.dataPreparation as dp

In [None]:
# Wall-clock timing: perf_counter() includes time spent waiting on disk, which
# process_time() (CPU time of this process only) leaves out, so it matches the
# "Elapsed time" label printed below.
tic = time.perf_counter()

# Splitting on '/' and re-joining with '/' returns the same string, so read it directly.
parentPath = sys.path[0]

# Load the sampled dataset from CSV into a pandas DataFrame.
df = pd.read_csv(parentPath+'/Expedia-Hotel-Prediction-Workflow/data/pd_dfExpediaSample.csv')

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Time this step with a wall-clock timer; process_time() reports CPU time only,
# which does not match the "Elapsed time" label printed below.
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably casts ID columns to categorical
# (inferred from the function name, confirm against the module).
df = dp.updateIDFieldsToCategoricalFeatures(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Wall-clock timer, so the printed "Elapsed time" is real elapsed time rather
# than the CPU time that process_time() measures.
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably converts the is_* flag columns
# to booleans (inferred from the function name, confirm against the module).
df = dp.updateISFieldsToBooleanFeatures(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# perf_counter() measures elapsed wall-clock time; process_time() would report
# CPU time only, which contradicts the "Elapsed time" label below.
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably drops columns with too many
# distinct values (inferred from the function name, confirm against the module).
df = dp.removeHighCardinalityFeatures(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Use a wall-clock timer so the figure printed as "Elapsed time" is elapsed
# time, not the process CPU time returned by process_time().
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably drops columns with a high share
# of NULLs (inferred from the function name, confirm against the module).
df = dp.removeHighNULLCntFeatures(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Elapsed wall-clock time via perf_counter(); process_time() counts CPU time
# only and so would mislabel the value printed below.
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably drops rows that still contain
# NULLs (inferred from the function name, confirm against the module).
df = dp.removeRemainingRecordsWithNULLS(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Measure elapsed wall-clock time; process_time() would give CPU time, which is
# not what the "Elapsed time" label promises.
tic = time.perf_counter()

# Implemented in src/dataPreparation; presumably one-hot encodes the categorical
# columns (inferred from the function name, confirm against the module).
df = dp.convertCategoricalVariablesToDummyVariables(df)

toc = time.perf_counter()
print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Wall-clock timer (perf_counter) to match the "Elapsed time" label; the write
# below is currently disabled, so this cell times an empty step.
tic = time.perf_counter()

# NOTE(review): the disabled line writes pd_dfExpediaSample, not the cleansed
# `df` built by the preceding dp.* steps. Confirm which frame is meant to be
# saved before re-enabling it.
#pd_dfExpediaSample.to_csv(parentPath+'/Expedia-Hotel-Prediction-Workflow/data/pd_FullCleansedDataset.csv', index=False)

toc = time.perf_counter()

print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Wall-clock timing is required here: Spark does the read in the JVM, outside
# this Python process, so process_time() (CPU time of this process only) would
# miss most of the work and under-report the "Elapsed time".
tic = time.perf_counter()

# Read the full training CSV with a header row and Spark-inferred column types.
dfExpedia = spark.read.load(
  parentPath+'/Expedia-Hotel-Prediction-Workflow/data/train.csv',
  format="csv",
  sep=",",
  inferSchema=True,
  header=True
)

toc = time.perf_counter()

print("Elapsed time:", round(toc-tic, 3), "seconds")

In [None]:
# Convert the last measured duration to minutes first, then round, so the
# printed value really has three decimals (rounding the seconds and dividing by
# 60 afterwards leaves an unrounded result).
print("Elapsed time:", round((toc-tic)/60, 3), "minutes")

In [None]:
# Call the method: without the parentheses the cell only displays the bound
# method object instead of printing the schema tree.
dfExpedia.printSchema()