Read in dataset and set curpath to working directory

In [23]:
import sys
import os
import inspect
import numpy as np
import pandas as pd
from scipy.stats import skew
from scipy.stats import kurtosis

parentPath = '/'.join(sys.path[0].split('/')[:-1])

Create Spark Context & SQLContext

In [24]:
# Creating Spark Context
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

# Run the first time:
sc = SparkContext("local")

# Use to rerun script:
# sc = SparkContext.getOrCreate("local")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

Load dataset from csv, using the "Expedia Hotel Recommendations" Kaggle dataset: https://www.kaggle.com/c/expedia-hotel-recommendations/overview

In [25]:
dfExpedia = spark.read.load(
  parentPath+'/data/train.csv',
  format="csv",
  sep=",",
  inferSchema=True,
  header=True
)

# Expose as SQL table
dfExpedia.createOrReplaceTempView('dfExpedia')

KeyboardInterrupt: 

How many records in the training dataset?

In [7]:
dfExpedia.count()

37670293

How many columns in the training dataset?

In [8]:
len(dfExpedia.columns)

24

Take a 1% random sample of the training dataset

In [27]:
dfExpediaSample = dfExpedia.sample(withReplacement=False,fraction=0.01,seed=8)

How many records are in this sample?

In [28]:
dfExpediaSample.count()

375534

Preview the schema inferred when the dataset was read in

In [13]:
dfExpediaSample.printSchema()

root
 |-- date_time: timestamp (nullable = true)
 |-- site_name: integer (nullable = true)
 |-- posa_continent: integer (nullable = true)
 |-- user_location_country: integer (nullable = true)
 |-- user_location_region: integer (nullable = true)
 |-- user_location_city: integer (nullable = true)
 |-- orig_destination_distance: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- is_mobile: integer (nullable = true)
 |-- is_package: integer (nullable = true)
 |-- channel: integer (nullable = true)
 |-- srch_ci: timestamp (nullable = true)
 |-- srch_co: timestamp (nullable = true)
 |-- srch_adults_cnt: integer (nullable = true)
 |-- srch_children_cnt: integer (nullable = true)
 |-- srch_rm_cnt: integer (nullable = true)
 |-- srch_destination_id: integer (nullable = true)
 |-- srch_destination_type_id: integer (nullable = true)
 |-- is_booking: integer (nullable = true)
 |-- cnt: integer (nullable = true)
 |-- hotel_continent: integer (nullable = true)
 |-- hotel_country: 

Parse out timestamp and unique identifier columns

In [None]:
dfExpediaSample = dfExpediaSample.selectExpr(
                                             'site_name'
                                            ,'posa_continent'
                                            ,'user_location_country'
                                            ,'user_location_region'
                                            ,'user_location_city'
                                            ,'orig_destination_distance'
                                            ,'is_mobile'
                                            ,'is_package'
                                            ,'srch_adults_cnt'
                                            ,'srch_children_cnt'
                                            ,'srch_rm_cnt'
                                            ,'srch_destination_id'
                                            ,'srch_destination_type_id'
                                            ,'is_booking'
                                            ,'cnt'
                                            ,'hotel_continent'
                                            ,'hotel_country'
                                            ,'hotel_market'
                                            ,'hotel_cluster'
                                            )

dfExpediaSample.createOrReplaceTempView('dfExpediaSample') # Expose as SQL table

Convert Spark data.frame to Pandas data.frame

In [29]:
pd_dfExpediaSample = dfExpediaSample.toPandas()

Write sample data.frame to disk

In [30]:
pd_dfExpediaSample.to_csv(parentPath+'/data/pd_dfExpediaSample.csv', index=False)

In [1]:
pd_dfExpediaSample.head()

NameError: name 'pd_dfExpediaSample' is not defined

Create Exploratory Data Analysis Report on numeric variables

In [28]:
# Subset the numeric features
pd_dfExpediaSamplenum = pd_dfExpediaSample[pd_dfExpediaSample.describe(include=[np.number]).columns]

# Create the numeric variable table
for i in range(0, len(pd_dfExpediaSamplenum.columns)):
    if i == 0:
        report = pd.DataFrame({
            'Variable_Name': [pd_dfExpediaSamplenum.columns[i]],
            'Data_Type': [str(pd_dfExpediaSamplenum.iloc[:,[i]].dtypes[0])],
            'Mean': round(np.mean(pd_dfExpediaSamplenum.iloc[:,[i]].dropna()).sum(), 2),
            'Standard_Deviation': round(np.std(pd_dfExpediaSamplenum.iloc[:,[i]].dropna()).sum(), 2),
            'Skew.2SE': '%.2f'%(skew(pd_dfExpediaSamplenum.iloc[:,[i]])/(pd_dfExpediaSamplenum.iloc[:,[i]].sem()*2)),
            'Kurtosis.2SE': '%.2f'%(kurtosis(pd_dfExpediaSamplenum.iloc[:,[i]].dropna())[0]/(pd_dfExpediaSamplenum.iloc[:,[i]].sem()*2)),
            'Missing_Values': [pd_dfExpediaSamplenum.iloc[:,[i]].isna().sum().sum()],
            'Percent_Missing': ['%.2f'%(pd_dfExpediaSamplenum.iloc[:,[i]].isna().sum().sum()/len(pd_dfExpediaSamplenum)*100)]
            })
    else:
        report.loc[len(report)] = {
            'Variable_Name': pd_dfExpediaSamplenum.columns[i],
            'Data_Type': str(pd_dfExpediaSamplenum.iloc[:,[i]].dtypes[0]),
            'Mean': round(np.mean(pd_dfExpediaSamplenum.iloc[:,[i]].dropna()).sum(), 2),
            'Standard_Deviation': round(np.std(pd_dfExpediaSamplenum.iloc[:,[i]].dropna()).sum(), 2),
            'Skew.2SE': '%.2f'%(skew(pd_dfExpediaSamplenum.iloc[:,[i]])/(pd_dfExpediaSamplenum.iloc[:,[i]].sem()*2)),
            'Kurtosis.2SE': '%.2f'%(kurtosis(pd_dfExpediaSamplenum.iloc[:,[i]].dropna())[0]/(pd_dfExpediaSamplenum.iloc[:,[i]].sem()*2)),
            'Missing_Values': pd_dfExpediaSamplenum.iloc[:,[i]].isna().sum().sum(),
            'Percent_Missing': '%.2f'%(pd_dfExpediaSamplenum.iloc[:,[i]].isna().sum().sum()/len(pd_dfExpediaSamplenum)*100)
            }
report

Unnamed: 0,Variable_Name,Data_Type,Mean,Standard_Deviation,Skew.2SE,Kurtosis.2SE,Missing_Values,Percent_Missing
0,site_name,int32,9.8,11.97,38.04,2.31,0,0.0
1,posa_continent,int32,2.68,0.75,-730.21,762.48,0,0.0
2,user_location_country,int32,86.13,59.22,6.95,1.85,0,0.0
3,user_location_region,int32,308.95,208.76,2.01,2.73,0,0.0
4,user_location_city,int32,27726.7,16790.19,0.0,-0.03,0,0.0
5,orig_destination_distance,float64,1963.72,2228.11,,0.28,189204,35.93
6,user_id,int32,604039.26,350666.4,-0.0,-0.0,0,0.0
7,is_mobile,int32,0.13,0.34,2281.27,2764.14,0,0.0
8,is_package,int32,0.25,0.43,972.27,-550.29,0,0.0
9,channel,int32,5.86,3.72,-48.4,-150.42,0,0.0


In [36]:
# Remove columns that have more than 25% of records missing
pd_dfExpediaSample.drop(report.query('Missing_Values > 20.0')['Variable_Name'].max(), axis=1, inplace=True)

In [39]:
pd_dfExpediaSample.describe(include=[np.object, pd.Categorical])

ValueError: No objects to concatenate

In [None]:
# correlation matrix looking only at the predictor variable
dfnum = pd_dfExpediaSamplenum[pd_dfExpediaSamplenum.describe(include=[np.number]).columns]

corr = dfnum.corr()

corr

In [None]:
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=64,
    horizontalalignment='right'
);

In [1]:
import sys
import os
import inspect

#filename = inspect.getframeinfo(inspect.currentframe()).filename
#curpath = os.path.dirname(os.path.abspath(filename))

In [11]:
parentPath = sys.path[0].split('/')[1]+'/'+sys.path[0].split('/')[2]

'home/kl'

In [18]:
for x in range(len(sys.path[0].split('/')[:-1])):
    print(x)

0
1
2
3
4


'/home/kl/Documents/Expedia-Hotel-Prediction-Workflow'

In [21]:
help(''.join)

Help on built-in function join:

join(iterable, /) method of builtins.str instance
    Concatenate any number of strings.
    
    The string whose method is called is inserted in between each given string.
    The result is returned as a new string.
    
    Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'

