In [27]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# IAM role the notebook runs under, the S3 key prefix for this demo, and the
# region of the current boto3 session.
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = boto3.session.Session().region_name

# Look up the XGBoost container image URI for this region (nothing is built here;
# the call only resolves the URI string).
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    f"Success - the MySageMakerInstance is in the {my_region} region. "
    f"You will use the {xgboost_container} container for your SageMaker endpoint."
)

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [3]:
# Create the S3 bucket that stores model-related artifacts. The cell is safe to
# re-run: a bucket that already belongs to this account is reported as such
# instead of as an error.
bucket_name = 'airplane-ticket-model'
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is created without an explicit location constraint
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # Raised when the bucket was created by an earlier run of this cell;
    # the bucket is still usable, so this is not a failure.
    print('S3 bucket already exists and is owned by you')
except Exception as e:
    # Best effort: report any other failure without stopping the notebook
    print('S3 error: ',e)

S3 bucket created successfully


In [4]:
# Read the flights CSV directly from its S3 bucket into a DataFrame
bucket = 'planetickets'
data_key = 'Cleaned_2018_Flights.csv'
data_location = f's3://{bucket}/{data_key}'
data = pd.read_csv(data_location)

In [5]:
# Data cleaning and splitting pipeline
# Remove unnecessary variables. `data` is reassigned here, so on a second run of
# this cell the columns are already gone; errors='ignore' keeps the cell
# re-runnable instead of raising KeyError.
data = data.drop(
    columns=['Unnamed: 0', 'ItinID', 'MktID', 'MktCoupons', 'OriginWac', 'DestWac', 'ContiguousUSA', 'Miles'],
    errors='ignore',
)

# Y: PricePerTicket (kept as a one-column DataFrame); X: all other variables
X = data.drop(columns=['PricePerTicket'])
Y = data[['PricePerTicket']]

# Split training and testing dataset: 33% held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [25]:
# Feature transformation pipeline: column names grouped by feature type
numerical_cols = ['NumTicketsOrdered']
categorical_cols = [
    'Quarter',
    'Origin',
    'Dest',
    'AirlineCompany',
]

In [26]:
# Feature engineering pipeline
# Planning notes on the type of each feature. The bare string below is the
# cell's last expression, so the notebook echoes its repr as the cell output.
"""
Quarter: categorical
Origin: categorical
Dest: categorical
NumTicketsOrdered: Discrete Numeric
AirlineCompany: Categorical

Ideas: use PCA for dimensionality reduction
"""

'\nQuarter: categorical\nOrigin: categorical\nDest: categorical\nNumTicketsOrdered: Discrete Numeric\nAirlineCompany: Categorical\n\nIdeas: use PCA for dimensionality reduction\n'

In [17]:
# Scratch arithmetic; evaluates to a (rows, columns) tuple.
# NOTE(review): presumably the expected shape of the one-hot-encoded feature matrix,
# with addends for the level counts of Quarter (4), Origin (263), Dest (260), the
# single numeric column (1) and AirlineCompany (12) — TODO confirm against the data.
(6388059, 4 + 263 + 260 + 1 + 12)

(6388059, 540)

In [29]:
# Fit a one-hot encoder on the AirlineCompany column alone.
# fit() stays the last expression so the notebook displays the fitted estimator.
enc = OneHotEncoder()
enc.fit(data.loc[:, ['AirlineCompany']])

OneHotEncoder()

In [None]:
# Preview the encoded airline column on the first few rows only. transform()
# yields a sparse matrix here; calling toarray() on the whole dataset would
# allocate a dense value for every row/category pair just to show a truncated
# printout.
enc.transform(data[['AirlineCompany']].head()).toarray()