# 3803ICT Big Data Assignment - Code

## PART 1 DATA PREPERATION AND PRE-PROCESSING

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import compress_pickle as pickle
from math import pi
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
#Get high level view of the data
baseDataFrame = pd.read_csv('data/data.csv', low_memory=False)
print("The Dataset consists of: ", baseDataFrame.shape[0], " rows and ", baseDataFrame.shape[1], " columuns")
baseDataFrame.head(10)

The Dataset consists of:  318477  rows and  13  columuns


Unnamed: 0,Id,Title,Company,Date,Location,Area,Classification,SubClassification,Requirement,FullDescription,LowestSalary,HighestSalary,JobType
0,37404348,Casual Stock Replenisher,Aldi Stores,2018-10-07T00:00:00.000Z,Sydney,North West & Hills District,Retail & Consumer Products,Retail Assistants,Our Casual Stock Replenishers pride themselves...,,0,30,
1,37404337,Casual Stock Replenisher,Aldi Stores,2018-10-07T00:00:00.000Z,Richmond & Hawkesbury,,Retail & Consumer Products,Retail Assistants,Our Casual Stock Replenishers pride themselves...,,0,30,
2,37404356,RETAIL SALES SUPERSTARS and STYLISTS Wanted - ...,LB Creative Pty Ltd,2018-10-07T00:00:00.000Z,Brisbane,CBD & Inner Suburbs,Retail & Consumer Products,Retail Assistants,BRAND NEW FLAGSHIP STORE OPENING - SUNSHINE PLAZA,,0,30,
3,37404330,Team member - Belrose,Anaconda Group Pty Ltd,2018-10-07T00:00:00.000Z,Gosford & Central Coast,,Retail & Consumer Products,Retail Assistants,Bring it on - do you love the great outdoors a...,,0,30,
4,37404308,"Business Banking Contact Centre Specialist, Ni...",Commonwealth Bank - Business & Private Banking,2018-10-07T00:00:00.000Z,Sydney,Ryde & Macquarie Park,Call Centre & Customer Service,Sales - Inbound,"We are seeking highly articulate, enthusiastic...",,0,30,
5,37404307,"Business Banking Contact Centre Specialist, Ni...",Commonwealth Bank - Business & Private Banking,2018-10-07T00:00:00.000Z,Sydney,Ryde & Macquarie Park,Call Centre & Customer Service,Customer Service - Call Centre,"We are seeking highly articulate, enthusiastic...",,0,30,
6,37404355,Casual Café All-rounder in Semaphore,Jora Local,2018-10-07T00:00:00.000Z,Adelaide,,Hospitality & Tourism,Waiting Staff,A cafe/restaurant in Semaphore is seeking a ca...,,0,30,
7,37404350,Casual Café All-rounder in Woolloongabba,Jora Local,2018-10-07T00:00:00.000Z,Brisbane,CBD & Inner Suburbs,Hospitality & Tourism,Waiting Staff,A cafe in Woolloongabba is seeking a casual Ca...,,0,30,
8,37404301,Telemarketer,Reventon Investments,2018-10-07T00:00:00.000Z,Melbourne,CBD & Inner Suburbs,Call Centre & Customer Service,Sales - Outbound,Market leading investment & financial services...,,0,30,
9,37404286,Casual Dish Hand in Brighton,Jora Local,2018-10-07T00:00:00.000Z,Melbourne,Bayside & South Eastern Suburbs,Hospitality & Tourism,Kitchen & Sandwich Hands,A restaurant in Brighton is seeking a casual D...,,0,30,


In [3]:
#Get data types
baseDataFrame.dtypes

Id                   object
Title                object
Company              object
Date                 object
Location             object
Area                 object
Classification       object
SubClassification    object
Requirement          object
FullDescription      object
LowestSalary          int64
HighestSalary         int64
JobType              object
dtype: object

In [4]:
#Fix the date data type since it is in wrong format
baseDataFrame["Date"] = baseDataFrame["Date"].replace(to_replace=r'T.*', value='', regex=True)
baseDataFrame["Date"] = pd.to_datetime(baseDataFrame["Date"])
baseDataFrame.dtypes

Id                           object
Title                        object
Company                      object
Date                 datetime64[ns]
Location                     object
Area                         object
Classification               object
SubClassification            object
Requirement                  object
FullDescription              object
LowestSalary                  int64
HighestSalary                 int64
JobType                      object
dtype: object

In [5]:
#Fix full description data type
# Clear HTML tags from Full Description if they're present
# Only uncomment when required - this code takes a significant amount of time to process.
# from bs4 import BeautifulSoup
# for row in range(0, len(baseDataFrame["FullDescription"])):
#     soup = BeautifulSoup(baseDataFrame["FullDescription"][row])
#     baseDataFrame["FullDescription"][row] = soup.get_text()
# baseDataFrame.head(200)

In [6]:
#Remove duplicates in data
baseDataFrame = baseDataFrame.drop_duplicates(subset=['Company', 'Title', 'Location', 'Area', 'Classification', 'SubClassification',
                                                     'Requirement', 'FullDescription', 'LowestSalary', 'HighestSalary', 'JobType'])
print("The Dataset consists of: ", baseDataFrame.shape[0], " rows and ", baseDataFrame.shape[1], " columuns")

The Dataset consists of:  303146  rows and  13  columuns


In [7]:
# Cull any data past the 8 digit ID and change type to ID
baseDataFrame['Id'] = baseDataFrame['Id'].apply(lambda x: x[:8])
baseDataFrame = baseDataFrame.astype({"Id": int})
baseDataFrame.dtypes

Id                            int32
Title                        object
Company                      object
Date                 datetime64[ns]
Location                     object
Area                         object
Classification               object
SubClassification            object
Requirement                  object
FullDescription              object
LowestSalary                  int64
HighestSalary                 int64
JobType                      object
dtype: object

In [8]:
print(baseDataFrame.isna().sum())

Id                        0
Title                     0
Company               11850
Date                      0
Location             111051
Area                 183994
Classification       111051
SubClassification    111051
Requirement               7
FullDescription       15408
LowestSalary              0
HighestSalary             0
JobType               15336
dtype: int64


In [9]:
#Recatorgise missing data
baseDataFrame['Company'].fillna('None', inplace=True)
baseDataFrame['Location'].fillna('None', inplace=True)
baseDataFrame['Area'].fillna('None', inplace=True)
baseDataFrame['Classification'].fillna('None', inplace=True)
baseDataFrame['Requirement'].fillna('None', inplace=True)
baseDataFrame['FullDescription'].fillna('None', inplace=True)
baseDataFrame['JobType'].fillna('None', inplace=True)

In [10]:
#Determine range of every column
# Print the domains of all non-freeform fields
# Free-form fields are Requirement and FullDescription, where each entry is expected to be unique so the domain is limitless.
print(f"ID: [{min(baseDataFrame['Id'])}, {max(baseDataFrame['Id'])}] (mathematical domain)")
print(f"Title: {baseDataFrame.Title.unique()}")
print(f"Date: {min(baseDataFrame['Date'])} to {max(baseDataFrame['Date'])} (date range)")
print(f"Location: {baseDataFrame.Location.unique()}")
print(f"Area: {baseDataFrame.Area.unique()}")
print(f"Classification: {baseDataFrame.Classification.unique()}")
print(f"Sub-Classification: {baseDataFrame.SubClassification.unique()}")
print(f"Job Type: {baseDataFrame.JobType.unique()}")
print(f"Lowest Salary: [{min(baseDataFrame['LowestSalary'])}, {max(baseDataFrame['LowestSalary'])}] (mathematical domain)")
print(f"Highest Salary: [{min(baseDataFrame['HighestSalary'])}, {max(baseDataFrame['HighestSalary'])}] (mathematical domain)")
print(f"Average Salary: [{min(baseDataFrame['AverageSalary'])}, {max(baseDataFrame['AverageSalary'])}] (mathematical domain)")
print(f"Salary Range: [{min(baseDataFrame['SalaryRange'])}, {max(baseDataFrame['SalaryRange'])}] (mathematical domain)")

ID: [31671087, 38566133] (mathematical domain)
Title: ['Casual Stock Replenisher'
 'RETAIL SALES SUPERSTARS and STYLISTS Wanted - Womens Fashion - SUNSHINE PLAZA'
 'Team member - Belrose' ... 'Web Content Writer - June 2019 contract'
 'Brand Director - Global Premium Brand' 'Head of Operations - EOI']
Date: 2018-10-01 00:00:00 to 2019-03-13 00:00:00 (date range)
Location: ['Sydney' 'Richmond & Hawkesbury' 'Brisbane' 'Gosford & Central Coast'
 'Adelaide' 'Melbourne' 'Perth' 'Mackay & Coalfields' 'Sunshine Coast'
 'Gold Coast' 'West Gippsland & Latrobe Valley'
 'Hervey Bay & Fraser Coast' 'South West Coast VIC'
 'Mornington Peninsula & Bass Coast' 'Port Hedland, Karratha & Pilbara'
 'Ballarat & Central Highlands' 'Bendigo, Goldfields & Macedon Ranges'
 'Yarra Valley & High Country' 'Coffs Harbour & North Coast'
 'Newcastle, Maitland & Hunter' 'Tumut, Snowy & Monaro' 'Darwin'
 'Toowoomba & Darling Downs' 'ACT' 'Wollongong, Illawarra & South Coast'
 'Port Macquarie & Mid North Coast' 'Sout

KeyError: 'AverageSalary'

In [None]:
for i in baseDataFrame:
    print(i)
    print(len(baseDataFrame[i].unique()))

In [None]:
#Set any of lowest salary that equals 0 to nan
#baseDataFrame['LowestSalary'] = baseDataFrame['LowestSalary'].replace([0],'NaN')
#baseDataFrame.head(10)

In [None]:
#Remove all rows that contain null data (65.2% of the data contains null values)
#baseDataFrame = baseDataFrame.dropna()

In [None]:
#print(baseDataFrame.isna().sum())

In [None]:
#Analysis range of salary
#salaryDataFrame = baseDataFrame.dropna(subset=['LowestSalary'])
#pd.to_numeric(salaryDataFrame['LowestSalary'])
#Show outliers
boxplot = baseDataFrame.boxplot(column=['HighestSalary', 'LowestSalary'])
#Remove outliers in wages
from scipy import stats
baseDataFrame = baseDataFrame[(np.abs(stats.zscore(baseDataFrame['HighestSalary'])) < 3)]
baseDataFrame = baseDataFrame[(np.abs(stats.zscore(baseDataFrame['LowestSalary'])) < 3)]

In [None]:
# Include average salary for use in analysis
baseDataFrame = baseDataFrame.assign(AverageSalary= (baseDataFrame["LowestSalary"] + baseDataFrame["HighestSalary"])/2 )

# Include salaray range for use in analysis
baseDataFrame = baseDataFrame.assign(SalaryRange= (baseDataFrame["HighestSalary"] - baseDataFrame["LowestSalary"]) )

baseDataFrame.head(10)

In [None]:
#Print high level view of data after changes
print("Data after pre-processing completed")
print("The Dataset consists of: ", baseDataFrame.shape[0], " rows and ", baseDataFrame.shape[1], " columuns")
baseDataFrame.head(10)


In [None]:
baseDataFrame.dtypes

In [None]:
baseDataFrame["Company"].value_counts

In [75]:
#Analysis missing cells
emptyFrame = baseDataFrame.loc[0:, ['Location', 'Area', 'Classification', 'SubClassification']]
emptyFrame = emptyFrame.isna()
print(emptyFrame.value_counts())
#Analysis missing cells
emptyFrame = baseDataFrame.loc[0:, ['Location', 'Area']]
emptyFrame = emptyFrame.isna()
print(emptyFrame.value_counts())

Location  Area   Classification  SubClassification
False     False  False           False                192095
                                 True                 111051
dtype: int64
Location  Area 
False     False    303146
dtype: int64


## PART 2 Data Analysis

### Job Meta Data

In [None]:
from sklearn import datasets
import matplotlib.pyplot as plt

In [None]:
#Extract information about the job sectors
#Drop NaN values in classifcation
classDataFrame = baseDataFrame.dropna(subset=['Classification', 'SubClassification'])
print("There is", len(baseDataFrame["Classification"].unique()), "different sectors")
print("These sectors contain", len(baseDataFrame["SubClassification"].unique()), "sub sectors")


count = {}
for x in classDataFrame.groupby(['Classification','SubClassification']).size().reset_index()['Classification']:
    if(x in count):
        count[x] = count[x] + 1
    else:
        count[x] = 1
largestValue = 0
largestSector = "";
for x in count:
    if(count[x] > largestValue):
        largestValue = count[x];
        largestSector = x;
print(largestSector, "has the most subsectors with ", largestValue, "subsectors")

In [None]:
#Extract information about the locations
#Drop empty data
locationDataFrame =  baseDataFrame
print("There is a total of",len(baseDataFrame["Location"].unique()), "locations" )
value_bins = baseDataFrame["Location"].value_counts()
top_10_Locations = value_bins[0:10]
print("The top 10 most common locations are:")
print(top_10_Locations)
print("The top 6 are all capital cities but no darwin")

In [None]:
boxplot = baseDataFrame.boxplot(column=['HighestSalary', 'LowestSalary'])

In [None]:
baseDataFrame.groupby(['HighestSalary']).size()

In [None]:
baseDataFrame.groupby(['LowestSalary']).size()

In [None]:
#Get the total average salalry
print("Overal average wage: ", baseDataFrame['AverageSalary'].mean())

In [None]:
#Analysis the dates of the posting
print("Latest Posting is:", baseDataFrame["Date"].max())
print("Earlisest Posting is:", baseDataFrame["Date"].min())

In [None]:
#Visualise jobs over time
dataFrame = baseDataFrame.dropna(subset=['Date'])
dataFrame = dataFrame.loc[0:, ['Date']]
dataFrame = dataFrame.value_counts().reset_index(name='counts')
dataFrame.set_index('Date').plot()

In [None]:
#Analysis postings by date
dataFrame =  baseDataFrame.dropna(subset=['Date'])
dataFrame['Date'] = dataFrame['Date'].dt.strftime('%d')
dataFrame = dataFrame.loc[0:, ['Date']].sort_values(by=['Date'])
dataFrame['Freq'] = dataFrame.groupby('Date')['Date'].transform('count')
dataFrame = dataFrame.drop_duplicates()
ax = dataFrame.plot.scatter(x='Date', y='Freq')

dates = []
for i in range(1,32):
    dates.insert(len(dates),i)    

d = np.polyfit(dates, dataFrame['Freq'],1)
f = np.poly1d(d)
dataFrame.insert(2,'Treg',f(dates))
dataFrame.plot(x='Date', y='Treg', color='Red', legend=False,ax=ax)

In [None]:
#Analysis the salary over time
dataFrame =  baseDataFrame.dropna(subset=['Date', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['Date','AverageSalary']]
dataFrame.groupby(['Date'])['AverageSalary'].mean().plot()

In [None]:
#Analysis job titles
print("There is", len(baseDataFrame["Title"].unique()), "unique job titles")
print("at", len(baseDataFrame["Company"].unique()), "companies\n")
print("The top 10 job titles are: ")
value_bins = baseDataFrame["Title"].value_counts()
top_10_title = value_bins[0:10]
print(top_10_title, "\n")
print("The top 10 companies with the most jobs are: ")
value_bins = baseDataFrame["Company"].value_counts()
top_10_Company = value_bins[0:10]
print(top_10_Company, "\n")
print("The top 10 sectors are: ")
value_bins = baseDataFrame["Classification"].value_counts()
top_10_Classification = value_bins[0:10]
print(top_10_Classification, "\n")

In [None]:
#Analysis job type
typeDataFrame =  baseDataFrame.dropna(subset=['JobType'])
print("There are", len(typeDataFrame['JobType'].unique()), "job types")
print("These are: ",typeDataFrame['JobType'].unique())
print("The amount of each are:")
print(typeDataFrame['JobType'].value_counts())

value_bins = typeDataFrame['JobType'].value_counts()
value_bins.plot.pie(y='JobType', figsize=(20, 20))

In [None]:
#Analysis the salary over time
dataFrame =  baseDataFrame.dropna(subset=['JobType', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['JobType','AverageSalary']]
dataFrame.groupby(['JobType'])['AverageSalary'].mean()

### Market by Locations

In [None]:
#Find the common sector for each location
dataFrame =  baseDataFrame.dropna(subset=['Location', 'Classification'])
dataFrame =  dataFrame.loc[0:, ['Location','Classification']]
dataFrame = dataFrame.value_counts().reset_index(name='count')

dataFrame = dataFrame.groupby(['Location','Classification']).agg({'count':sum})

g = dataFrame['count'].groupby('Location', group_keys=False) 
largest = g.nlargest(1)
print(largest)


In [None]:
#order locations by amount of jobs
dataFrame =  baseDataFrame.loc[0:, ['Location']]
dataFrame['Counts'] = dataFrame.Location.groupby(dataFrame.Location).transform('count')
dataFrame = dataFrame.drop_duplicates(subset=['Location'])

In [None]:
#Sort locations by average salary
dataFrame =  baseDataFrame.dropna(subset=['Location', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['Location','AverageSalary']]
for index, row in dataFrame.groupby('Location', as_index=False)['AverageSalary'].mean().sort_values(by=['AverageSalary'], ascending=False).iterrows():
    print(row['Location'], "average wage: " ,row['AverageSalary'])

In [None]:
dataFrame =  baseDataFrame.dropna(subset=['Location', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['Location','AverageSalary']]
dataFrame.groupby('Location', as_index=False)['AverageSalary'].mean().plot.bar(x='Location', y='AverageSalary', rot=90, figsize=(20,20))

### Market by Sectors

In [None]:
#Get share of the market by sectorss
dataFrame = baseDataFrame.dropna(subset=['Classification'])
dataFrame = dataFrame.loc[0:, ['Classification']]
value_bins = dataFrame["Classification"].value_counts()
value_bins.plot.pie(y='Classification', figsize=(20, 20))

In [None]:
#Find the Highest paying sector
dataFrame = baseDataFrame.dropna(subset=['Classification', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['Classification','AverageSalary']]
for index, row in dataFrame.groupby('Classification', as_index=False)['AverageSalary'].mean().sort_values(by=['AverageSalary'], ascending=False).iterrows():
    print(row['Classification'], "average wage: " ,row['AverageSalary'])

In [None]:
#Compare average salary for each sector
dataFrame.groupby('Classification', as_index=False)['AverageSalary'].mean().plot.bar(x='Classification', y='AverageSalary', rot=90, figsize=(20,20))

In [None]:
#Find the highest paying subclassification in each classification
dataFrame =  baseDataFrame.dropna(subset=['Classification', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['Classification','SubClassification','AverageSalary']]

dataFrame = dataFrame.groupby(['Classification','SubClassification']).mean()

g = dataFrame['AverageSalary'].groupby('Classification', group_keys=False) 
largest = g.nlargest(1)
print(largest)

In [None]:
#Find the Highest paying subsector
dataFrame = baseDataFrame.dropna(subset=['SubClassification', 'AverageSalary'])
dataFrame =  dataFrame.loc[0:, ['SubClassification','AverageSalary']]
for index, row in dataFrame.groupby('SubClassification', as_index=False)['AverageSalary'].mean().sort_values(by=['AverageSalary'], ascending=False).iterrows():
    print(row['SubClassification'], "average wage: " ,row['AverageSalary'])

### Interactive Web Page

In [68]:
from bokeh.plotting import figure, output_file, show
from bokeh.plotting import output_notebook
from bokeh.palettes import Category20c
from bokeh.transform import cumsum
from bokeh.models import ColumnDataSource, CustomJS, Slider
from bokeh.models.widgets import Select
from bokeh.models.widgets import Select
from bokeh.layouts import column, row
from bokeh.io import curdoc
from bokeh.models.callbacks import CustomJS
from bokeh.models.ranges import FactorRange

import yaml
output_notebook()

In [13]:
#Interactive graph of job types
#Analysis job type
typeDataFrame =  baseDataFrame.dropna(subset=['JobType'])
value_bins = typeDataFrame['JobType'].value_counts()

data = pd.Series(value_bins).reset_index(name='value').rename(columns={'index':'JobType'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Category20c[len(value_bins)]

p = figure(plot_height=350, title="Total Job Types", toolbar_location=None,
           tools="hover", tooltips="@JobType: @value", x_range=(-0.5, 1.0))

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend_field='JobType', source=data)

p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None

show(p)

In [69]:
d1 = {'time': [1,2,3,4], 'y': [2,1,1,8]}
d2 = {'time': [1,2,3,4,5], 'y': [2,1,1,8,22]}


source = ColumnDataSource(d1 )

print(source.data)


p = figure()
r = p.vbar(x='time', top='y', width=1,
         source = source)

    
callback = CustomJS(args=dict(source=source, d1=d1,d2=d2), code="""
   if (cb_obj.value=="d1"){
        source.data = d1 
   }
   if (cb_obj.value=="d2"){
        source.data = d2
   }
   source.change.emit();
""")

select = Select(title="monthly csv-s",  options=['d1', 'd2'])
select.js_on_change('value', callback)

layout = column(row(select, width=400), p)
curdoc().add_root(layout)

show(layout)

{'time': [1, 2, 3, 4], 'y': [2, 1, 1, 8]}


In [95]:
locationFrame = baseDataFrame.loc[0:, ['Location']]
locationFrame = locationFrame.value_counts().reset_index(name='Count')
locationFrame = locationFrame.rename(columns={"Location": "Type"}) 
locationFrame = locationFrame.head(10)


sectorsFrame = baseDataFrame.loc[0:, ['Classification']]
sectorsFrame = sectorsFrame.value_counts().reset_index(name='Count')
sectorsFrame = sectorsFrame.rename(columns={"Classification": "Type"}) 
sectorsFrame = sectorsFrame.head(10)

locationFrameDict = {}
typeArr = []
countArr = []
for index, rowV in locationFrame.iterrows():
    typeArr.insert(0, rowV['Type'])
    countArr.insert(0, rowV['Count'])
    
locationFrameDict['Type'] = typeArr
locationFrameDict['Count'] = countArr


sectorsFrameDict = {}
typeArr = []
countArr = []
for index, rowV in sectorsFrame.iterrows():
    typeArr.insert(0, rowV['Type'])
    countArr.insert(0, rowV['Count'])
    
sectorsFrameDict['Type'] = typeArr
sectorsFrameDict['Count'] = countArr
    

source = ColumnDataSource(locationFrameDict)
p = figure(x_range=locationFrameDict['Type'])
p.xaxis.major_label_orientation = pi/2


r = p.vbar(x='Type', top='Count', width=1,
         source = source)

    
callback = CustomJS(args=dict(p=p,source=source, locationFrameDict=locationFrameDict,sectorsFrameDict=sectorsFrameDict), code="""
   if (cb_obj.value=="Location"){
        source.data = locationFrameDict
        p.x_range.factors = locationFrameDict['Type'];
   }
   if (cb_obj.value=="Sector"){
        source.data = sectorsFrameDict
        p.x_range.factors = sectorsFrameDict['Type'];
   }
""")


select = Select(title="Total number of jobs",  options=['Location', 'Sector'])
select.js_on_change('value', callback)

layout = column(row(select, width=400), p)
curdoc().add_root(layout)

show(layout)

In [76]:
item = pickle.load('SkillsbyClassification/skills_Accounting.lz4')




df = pd.DataFrame(data=item[1])

source = ColumnDataSource(locationFrameDict)
p = figure(x_range=locationFrameDict['Names'])


r = p.vbar(x='Names', top='Number', width=1,
         source = source)

    
callback = CustomJS(args=dict(source=source, locationFrameDict=locationFrameDict,sectorsFrameDict=sectorsFrameDict), code="""
   if (cb_obj.value=="Location"){
        source.data = locationFrameDict
   }
   if (cb_obj.value=="Sector"){
        source.data = sectorsFrameDict
   }
   source.change.emit();
""")

select = Select(title="Total number of jobs",  options=['Location', 'Sector'])
select.js_on_change('value', callback)

layout = column(row(select, width=400), p)
curdoc().add_root(layout)

show(layout)

KeyError: 'Names'