# Working on job industry data for a project

[Citrics](https://b.citrics.dev/) is a project that helps people make decisions before moving to a new city by providing them with valuable information on different cities. One of the core features of the project is the ability to get information about the top job industries of different cities and compare them. This notebook shows how the data was cleaned and wrangled, and how new features were created, so that it can be used for getting job industry insights.

The data were collected from the [Bureau of Labor Statistics (BLS)](https://www.bls.gov/oes/).

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The first dataset is the 2019 job industry file from BLS.
# The second dataset contains unique identifiers
# for the 100 cities we worked on.
#
# low_memory=False makes pandas infer every column's dtype from the whole
# file instead of chunk by chunk. The truncated warning saved with this
# cell's output looks like pandas' mixed-type DtypeWarning, which this
# option is documented to address.

df = pd.read_csv("/content/us_job_industry_data_2019.csv", low_memory=False)
df_cities = pd.read_csv("/content/100city_state_data.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Preview the first five rows of the raw BLS data.
df.head(n=5)

Unnamed: 0,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,tot_emp,emp_prse,jobs_1000_orig,loc_quotient,pct_total,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,99,U.S.,1,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,8054120,0.2,,,,58.88,122480,0.1,24.03,34.35,50.8,74.16,#,49990,71460,105660,154260,#,,
1,99,U.S.,1,0,Cross-industry,cross-industry,1235,13-0000,Business and Financial Operations Occupations,major,8183750,0.2,,,,37.56,78130,0.2,18.76,25.06,33.57,45.61,60.60,39020,52130,69820,94870,126040,,
2,99,U.S.,1,0,Cross-industry,cross-industry,1235,15-0000,Computer and Mathematical Occupations,major,4552880,0.4,,,,45.08,93760,0.5,21.79,30.22,42.47,57.47,73.08,45320,62850,88340,119550,152010,,
3,99,U.S.,1,0,Cross-industry,cross-industry,1235,17-0000,Architecture and Engineering Occupations,major,2592680,0.5,,,,42.69,88800,0.3,21.77,29.28,39.15,52.87,68.56,45280,60910,81440,109970,142610,,
4,99,U.S.,1,0,Cross-industry,cross-industry,1235,19-0000,"Life, Physical, and Social Science Occupations",major,1288920,0.7,,,,37.28,77540,0.4,17.62,23.73,32.77,46.24,61.59,36640,49360,68160,96180,128100,,


In [None]:
# Rewrite four area titles so that each one also lists extra city names
# (presumably so those cities appear once the titles are split into cities).
# Every cell of the frame equal to a key is replaced by the mapped value.
# NOTE(review): confirm each added city belongs to the metro area it is
# attached to (e.g. Irving and Plano on the Killeen-Temple title).

df = df.replace(
    {
        'Phoenix-Mesa-Scottsdale, AZ': 'Phoenix-Mesa-Scottsdale-Chandler-Gilbert, AZ',
        'Los Angeles-Long Beach-Anaheim, CA': 'Los Angeles-Long Beach-Anaheim-Glendale-Santa Ana, CA',
        'Killeen-Temple, TX': 'Killeen-Temple-Irving-Plano, TX',
        'Virginia Beach-Norfolk-Newport News, VA-NC': 'Virginia Beach-Norfolk-Newport News-Chesapeake, VA-NC',
    }
)

In [None]:
# (rows, columns) of the job data and of the city list, before filtering by area type.
df.shape, df_cities.shape

((395647, 30), (100, 4))

In [None]:
# Keep only the rows whose area_type code is 4, which this notebook
# treats as city-level areas.

df = df.loc[df["area_type"].eq(4)]

In [None]:
# (rows, columns) again, now that only area_type 4 rows remain in df.
df.shape, df_cities.shape

((142410, 30), (100, 4))

In [None]:
# A function to create city names from area title

def create_cities(areaname):
    """Split an area title into the list of city names it contains.

    The text before the first comma is taken as the city part; a double
    hyphen in it is treated like a single hyphen, and the result is split
    on hyphens. Each name is stripped of surrounding whitespace and empty
    fragments are dropped.

    Titles without a comma are treated as consisting only of a city part,
    and anything after the first comma is ignored, so unusual titles no
    longer raise ``ValueError``.

    Note that a hyphenated city name is split as well, e.g.
    ``"Winston-Salem, NC"`` yields ``["Winston", "Salem"]``.

    Parameters
    ----------
    areaname : str
        Area title such as ``"Phoenix-Mesa-Scottsdale, AZ"``.

    Returns
    -------
    list of str
        City names in the order they appear in the title.
    """
    # partition never raises: it yields the whole string when no comma exists.
    cities_part = areaname.partition(",")[0]
    names = (name.strip() for name in cities_part.replace("--", "-").split("-"))
    return [name for name in names if name]

In [None]:
# Build a 'city' column holding, for every row, the list of city names
# parsed from its area title, then preview the first rows.

df["city"] = df["area_title"].map(create_cities)
df.head(n=5)

Unnamed: 0,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,tot_emp,emp_prse,jobs_1000_orig,loc_quotient,pct_total,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly,city
37227,10180,"Abilene, TX",4,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,2670,5.8,40.137,0.73,,43.51,90500,2.2,17.86,27.22,37.51,51.88,75.57,37140,56610,78030,107910,157190,,,[Abilene]
37228,10180,"Abilene, TX",4,0,Cross-industry,cross-industry,1235,13-0000,Business and Financial Operations Occupations,major,2280,6.3,34.214,0.61,,32.46,67520,2.9,16.39,22.11,29.47,38.38,48.95,34090,46000,61300,79840,101820,,,[Abilene]
37229,10180,"Abilene, TX",4,0,Cross-industry,cross-industry,1235,15-0000,Computer and Mathematical Occupations,major,630,8.5,9.494,0.31,,30.27,62970,3.1,16.59,21.44,27.31,36.98,49.45,34510,44600,56810,76910,102860,,,[Abilene]
37230,10180,"Abilene, TX",4,0,Cross-industry,cross-industry,1235,17-0000,Architecture and Engineering Occupations,major,770,16.0,11.6,0.66,,34.83,72440,4.6,16.92,23.82,32.11,43.51,56.79,35200,49540,66790,90500,118110,,,[Abilene]
37231,10180,"Abilene, TX",4,0,Cross-industry,cross-industry,1235,19-0000,"Life, Physical, and Social Science Occupations",major,330,11.5,5.033,0.57,,28.06,58360,3.1,16.97,20.27,25.94,33.07,41.27,35300,42170,53960,68790,85850,,,[Abilene]


In [None]:
# Turn each list in 'city' into one row per city; the other columns
# (and the index label) are repeated for every city of the same area.
df_explode = df.explode(column="city")


In [None]:
# (rows, columns) after exploding the city lists into one row per city.
df_explode.shape

(241969, 31)

In [None]:
# Column labels of the exploded frame.
df_explode.columns

Index(['area', 'area_title', 'area_type', 'naics', 'naics_title', 'i_group',
       'own_code', 'occ_code', 'occ_title', 'o_group', 'tot_emp', 'emp_prse',
       'jobs_1000_orig', 'loc_quotient', 'pct_total', 'h_mean', 'a_mean',
       'mean_prse', 'h_pct10', 'h_pct25', 'h_median', 'h_pct75', 'h_pct90',
       'a_pct10', 'a_pct25', 'a_median', 'a_pct75', 'a_pct90', 'annual',
       'hourly', 'city'],
      dtype='object')

In [None]:
# Compare the original area title with the city extracted from it.
df_explode.loc[:, ["area_title", "city"]].head(n=5)

Unnamed: 0,area_title,city
37227,"Abilene, TX",Abilene
37228,"Abilene, TX",Abilene
37229,"Abilene, TX",Abilene
37230,"Abilene, TX",Abilene
37231,"Abilene, TX",Abilene


In [None]:
# City names tracked as "missing" -- presumably cities from the 100-city
# list with no exact match among the parsed city names; verify.
missing = [
    "Chandler",
    "Gilbert",
    "Glendale",
    "Santa Ana",
    "Honolulu",
    "Boise",
    "Louisville",
    "Winston Salem",
    "Irving",
    "Plano",
    "Chesapeake",
]
sorted(missing)

['Boise',
 'Chandler',
 'Chesapeake',
 'Gilbert',
 'Glendale',
 'Honolulu',
 'Irving',
 'Louisville',
 'Plano',
 'Santa Ana',
 'Winston Salem']

In [None]:
# Number of distinct values in 'city' (a missing value counts as one).
df_explode["city"].nunique(dropna=False)

564

In [None]:
# Number of distinct values in 'area_title' (a missing value counts as one).
df_explode["area_title"].nunique(dropna=False)

396

In [None]:
# Normalise four parsed names to another spelling (presumably the one used
# in the city list). Every cell of the frame equal to a key is replaced by
# the mapped value.

df_explode = df_explode.replace(
    {
        'Boise City': 'Boise',
        'Louisville/Jefferson County': 'Louisville',
        'Urban Honolulu': 'Honolulu',
        'Winston': 'Winston Salem',
    }
)

In [None]:
# Merging with the dataframe that holds the unique city identifiers.
#
# The join uses city AND state. City names alone are not unique across
# metro areas (several areas in different states can share a city name),
# so a city-only join attaches rows from unrelated areas to a city.
#
# The state codes are taken from the text after the comma in area_title
# (e.g. "VA-NC" -> ["VA", "NC"]); each job row is repeated once per state
# so that it can match a city in any state its area spans.
# NOTE(review): assumes df_cities["state"] holds the same abbreviations as
# the area titles (e.g. "AK") -- confirm against the city file.
#
# The join is still a left join: every city from df_cities is kept, and a
# city with no area in its own state shows up with missing job columns.

area_states = (
    df_explode["area_title"].str.partition(",")[2].str.strip().str.split("-")
)
jobs_by_city_state = (
    df_explode.reset_index(drop=True)
    # to_numpy() assigns by position, side-stepping alignment on the
    # duplicated index labels that explode() leaves behind.
    .assign(state=area_states.to_numpy())
    .explode("state")
)
merged = df_cities.merge(jobs_by_city_state, on=["city", "state"], how="left")

In [None]:
# Preview the first five rows of the merged data.
merged.head(n=5)

Unnamed: 0,city_id,city,state,city_state,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,tot_emp,emp_prse,jobs_1000_orig,loc_quotient,pct_total,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,0,Anchorage,AK,"Anchorage, AK",11260,"Anchorage, AK",4,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,11830,3.1,67.816,1.24,,53.69,111680,1.1,25.9,35.17,48.07,63.2,87.82,53860,73150,99970,131450,182670,,
1,0,Anchorage,AK,"Anchorage, AK",11260,"Anchorage, AK",4,0,Cross-industry,cross-industry,1235,13-0000,Business and Financial Operations Occupations,major,7920,2.9,45.376,0.81,,35.98,74840,4.4,15.08,25.79,34.26,44.87,55.86,31360,53650,71270,93320,116180,,
2,0,Anchorage,AK,"Anchorage, AK",11260,"Anchorage, AK",4,0,Cross-industry,cross-industry,1235,15-0000,Computer and Mathematical Occupations,major,3280,8.9,18.769,0.61,,39.3,81740,2.2,20.86,27.21,37.11,48.15,60.2,43380,56600,77180,100140,125210,,
3,0,Anchorage,AK,"Anchorage, AK",11260,"Anchorage, AK",4,0,Cross-industry,cross-industry,1235,17-0000,Architecture and Engineering Occupations,major,4440,6.4,25.447,1.44,,49.13,102180,2.7,26.3,35.33,46.06,59.47,75.06,54710,73480,95790,123700,156110,,
4,0,Anchorage,AK,"Anchorage, AK",11260,"Anchorage, AK",4,0,Cross-industry,cross-industry,1235,19-0000,"Life, Physical, and Social Science Occupations",major,2930,6.7,16.783,1.91,,38.72,80540,3.0,21.06,27.87,36.41,47.04,58.97,43810,57970,75720,97830,122660,,


In [None]:
# (rows, columns) of the merged result.
merged.shape

(62282, 34)

In [None]:
# Count the missing values in every column of the merged data.
merged.isna().sum()

city_id               0
city                  0
state                 0
city_state            0
area                  0
area_title            0
area_type             0
naics                 0
naics_title           0
i_group               0
own_code              0
occ_code              0
occ_title             0
o_group               0
tot_emp               0
emp_prse              0
jobs_1000_orig        0
loc_quotient       4888
pct_total         62282
h_mean                0
a_mean                0
mean_prse             0
h_pct10               0
h_pct25               0
h_median              0
h_pct75               0
h_pct90               0
a_pct10               0
a_pct25               0
a_median              0
a_pct75               0
a_pct90               0
annual            58201
hourly            62056
dtype: int64

In [None]:
# Number of distinct cities present after the merge (a missing value counts as one).
merged["city"].nunique(dropna=False)

100

In [None]:
# Save the merged data as a CSV file in the working directory
# (the index is written as the first column).

merged.to_csv(path_or_buf='test1.csv')

To see how the endpoints were created using this data, click [here](https://colab.research.google.com/drive/1IJD9aRTVFZiIvYKZztM2xzUEbmkM3Y6E?usp=sharing).