In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [2]:
df = pd.read_csv("/content/us_job_industry_data_2019.csv")


Columns (3,12,14) have mixed types.Specify dtype option on import or set low_memory=False.



In [3]:
# TO DO: MAP CITIES TO MASTER LIST

In [4]:
def wrangle(X):
  """
  Wrangles and cleans dataframe
  """

 
  # Creating 2 copies to handle numeric and non-numeric data
  numeric = X.copy()
  non_numeric = X.copy()

  # Filtering dataframe to retain relevant numeric columns
  numeric = numeric.filter(["tot_emp", "jobs_1000_orig", "loc_quotient", "h_mean",	
                "a_mean",	"h_pct25",	"h_median",	"h_pct75",	"h_pct90",	
                "a_pct25",	"a_median",	"a_pct75", "a_pct90"], axis=1)

  # Renaming columns
  numeric = numeric.rename(columns={"tot_emp":"Total Employed", 
                                    "jobs_1000_orig":"Jobs per 1000", 
                                    "loc_quotient":"Job Dilution", 
                                    "h_mean":"Hourly Wage Mean",	
                                    "a_mean":"Annual Wage Mean",	
                                    "h_pct25":"Hourly Wage (25th Percentile)",	
                                    "h_median":"Hourly Wage (Median)",	
                                    "h_pct75":"Hourly Wage (75th Percentile)",	
                                    "h_pct90":"Hourly Wage (90th Percentile)",	
                                    "a_pct25":"Annual Wage (25th Percentile)",	
                                    "a_median":"Annual Wage (Median)",	
                                    "a_pct75":"Annual Wage (75th Percentile)",	
                                    "a_pct90":"Annual Wage (90th Percentile)"})

  # Replacing NaN values with 0s
  numeric = numeric.replace(np.nan, 0)
  numeric = numeric.replace("*", 0)
  numeric = numeric.replace("**", 0)
  numeric["Hourly Wage Mean"] = numeric["Hourly Wage Mean"].replace("#", 100)
  numeric["Annual Wage Mean"] = numeric["Annual Wage Mean"].replace("#", 208000)
  numeric["Hourly Wage (25th Percentile)"] = numeric["Hourly Wage (25th Percentile)"].replace("#", 100)
  numeric["Hourly Wage (Median)"] = numeric["Hourly Wage (Median)"].replace("#", 100)
  numeric["Hourly Wage (75th Percentile)"] = numeric["Hourly Wage (75th Percentile)"].replace("#", 100)
  numeric["Hourly Wage (90th Percentile)"] = numeric["Hourly Wage (90th Percentile)"].replace("#", 100)
  numeric["Annual Wage (25th Percentile)"] = numeric["Annual Wage (25th Percentile)"].replace("#", 208000)
  numeric["Annual Wage (Median)"] = numeric["Annual Wage (Median)"].replace("#", 208000)
  numeric["Annual Wage (75th Percentile)"] = numeric["Annual Wage (75th Percentile)"].replace("#", 208000)
  numeric["Annual Wage (90th Percentile)"] = numeric["Annual Wage (90th Percentile)"].replace("#", 208000)
  numeric = numeric.replace(",", "", regex=True)

  # Converting data to numbers for easy visualization
  numeric = numeric.astype(float)

  # Creating job sector percentage column
  numeric["Job Sector Percentage"] = numeric["Jobs per 1000"] / 10


  # Handling non-numeric data
  non_numeric[["area_title", "area_state"]] = non_numeric["area_title"].str.split(",", expand=True)
  non_numeric = non_numeric.filter(["area_title", "area_state", "occ_title"])

  non_numeric = non_numeric.rename(columns={"area_title":"City", "area_state":"State", "occ_title":"Job Sector"})

  # Resetting indices to concatenate
  numeric.reset_index(drop=True, inplace=True)
  non_numeric.reset_index(drop=True, inplace=True)

  return pd.concat([non_numeric, numeric], axis=1)

In [5]:
# Condensing df to only include statistics at city level

df_wrangle = df[ df["area_type"] == 4]

In [6]:
df_wrangle = wrangle(df_wrangle)
df_wrangle.head()

Unnamed: 0,City,State,Job Sector,Total Employed,Jobs per 1000,Job Dilution,Hourly Wage Mean,Annual Wage Mean,Hourly Wage (25th Percentile),Hourly Wage (Median),Hourly Wage (75th Percentile),Hourly Wage (90th Percentile),Annual Wage (25th Percentile),Annual Wage (Median),Annual Wage (75th Percentile),Annual Wage (90th Percentile),Job Sector Percentage
0,Abilene,TX,Management Occupations,2670.0,40.137,0.73,43.51,90500.0,27.22,37.51,51.88,75.57,56610.0,78030.0,107910.0,157190.0,4.0137
1,Abilene,TX,Business and Financial Operations Occupations,2280.0,34.214,0.61,32.46,67520.0,22.11,29.47,38.38,48.95,46000.0,61300.0,79840.0,101820.0,3.4214
2,Abilene,TX,Computer and Mathematical Occupations,630.0,9.494,0.31,30.27,62970.0,21.44,27.31,36.98,49.45,44600.0,56810.0,76910.0,102860.0,0.9494
3,Abilene,TX,Architecture and Engineering Occupations,770.0,11.6,0.66,34.83,72440.0,23.82,32.11,43.51,56.79,49540.0,66790.0,90500.0,118110.0,1.16
4,Abilene,TX,"Life, Physical, and Social Science Occupations",330.0,5.033,0.57,28.06,58360.0,20.27,25.94,33.07,41.27,42170.0,53960.0,68790.0,85850.0,0.5033


In [7]:
def pie(df, city, n_industries):

  df = wrangle(df)
  df_city_top10 = df[ df["City"] == city].sort_values(by="Job Sector Percentage", ascending=False)[1:n_industries + 1]
  df_city_other = df[ df["City"] == city].sort_values(by="Job Sector Percentage", ascending=False)[n_industries + 1:]


  top_10_labels = df_city_top10["Job Sector"]
  top_10_values = df_city_top10["Job Sector Percentage"]

  df_top10_aggregate = pd.DataFrame({"Job Sector": top_10_labels,
                                     "Job Sector Percentage": top_10_values})
  
  df_city_other = pd.DataFrame({"Job Sector": ["Other"],
                                "Job Sector Percentage": [100 - sum(top_10_values)]})

  df_combined = pd.concat([df_top10_aggregate, df_city_other])

  fig = go.Figure(data=[go.Pie(labels=df_combined["Job Sector"], values=df_combined["Job Sector Percentage"], textinfo="label+percent", hole=.3)])
  fig.update_layout(margin=dict(l=20, r=20, t=20, b=20))

  return fig.show()

In [8]:
df_pie = df[ df["area_type"] == 4]

In [9]:
pie(df_pie, "Danville", 10)