In [None]:
import pandas as pd

In [None]:
# Load the aggregated survey results; the cells below clean this frame in place.
data = pd.read_excel(io="../private_data/data/survey_results_aggregated/data.xlsx")

In [None]:
#
# Normalise the country columns: trim surrounding whitespace, then map known
# free-text variants onto one canonical label per country.
#
country_dict = {
  # spelling / formatting variants of a single country
  "España, Presencial en Sevilla": "Spain",
  "españa": "Spain",
  "Paris,France": "France",
  "germany": "Germany",
  "United Kingdom": "UK",
  "The Netherlands": "Netherlands",
  "Netherlands, but also applying to jobs in the UK and Europe": "Netherlands",
  # regional or multi-country answers are bucketed together
  "Emea": "Other",
  "UK, Ireland, France, Spain": "Other",
  "Switzerland and France": "Other",
  "Europe": "Other",
}

# Both columns get exactly the same treatment, so handle them in one loop.
for country_column in ("job_country", "search_country"):
  data[country_column] = data[country_column].str.strip().replace(country_dict)

In [None]:
#
# convert cohort to string and remove all but year and month
#
def clean_cohort(string):
  """Return the first seven characters of the text form of ``string``.

  The value is converted with ``str()`` first, so any input type is accepted.
  For a value whose text form starts with ``YYYY-MM`` the result is just the
  year and month.
  """
  as_text = str(string)
  return as_text[:7]

# Truncate every cohort value to its first seven characters via clean_cohort.
data["cohort_start_month"] = data["cohort_start_month"].apply(func=clean_cohort)

In [None]:
#
# add simple boolean if students found a developer job
#
def add_dev_job_situation(response):
  """Tell whether a survey answer describes having found developer work.

  Returns True when ``response`` equals one of the three recognised answers
  (full-time, part-time or freelance developer), and False for anything else.
  """
  developer_answers = (
    "I found a Full-Time job as a developer (or a similar role)",
    "I found a Part-Time job as a developer (or a similar role)",
    "I found work as a Freelance developer, with a relevant source of income",
  )
  return response in developer_answers
    

# New boolean column: True where the post-bootcamp situation is a developer job.
data["has_dev_job"] = data["post_bootcamp_situation"].apply(func=add_dev_job_situation)

In [None]:
#
# Recode the Yes/No answer columns as booleans
#
update_bool_dict = {"Yes": True, "No": False}

# Values other than "Yes"/"No" are left as they are by replace().
for yes_no_column in ("did_bootcamp_helped", "did_create_portfolio"):
  data[yes_no_column] = data[yes_no_column].replace(update_bool_dict)


In [None]:
#
# Relabel the answers about how many new projects were created
#

# Survey wording -> short label.
new_project_qty_dict = {
  "No, I haven't": "No Projects",
  "Yes, I created 1": "1 Project",
  "Yes, I created 2 to 3": "2 to 3 Projects",
  "Yes, I created more than 3": "More than 3 Projects",
}

data["new_projects_qty"] = data["new_projects_qty"].replace(
  to_replace=new_project_qty_dict
)

In [None]:
#
# Relabel the "no interviews" answer so it reads as a company count
#

data["company_interview_qty"] = data["company_interview_qty"].replace(
  to_replace="I didn't have interviews or just very short ones",
  value="0 companies",
)

In [None]:
#
# Estimate the total weekly hours (total_hours = job_search_week_hours + coding_week_hours)
#

# One representative number of hours per answer bucket.
map_hours_to_number = {
    "Less than 5": 2.5,
    "Between 5 and 10": 7.5,
    "Between 10 and 20": 15,
    "Between 20 and 30": 25,
    "More than 30": 35,
}


def compute_total_hours(row):
    """Add up the estimated weekly hours spent job searching and coding.

    ``row`` must support item access for "job_search_week_hours" and
    "coding_week_hours"; each answer is looked up in ``map_hours_to_number``.

    Raises:
        KeyError: if either answer is not one of the known buckets.
    """
    return sum(
        map_hours_to_number[row[hours_column]]
        for hours_column in ("job_search_week_hours", "coding_week_hours")
    )


# Place total_hours immediately after coding_week_hours.
# Skipped when the column already exists (e.g. the cell is run a second time).
if "total_hours" not in data:
    col_to_insert_after = "coding_week_hours"
    idx = data.columns.get_loc(col_to_insert_after) + 1
    data.insert(
        loc=idx,
        column="total_hours",
        value=data.apply(compute_total_hours, axis="columns"),
    )


In [None]:
#
# compute workload_balance_diff (which of job_search_week_hours and coding_week_hours got more time)
#
# Note: the two answers are compared by bucket position (rather than estimated hours),
# and only the direction of the difference is kept:
#   -1 == more hours coding, 0 == same bucket, 1 == more hours job searching
#

# Ordinal position of each answer bucket, from fewest to most hours.
map_hours_to_index = {
    'Less than 5': 0,
    'Between 5 and 10': 1,
    'Between 10 and 20': 2,
    'Between 20 and 30': 3,
    'More than 30': 4
}

def compute_diff(row):
    """Return the direction of the job-search vs. coding workload balance.

    ``row`` must support item access for "job_search_week_hours" and
    "coding_week_hours"; each answer is looked up in ``map_hours_to_index``.

    Returns:
        1 if the job-search bucket is higher than the coding bucket,
        -1 if it is lower, and 0 if both answers fall in the same bucket.

    Raises:
        KeyError: if either answer is not one of the known buckets.
    """
    hours_searching_index = map_hours_to_index[row["job_search_week_hours"]]
    hours_coding_index = map_hours_to_index[row["coding_week_hours"]]
    diff = hours_searching_index - hours_coding_index # negative == more hours coding
    # The three cases are exhaustive for integers, so no fall-through return is needed.
    if diff > 0:
        return 1
    if diff < 0:
        return -1
    return 0


# Place workload_balance_diff immediately after total_hours.
# Skipped when the column already exists (e.g. the cell is run a second time).
if "workload_balance_diff" not in data:
    col_to_insert_after = "total_hours"
    idx = data.columns.get_loc(col_to_insert_after) + 1
    data.insert(
        loc=idx,
        column="workload_balance_diff",
        value=data.apply(compute_diff, axis="columns"),
    )


In [None]:

# Write the cleaned survey data to an Excel workbook.
data.to_excel(excel_writer="../private_data/data/cleaned_data/cleaned_data.xlsx")