In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../scripts/eda/")))
from render_group_by_column_vs_has_dev_job import render_group_by_column_vs_has_dev_job

In [None]:
# Load the cleaned dataset from its Excel workbook. The path is relative, so
# it is resolved against the notebook's current working directory.
cleaned_data_path = "../private_data/data/cleaned_data/cleaned_data.xlsx"
data = pd.read_excel(cleaned_data_path)

## Filter out students who found a job unrelated to the bootcamp

We'll consider only students that found a dev job AND students that did not find a job. <br>

i.e., for this analysis we're not considering students who found a job that is not related to the bootcamp (at least for now, since we don't have relevant data about them).



In [None]:
#
# IMPORTANT: this analysis uses only students that found a dev job AND
# students that didn't find a job.
#
# (i.e. rows answering "found a job, but not related to the bootcamp" are
# dropped from `data` here)
#

unrelated_job_answer = "I found a job, but is not related to the bootcamp"
is_in_scope = data["post_bootcamp_situation"].ne(unrelated_job_answer)
data = data[is_in_scope]



## EDA

In [None]:
#
# Claim explored here: students that spend more time applying for jobs have
# higher chances of getting a job.
#

# Each markdown snippet is displayed as its own output, in order.
for markdown_source in ("<br>", "### Hours searching"):
    display(Markdown(markdown_source))

data_to_analyze = data
# Project helper imported from ../scripts/eda/; called with the frame and the
# column to group by.
render_group_by_column_vs_has_dev_job(data_to_analyze, "job_search_week_hours")


In [None]:
#
# Claim explored here: students that spend more time coding have higher
# chances of getting a job.
#

# all students
# Each markdown snippet is displayed as its own output, in order.
for markdown_source in ("<br>", "### Hours coding (all students)"):
    display(Markdown(markdown_source))

data_to_analyze = data
# Project helper imported from ../scripts/eda/; called with the frame and the
# column to group by.
render_group_by_column_vs_has_dev_job(data_to_analyze, "coding_week_hours")



# only recent students (variant currently disabled)
# display(Markdown("<br>"))
# display(Markdown("### Hours coding (only recent students)"))
# students_2023_and_2024 = data[data["cohort_start_month"].str.startswith(("2023", "2024"))]
# render_group_by_column_vs_has_dev_job(students_2023_and_2024, "coding_week_hours")

In [None]:
#
# Claim explored here: students that spend more time in total
# (searching + coding) have higher chances of getting a job.
#

# all students
# Each markdown snippet is displayed as its own output, in order.
for markdown_source in ("<br>", "### Total hours (searching + coding)"):
    display(Markdown(markdown_source))

data_to_analyze = data
# Project helper imported from ../scripts/eda/; called with the frame and the
# column to group by.
render_group_by_column_vs_has_dev_job(data_to_analyze, "total_hours")


# only recent students (variant currently disabled)
# display(Markdown("<br>"))
# display(Markdown("### Total hours (searching + coding) (only recent students)"))
# students_2023_and_2024 = data[data["cohort_start_month"].str.startswith(("2023", "2024"))]
# render_group_by_column_vs_has_dev_job(students_2023_and_2024, "total_hours")


In [None]:
#
# Claim explored here: students that keep a good balance between hours
# searching & hours coding have higher chances of getting a job.
#

# all students
# Heading plus the legend explaining the sign of "workload_balance_diff";
# each entry is displayed as its own markdown output, in order.
workload_balance_notes = (
    "<br>",
    "### Workload balance",
    "- zero == similar workload (ie. spent a similar amount of time searching than coding)",
    "- negative == more hours coding",
    "- positive == more hours searching",
)
for markdown_source in workload_balance_notes:
    display(Markdown(markdown_source))

data_to_analyze = data
# Project helper imported from ../scripts/eda/; called with the frame and the
# column to group by.
render_group_by_column_vs_has_dev_job(data_to_analyze, "workload_balance_diff")

# # only students with a minimum workload (variant currently disabled)
# display(Markdown("<br>"))
# display(Markdown("### Workload balance (only students with a minimum workload)"))
# students_with_min_workload = data[data["total_hours"] >= 15]
# render_group_by_column_vs_has_dev_job(students_with_min_workload, "workload_balance_diff")


# # only recent students (variant currently disabled)
# display(Markdown("<br>"))
# display(Markdown("### Workload balance (only recent students)"))
# students_2023_and_2024 = data[data["cohort_start_month"].str.startswith(("2023", "2024"))]
# render_group_by_column_vs_has_dev_job(students_2023_and_2024, "workload_balance_diff")




In [None]:
#
# Time to job per job search hours.
#
# Among students that found a dev job, count how many fall in each
# (time_to_job, job_search_week_hours) pair and draw the counts as a heatmap.
#
# NOTE: the bare `data.groupby("time_to_job").agg(...)` expression that used to
# open this cell was removed: it was not the cell's last expression, so its
# result was neither displayed nor stored.
#

# `.eq(True)` selects the same rows as the former `== True` comparison.
students_that_found_dev_job = data[data["has_dev_job"].eq(True)]

# Output column name -> answer label looked up in "job_search_week_hours".
hours_buckets = {
    "less_than_5": "Less than 5",
    "between_5_and_10": "Between 5 and 10",
    "between_10_and_20": "Between 10 and 20",
    "between_20_and_30": "Between 20 and 30",
    "more_than_30": "More than 30",
}

# One count column per bucket. `label=label` binds the current label to each
# lambda; a plain closure would only ever see the last label of the loop.
result = students_that_found_dev_job.groupby("time_to_job").agg(
    **{
        column_name: (
            "job_search_week_hours",
            lambda hours, label=label: (hours == label).sum(),
        )
        for column_name, label in hours_buckets.items()
    }
)

# Display order for the heatmap rows (shortest to longest time to job).
categories = [
    "Less than 2 months",
    "3-4 months",
    "5-6 months",
    "More than 6 months",
]

# An ordered categorical index makes sort_index() follow `categories` instead
# of alphabetical order. Index values not listed in `categories` become NaN.
result.index = pd.CategoricalIndex(
    result.index,
    categories=categories,
    ordered=True,
)
result = result.sort_index()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(result, annot=True, fmt="g", cmap="Blues", ax=ax)
ax.set_title("Time to Job per Job Search Hours")
ax.set_ylabel("Time To Job")
ax.set_xlabel("Job Search Hours")
plt.show()

In [None]:
#
# Time to job per coding hours.
#
# Among students that found a dev job, count how many fall in each
# (time_to_job, coding_week_hours) pair and draw the counts as a heatmap.
#
# NOTE: the bare `data.groupby("time_to_job").agg(...)` expression that used to
# open this cell was removed: it was not the cell's last expression, so its
# result was neither displayed nor stored.
#

# `.eq(True)` selects the same rows as the former `== True` comparison.
students_that_found_dev_job = data[data["has_dev_job"].eq(True)]

# Output column name -> answer label looked up in "coding_week_hours".
hours_buckets = {
    "less_than_5": "Less than 5",
    "between_5_and_10": "Between 5 and 10",
    "between_10_and_20": "Between 10 and 20",
    "between_20_and_30": "Between 20 and 30",
    "more_than_30": "More than 30",
}

# One count column per bucket. `label=label` binds the current label to each
# lambda; a plain closure would only ever see the last label of the loop.
result = students_that_found_dev_job.groupby("time_to_job").agg(
    **{
        column_name: (
            "coding_week_hours",
            lambda hours, label=label: (hours == label).sum(),
        )
        for column_name, label in hours_buckets.items()
    }
)

# Display order for the heatmap rows (shortest to longest time to job).
categories = [
    "Less than 2 months",
    "3-4 months",
    "5-6 months",
    "More than 6 months",
]

# An ordered categorical index makes sort_index() follow `categories` instead
# of alphabetical order. Index values not listed in `categories` become NaN.
result.index = pd.CategoricalIndex(
    result.index,
    categories=categories,
    ordered=True,
)
result = result.sort_index()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(result, annot=True, fmt="g", cmap="Blues", ax=ax)
ax.set_title("Time to Job per Coding Hours")
ax.set_ylabel("Time To Job")
ax.set_xlabel("Coding Hours")
plt.show()