## Preliminary conclusions (draft)

- The number of weekly hours invested in job searching doesn't seem to differ between students who found a dev job and those who didn't.

- Across the original hour buckets, the number of weekly hours invested in coding doesn't seem to differ between students who found a dev job and those who didn't.
  - However, when using only two buckets (less than 10 hours and more than 10 hours), we noticed that students who dedicated more than 10 hours to coding are more likely to get a dev job.
- Students in the middle range (10 to 20 hours) for both job-search hours and coding hours tend to get jobs earlier.
  - Other factors might be involved, since students who found a job later are also mostly within those 10-to-20-hour buckets.
- Most students said the bootcamp helped them, so this answer might not be very relevant for the EDA.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown

In [None]:
# Load the cleaned dataset that the cells below analyze.
CLEANED_DATA_PATH = "../private_data/data/cleaned_data/cleaned_data.xlsx"
data = pd.read_excel(CLEANED_DATA_PATH)

## Filter out students who found a job that is not related to the bootcamp

For now, we're considering only students that found a dev job AND students that did not find a job. <br>

i.e., the analysis excludes students who found a job that is not related to the bootcamp (at least for now, since we don't have relevant data about them).



In [None]:
#
# IMPORTANT: this analysis keeps only students that found a dev job AND students that didn't find a job.
#
# (i.e. students whose job is not related to the bootcamp are dropped here)
#

data = data.loc[data["post_bootcamp_situation"].ne("I found a job, but is not related to the bootcamp")]



# Helper functions

In [None]:
def render_group_by_column_vs_has_dev_job(df, column_to_render):
  """Display how many students did / did not get a dev job per value of one column.

  Groups ``df`` by ``column_to_render`` and displays a table with one row per group and the
  columns ``total_students`` (rows in the group), ``has_dev_job_true`` and
  ``has_dev_job_false`` (rows where ``has_dev_job`` equals True / False). Rows are shown in
  the order defined for the column below instead of the default sort order.

  Group values that are not part of the predefined order are kept and listed after the known
  categories (they would otherwise be relabelled as NaN by the categorical conversion).

  Args:
    df: DataFrame containing ``ct_student_id``, ``has_dev_job`` and ``column_to_render``.
    column_to_render: name of the column to group by; must have a category order defined below.

  Returns:
    None. The table is rendered through ``display`` (the IPython built-in available in
    notebook kernels).

  Raises:
    KeyError: if no category order is defined for ``column_to_render``.
  """

  # Define the correct order of the categories
  ordered_categories_dict = {
    "did_create_portfolio": [
      False,
      True,
    ],
    "new_projects_qty": [
      'No Projects',
      '1 Project',
      '2 to 3 Projects',
      'More than 3 Projects'
    ],
    "job_search_week_hours": [
      "Less than 5",
      "Between 5 and 10",
      "Between 10 and 20",
      "Between 20 and 30",
      "More than 30"
    ],
    "coding_week_hours": [
      "Less than 5",
      "Between 5 and 10",
      "Between 10 and 20",
      "Between 20 and 30",
      "More than 30"
    ],
    "workload_balance_diff": list(range(-4, 5)), # values between -4 and 4
    "company_interview_qty": [
      "0 companies",
      "1 company",
      "2 companies",
      "3 companies",
      "4 companies",
      "5 companies",
      "6 companies",
      "7 companies",
      "8 companies",
      "9 companies",
      "10 or more companies",
    ]
  }

  # for total_hours: compute all the possible combinations of workload hours & add them to the dictionary
  hours_searching = [2.5, 7.5, 15, 25, 35]
  hours_coding = [2.5, 7.5, 15, 25, 35]
  hours_all_combinations = sorted({a + b for a in hours_searching for b in hours_coding})
  ordered_categories_dict["total_hours"] = hours_all_combinations

  # Fail early, with a readable message, when the column has no display order defined.
  if column_to_render not in ordered_categories_dict:
    raise KeyError(
      f"No category order defined for column '{column_to_render}'. "
      f"Supported columns: {sorted(ordered_categories_dict)}"
    )

  result = df.groupby(column_to_render).agg(
    total_students=("ct_student_id", "size"),
    has_dev_job_true=("has_dev_job", lambda x: (x == True).sum()),
    has_dev_job_false=("has_dev_job", lambda x: (x == False).sum())
  )

  # Values missing from `categories` are converted to NaN by CategoricalIndex, which would
  # hide which group a row belongs to. Keep any unexpected value, after the known ones.
  categories = list(ordered_categories_dict[column_to_render])
  unexpected_values = [value for value in result.index if value not in categories]
  categories.extend(unexpected_values)

  # Convert index to ordered categorical
  result.index = pd.CategoricalIndex(
    result.index,
    categories=categories,
    ordered=True
  )

  # Sort by the index (which is now an ordered categorical)
  display(result.sort_index())



In [None]:
#
# Hypothesis: students that spend more time applying for jobs have higher chances of getting a job.
#

display(Markdown("<br>"), Markdown("### Hours searching"))
data_to_analyze = data
render_group_by_column_vs_has_dev_job(df=data_to_analyze, column_to_render="job_search_week_hours")


In [None]:
#
# Hypothesis: students that spend more time coding have higher chances of getting a job.
#

# Table 1: every student
display(Markdown("<br>"), Markdown("### Hours coding (all students)"))
data_to_analyze = data
render_group_by_column_vs_has_dev_job(df=data_to_analyze, column_to_render="coding_week_hours")



# Table 2: only students whose cohort_start_month starts with 2023 or 2024
display(Markdown("<br>"), Markdown("### Hours coding (only recent students)"))
students_2023_and_2024 = data.loc[data["cohort_start_month"].str.startswith(("2023", "2024"))]
render_group_by_column_vs_has_dev_job(df=students_2023_and_2024, column_to_render="coding_week_hours")

In [None]:
#
# Hypothesis: students that spend more time in total (searching + coding) have higher chances of getting a job.
#

# Table 1: every student
display(Markdown("<br>"), Markdown("### Total hours (searching + coding)"))
data_to_analyze = data
render_group_by_column_vs_has_dev_job(df=data_to_analyze, column_to_render="total_hours")


# Table 2: only students whose cohort_start_month starts with 2023 or 2024
display(Markdown("<br>"), Markdown("### Total hours (searching + coding) (only recent students)"))
students_2023_and_2024 = data.loc[data["cohort_start_month"].str.startswith(("2023", "2024"))]
render_group_by_column_vs_has_dev_job(df=students_2023_and_2024, column_to_render="total_hours")


In [None]:
#
# Hypothesis: students that keep a good balance between hours searching & hours coding
# have higher chances of getting a job.
#

# Table 1: every student (the legend explains the sign of workload_balance_diff)
display(
  Markdown("<br>"),
  Markdown("### Workload balance"),
  Markdown("- zero == similar workload"),
  Markdown("- negative == more hours coding"),
  Markdown("- positive == more hours searching"),
)
data_to_analyze = data
render_group_by_column_vs_has_dev_job(df=data_to_analyze, column_to_render="workload_balance_diff")

# Table 2: only students with total_hours of at least 15
display(Markdown("<br>"), Markdown("### Workload balance (only students with a minimum workload)"))
students_with_min_workload = data.loc[data["total_hours"].ge(15)]
render_group_by_column_vs_has_dev_job(df=students_with_min_workload, column_to_render="workload_balance_diff")


# Table 3: only students whose cohort_start_month starts with 2023 or 2024
display(Markdown("<br>"), Markdown("### Workload balance (only recent students)"))
students_2023_and_2024 = data.loc[data["cohort_start_month"].str.startswith(("2023", "2024"))]
render_group_by_column_vs_has_dev_job(df=students_2023_and_2024, column_to_render="workload_balance_diff")




In [None]:
#
# Hypothesis: students that create projects have higher chances of getting a job.
# (grouped by the new_projects_qty column, 2023 and 2024 cohorts only)
#
students_2023_and_2024 = data.loc[data["cohort_start_month"].str.startswith(("2023", "2024"))]
render_group_by_column_vs_has_dev_job(df=students_2023_and_2024, column_to_render="new_projects_qty")

In [None]:
#
# Hypothesis: students that create a portfolio have higher chances of getting a job.
#
# NOTE(review): the 2023/2024 subset is computed below, but the table is rendered from the
# full `data` frame. Confirm which population is intended before relying on this table.
#
students_2023_and_2024 = data.loc[data["cohort_start_month"].str.startswith(("2023", "2024"))]
render_group_by_column_vs_has_dev_job(df=data, column_to_render="did_create_portfolio")

In [None]:
#
# Number of company interviews vs. dev-job outcome, one table per cohort start year.
#

# cohorts starting in 2022
display(Markdown("<br>"), Markdown("### students_2022:"))
students_2022 = data.loc[data["cohort_start_month"].str.startswith("2022")]
render_group_by_column_vs_has_dev_job(df=students_2022, column_to_render="company_interview_qty")

# cohorts starting in 2023
display(Markdown("<br>"), Markdown("### students_2023:"))
students_2023 = data.loc[data["cohort_start_month"].str.startswith("2023")]
render_group_by_column_vs_has_dev_job(df=students_2023, column_to_render="company_interview_qty")

# cohorts starting in 2024
display(Markdown("<br>"), Markdown("### students_2024:"))
students_2024 = data.loc[data["cohort_start_month"].str.startswith("2024")]
render_group_by_column_vs_has_dev_job(df=students_2024, column_to_render="company_interview_qty")

In [None]:
# time to job per job search hours
# (only students that found a dev job are counted)
students_that_found_dev_job = data[data["has_dev_job"] == True]

# One row per time_to_job value, one column per weekly job-search-hours bucket;
# each cell is the number of students in that combination.
result = students_that_found_dev_job.groupby("time_to_job").agg(
  less_than_5=("job_search_week_hours", lambda x: (x == "Less than 5").sum()),
  between_5_and_10=("job_search_week_hours", lambda x: (x == "Between 5 and 10").sum()),
  between_10_and_20=("job_search_week_hours", lambda x: (x == "Between 10 and 20").sum()),
  between_20_and_30=("job_search_week_hours", lambda x: (x == "Between 20 and 30").sum()),
  more_than_30=("job_search_week_hours", lambda x: (x == "More than 30").sum())
)

# Display order for the rows.
# NOTE: a time_to_job value that is not listed here is relabelled as NaN by CategoricalIndex.
categories = [
  'Less than 2 months',
  '3-4 months',
  '5-6 months',
  'More than 6 months'
]

result.index = pd.CategoricalIndex(
  result.index,
  categories=categories,
  ordered=True
  )

# Sort by the index (which is now an ordered categorical)
result = result.sort_index()

# Draw on an explicit Axes so the title and labels are attached to this figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(result, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title("Time to Job per Job Search Hours")
ax.set_ylabel("Time To Job")
ax.set_xlabel("Job Search Hours")
plt.show()

In [None]:
# time to job per coding hours
# (only students that found a dev job are counted)
students_that_found_dev_job = data[data["has_dev_job"] == True]

# One row per time_to_job value, one column per weekly coding-hours bucket;
# each cell is the number of students in that combination.
result = students_that_found_dev_job.groupby("time_to_job").agg(
  less_than_5=("coding_week_hours", lambda x: (x == "Less than 5").sum()),
  between_5_and_10=("coding_week_hours", lambda x: (x == "Between 5 and 10").sum()),
  between_10_and_20=("coding_week_hours", lambda x: (x == "Between 10 and 20").sum()),
  between_20_and_30=("coding_week_hours", lambda x: (x == "Between 20 and 30").sum()),
  more_than_30=("coding_week_hours", lambda x: (x == "More than 30").sum())
)

# Display order for the rows.
# NOTE: a time_to_job value that is not listed here is relabelled as NaN by CategoricalIndex.
categories = [
  'Less than 2 months',
  '3-4 months',
  '5-6 months',
  'More than 6 months'
]

result.index = pd.CategoricalIndex(
  result.index,
  categories=categories,
  ordered=True
  )

# Sort by the index (which is now an ordered categorical)
result = result.sort_index()

# Draw on an explicit Axes so the title and labels are attached to this figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(result, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_title("Time to Job per Coding Hours")
ax.set_ylabel("Time To Job")
ax.set_xlabel("Coding Hours")
plt.show()