In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../scripts/eda/")))
from group_by_column_vs_has_dev_job import group_by_column_vs_has_dev_job

In [None]:
# Load the cleaned dataset from its Excel workbook (the path is relative to the current working directory).
cleaned_data_path = "../private_data/data/cleaned_data/cleaned_data.xlsx"
data = pd.read_excel(cleaned_data_path)

## Filter only relevant students




In [None]:
#
# Step 1: keep only students whose total workload is strictly above 15 hours.
#
has_min_workload = data["total_hours"].gt(15)
data = data.loc[has_min_workload]

#
# Step 2: exclude students who answered that they found a job unrelated to the bootcamp.
#
# Intent (per the original author): compare students that found a dev job against students
# that didn't find a job, so this third group is left out of the analysis.
#
unrelated_job_answer = "I found a job, but is not related to the bootcamp"
is_relevant_student = data["post_bootcamp_situation"].ne(unrelated_job_answer)
data = data.loc[is_relevant_student]




## EDA

In [None]:
#
# Claim explored in this cell: students that keep a good balance between hours searching
# & hours coding have higher chances of getting a job.
#

# Heading plus the legend explaining how to read `workload_balance_diff`, rendered one item at a time.
workload_balance_notes = (
    "<br>",
    "### Workload balance",
    "- zero == similar workload (ie. spent a similar amount of time searching than coding)",
    "- negative == more hours coding",
    "- positive == more hours searching",
)
for note in workload_balance_notes:
    display(Markdown(note))

# All students currently in `data`.
# NOTE(review): `group_by_column_vs_has_dev_job` is a project helper (scripts/eda); presumably it
# summarises `has_dev_job` per value of the given column — confirm against its source.
data_to_analyze = data
r1 = group_by_column_vs_has_dev_job(data_to_analyze, "workload_balance_diff")
display(r1)

# Alternative slices, currently disabled:

# # only students with a minimum workload
# display(Markdown("<br>"))
# display(Markdown("### Workload balance (only students with a minimum workload)"))
# students_with_min_workload = data[data["total_hours"] >= 15]
# r2 = group_by_column_vs_has_dev_job(students_with_min_workload, "workload_balance_diff")
# display(r2)

# # only recent students
# display(Markdown("<br>"))
# display(Markdown("### Workload balance (only recent students)"))
# students_2023_and_2024 = data[data["cohort_start_month"].str.startswith(("2023", "2024"))]
# r3 = group_by_column_vs_has_dev_job(students_2023_and_2024, "workload_balance_diff")
# display(r3)



In [None]:

# Work on a copy holding only the two columns the statistical tests need,
# so the relabelling below never touches `data`.
workload_df = data[["workload_balance_diff", "has_dev_job"]].copy()

# Turn the numeric balance codes into readable group labels (any other value is left unchanged).
update_balance_dict = {-1: "Lean towards coding", 1: "Lean towards search", 0: "Balanced"}
workload_df["workload_balance_diff"] = workload_df["workload_balance_diff"].replace(update_balance_dict)

# Encode the outcome as 0/1 so it can be averaged and counted.
# `.map` is used instead of `.replace`: replacing booleans with integers goes through an object
# column and relies on pandas' implicit downcasting back to int, which is deprecated (pandas 2.2
# may emit a FutureWarning). Without the downcast the column would stay `object` and numeric
# aggregations such as `groupby(...).mean()` would fail.
# NOTE(review): assumes `has_dev_job` holds booleans; with `.map`, any value other than
# True/False becomes NaN instead of being passed through.
update_boolean_dict = {True: 1, False: 0}
workload_df["has_dev_job"] = workload_df["has_dev_job"].map(update_boolean_dict)

workload_df

In [None]:
# How many students fall in each workload balance category
balance_category_counts = workload_df['workload_balance_diff'].value_counts()
print("\nDistribution of workload balance categories:")
print(balance_category_counts)

## chi2_contingency

In [None]:
from scipy.stats import chi2_contingency

In [None]:
#
# Hypothesis 1: students that keep a good balance, have better chances of getting a job than students that lean towards coding
# - H0: job success if lean towards coding >= job success if good balance
# - Ha: job success if good balance > job success if lean towards coding
#
# NOTE: the chi-square test of independence is non-directional. The p-value below only tells whether
# the job rates of the two groups differ; check the direction of the difference (group success rates)
# before reading a small p-value as support for Ha.

# Keep only the rows belonging to the two groups being compared
groups_to_compare = ['Lean towards coding', 'Balanced']
filtered_df = workload_df[workload_df['workload_balance_diff'].isin(groups_to_compare)]

# Contingency table: one row per workload group, one column per `has_dev_job` outcome
contingency_table = pd.crosstab(filtered_df['workload_balance_diff'], filtered_df['has_dev_job'])

# The result unpacks as (statistic, p-value, dof, expected frequencies); only the p-value is used.
# Indexing avoids assigning to `_`, which IPython uses for the last cell output.
# By default SciPy applies Yates' continuity correction when the table has one degree of freedom.
pvalue = chi2_contingency(contingency_table)[1]

# print() rather than display(): display() of a plain string renders its quoted repr
print(f"p-value (Balanced vs. Lean towards coding, two-sided chi-square): {pvalue}")



In [None]:
#
# Hypothesis 2: students that keep a good balance, have better chances of getting a job than students that lean towards search
# - H0: job success if lean towards search >= job success if good balance
# - Ha: job success if good balance > job success if lean towards search
#
# NOTE: the chi-square test of independence is non-directional. The p-value below only tells whether
# the job rates of the two groups differ; check the direction of the difference (group success rates)
# before reading a small p-value as support for Ha.

# Keep only the rows belonging to the two groups being compared
groups_to_compare = ['Lean towards search', 'Balanced']
filtered_df = workload_df[workload_df['workload_balance_diff'].isin(groups_to_compare)]

# Contingency table: one row per workload group, one column per `has_dev_job` outcome
contingency_table = pd.crosstab(filtered_df['workload_balance_diff'], filtered_df['has_dev_job'])

# The result unpacks as (statistic, p-value, dof, expected frequencies); only the p-value is used.
# Indexing avoids assigning to `_`, which IPython uses for the last cell output.
# By default SciPy applies Yates' continuity correction when the table has one degree of freedom.
pvalue = chi2_contingency(contingency_table)[1]

# Label the number so the output can be read without looking at the code
print(f"p-value (Balanced vs. Lean towards search, two-sided chi-square): {pvalue}")

In [None]:
#
# Hypothesis 3: students that lean towards coding, have better chances of getting a job than students that lean towards search
# - H0: job success if lean towards search >= job success if lean towards coding
# - Ha: job success if lean towards coding > job success if lean towards search
#
# NOTE: the chi-square test of independence is non-directional. The p-value below only tells whether
# the job rates of the two groups differ; check the direction of the difference (group success rates)
# before reading a small p-value as support for Ha.

# Keep only the rows belonging to the two groups being compared
groups_to_compare = ['Lean towards search', 'Lean towards coding']
filtered_df = workload_df[workload_df['workload_balance_diff'].isin(groups_to_compare)]

# Contingency table: one row per workload group, one column per `has_dev_job` outcome
contingency_table = pd.crosstab(filtered_df['workload_balance_diff'], filtered_df['has_dev_job'])

# The result unpacks as (statistic, p-value, dof, expected frequencies); only the p-value is used.
# Indexing avoids assigning to `_`, which IPython uses for the last cell output.
# By default SciPy applies Yates' continuity correction when the table has one degree of freedom.
pvalue = chi2_contingency(contingency_table)[1]

# Label the number so the output can be read without looking at the code
print(f"p-value (Lean towards coding vs. Lean towards search, two-sided chi-square): {pvalue}")

## chi2_contingency + chart

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats



# Job success rate for each workload balance category (mean of `has_dev_job` within the group)
job_rates = workload_df.groupby('workload_balance_diff')['has_dev_job'].mean()
print("\nJob success rate for each category:")
print(job_rates)

# Outcomes of the two groups compared by the hypothesis "balanced > lean towards coding"
group_labels = workload_df['workload_balance_diff']
balanced_data = workload_df.loc[group_labels == 'Balanced', 'has_dev_job']
coding_data = workload_df.loc[group_labels == 'Lean towards coding', 'has_dev_job']

# Contingency table restricted to these two categories (the filter is computed once and reused)
two_groups_df = workload_df[group_labels.isin(['Balanced', 'Lean towards coding'])]
contingency = pd.crosstab(two_groups_df['workload_balance_diff'], two_groups_df['has_dev_job'])
# print() for the caption: display() of a plain string would render its quoted repr, "\n" included
print("\nContingency table (Balanced vs. Lean towards coding):")
display(contingency)

# Chi-square test of independence (non-directional: it only detects that the rates differ)
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
print("\nChi-square test results:")
print(f"Chi-square value: {chi2:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Degrees of freedom: {dof}")

# Two-proportion z-test, one-tailed (Ha: balanced rate > coding rate).
# Successes are the sum of the `has_dev_job` values, totals are the group sizes.
balanced_success = balanced_data.sum()
balanced_total = len(balanced_data)
coding_success = coding_data.sum()
coding_total = len(coding_data)

balanced_rate = balanced_success / balanced_total
coding_rate = coding_success / coding_total

# Standard error under H0 uses the pooled success proportion of both groups
pooled_p = (balanced_success + coding_success) / (balanced_total + coding_total)
se = np.sqrt(pooled_p * (1 - pooled_p) * (1/balanced_total + 1/coding_total))
z_stat = (balanced_rate - coding_rate) / se
# Upper-tail probability; the survival function is numerically more accurate than 1 - cdf
# when z is large (1 - cdf rounds to 0.0 once cdf is indistinguishable from 1).
p_value_z = stats.norm.sf(z_stat)

print("\nZ-test for proportions:")
print(f"Balanced success rate: {balanced_rate:.4f} ({balanced_success}/{balanced_total})")
print(f"Coding success rate: {coding_rate:.4f} ({coding_success}/{coding_total})")
print(f"Z-statistic: {z_stat:.4f}")
print(f"p-value (one-tailed): {p_value_z:.4f}")

# Bar chart of the job success rates (explicit figure/axes rather than pyplot global state)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=job_rates.index, y=job_rates.values, ax=ax)
ax.set_title('Job Success Rate by Workload Balance Strategy')
ax.set_xlabel('Workload Balance Strategy')
ax.set_ylabel('Job Success Rate')
ax.set_ylim(0, 1)  # rates are proportions, so show the full 0-1 scale
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# Final conclusion, based on the one-tailed z-test p-value
alpha = 0.05
print("\nHypothesis Test Conclusion:")
if p_value_z < alpha:
    print(f"At α = {alpha}, we reject the null hypothesis.")
    print("There is statistically significant evidence that students with a balanced workload")
    print("have a higher job success rate than those who lean towards coding.")
else:
    print(f"At α = {alpha}, we fail to reject the null hypothesis.")
    print("There is insufficient evidence to conclude that students with a balanced workload")
    print("have a higher job success rate than those who lean towards coding.")

## proportions_ztest

In [None]:
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

def z_test_between_groups(df, group1, group2, alternative="two-sided"):
    """Run a two-proportion z-test on the dev-job rate of two workload groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``workload_balance_diff`` (group label) and
        ``has_dev_job`` (a row counts as a success when the value equals 1).
    group1, group2 : str
        Labels in ``workload_balance_diff`` identifying the two groups.
    alternative : {"two-sided", "larger", "smaller"}, default "two-sided"
        Forwarded to ``proportions_ztest``. ``"larger"`` tests whether the
        success proportion of ``group1`` is larger than that of ``group2``.
        The default keeps the original (two-sided) behaviour.

    Returns
    -------
    float
        The p-value of the test.

    Raises
    ------
    ValueError
        If either group has no rows in ``df`` (e.g. a misspelled label);
        the proportion would be 0/0 and the p-value a silent NaN.
    """
    is_success = df['has_dev_job'] == 1

    success_counts = []
    nobs = []
    for group in (group1, group2):
        in_group = df['workload_balance_diff'] == group
        nobs.append(int(in_group.sum()))                         # group size
        success_counts.append(int((in_group & is_success).sum()))  # successes within the group

    empty_groups = [group for group, size in zip((group1, group2), nobs) if size == 0]
    if empty_groups:
        raise ValueError(f"No rows found for workload group(s): {empty_groups}")

    stat, pval = proportions_ztest(success_counts, nobs, alternative=alternative)
    return pval

# Both leaning groups are compared against the same reference group
reference_group = "Balanced"

# Test 1: "Lean towards search" vs "Balanced"
pval_search_vs_balanced = z_test_between_groups(workload_df, "Lean towards search", reference_group)

# Test 2: "Lean towards coding" vs "Balanced"
pval_coding_vs_balanced = z_test_between_groups(workload_df, "Lean towards coding", reference_group)

# Report both p-values
reported_pvalues = (
    ("search vs balanced", pval_search_vs_balanced),
    ("coding vs balanced", pval_coding_vs_balanced),
)
for comparison_name, comparison_pval in reported_pvalues:
    print(f"P-value ({comparison_name}):", comparison_pval)




## Independent t-test

In [None]:
from scipy.stats import ttest_ind


In [None]:
# Outcome (`has_dev_job`) of each workload group, one Series per group
group_labels = workload_df["workload_balance_diff"]
balanced_series = workload_df.loc[group_labels == "Balanced", "has_dev_job"]
lean_search_series = workload_df.loc[group_labels == "Lean towards search", "has_dev_job"]
lean_coding_series = workload_df.loc[group_labels == "Lean towards coding", "has_dev_job"]

# Group sizes. print() rather than display(): display() of a plain string renders its quoted repr.
print("Number of students in the sample:")
print(f"balanced_series: {len(balanced_series)}")
print(f"lean_search_series: {len(lean_search_series)}")
print(f"lean_coding_series: {len(lean_coding_series)}")


# Pairwise comparisons; equal_var=False selects Welch's t-test (no equal-variance assumption).
# The p-values are two-sided (ttest_ind's default).
comparisons = [
    ("balanced vs. lean towards search", balanced_series, lean_search_series),
    ("balanced vs. lean towards coding", balanced_series, lean_coding_series),
    ("lean towards search vs. lean towards coding", lean_search_series, lean_coding_series),
]
for comparison_label, first_group, second_group in comparisons:
    statistic, pvalue = ttest_ind(first_group, second_group, equal_var=False)
    print(f"{round(pvalue, 4)} (P-value {comparison_label}) ")
