In [141]:
# Dependencies
import pandas as pd
import os

In [142]:
# Relative location of the 2016 FCC New Coders Survey CSV
file = (
    "Resources/"
    "2016-FCC-New-Coders-Survey-Data.csv"
)

In [143]:
# Load the survey into a DataFrame. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype inference.
bootcamp_data = pd.read_csv(filepath_or_buffer=file, low_memory=False)
# Preview the first rows of the raw data
bootcamp_data.head()

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampMonthsAgo,BootcampName,BootcampPostSalary,BootcampRecommend,ChildrenNumber,...,ResourceSoloLearn,ResourceStackOverflow,ResourceTreehouse,ResourceUdacity,ResourceUdemy,ResourceW3Schools,ResourceYouTube,SchoolDegree,SchoolMajor,StudentDebtOwe
0,28.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,20000.0
1,22.0,0.0,,,,,,,,,...,,,,,1.0,,,"some college credit, no degree",,
2,19.0,0.0,,,,,,,,,...,,,,,,,,high school diploma or equivalent (GED),,
3,26.0,0.0,,,,,,,,,...,,,,,,,,bachelor's degree,Cinematography And Film,7000.0
4,20.0,0.0,,,,,,,,,...,,,,,,,,"some college credit, no degree",,


In [144]:
# Keep only the survey fields used in this analysis. These are the columns at
# positions 0-4, 7-11, 29, 30, 32, 36, 37, 45, 48, 56, 110 and 111 of the raw
# frame, selected by name so the cell is self-documenting and does not break if
# the column order changes. .copy() gives main_df its own data, so later
# modifications do not raise pandas' SettingWithCopyWarning.
main_df = bootcamp_data.loc[:, [
    "Age",
    "AttendedBootcamp",
    "BootcampFinish",
    "BootcampFullJobAfter",
    "BootcampLoanYesNo",
    "BootcampPostSalary",
    "BootcampRecommend",
    "ChildrenNumber",
    "CityPopulation",
    "CodeEventBootcamp",
    "CountryLive",
    "EmploymentField",
    "EmploymentStatus",
    "Gender",
    "HasChildren",
    "HoursLearning",
    "Income",
    "JobRoleInterest",
    "SchoolDegree",
    "SchoolMajor",
]].copy()

In [145]:
# Recode the 0/1 survey flags as "No"/"Yes".
# The replacement is limited to the yes/no response columns: a frame-wide
# replace would also rewrite genuine numeric answers (e.g. ChildrenNumber == 1
# or Income == 0) into "Yes"/"No". The result is reassigned instead of using
# inplace=True, which avoids the SettingWithCopyWarning and keeps the cell
# safe to re-run.
# NOTE(review): CodeEventBootcamp is assumed to be a 0/1 flag like the other
# columns listed here - confirm against the survey's data dictionary.
yes_no_columns = [
    "AttendedBootcamp",
    "BootcampFinish",
    "BootcampFullJobAfter",
    "BootcampLoanYesNo",
    "BootcampRecommend",
    "CodeEventBootcamp",
    "HasChildren",
]
main_df = main_df.replace(
    {column: {0.0: "No", 1.0: "Yes"} for column in yes_no_columns}
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [146]:
# Total number of survey respondents = number of rows in main_df
total_resp = main_df.shape[0]

In [147]:
# Keep just the respondents whose AttendedBootcamp answer is "Yes"
attendee_df = main_df[main_df["AttendedBootcamp"].eq("Yes")]
# Preview the first attendee rows
attendee_df.head()

Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampFullJobAfter,BootcampLoanYesNo,BootcampPostSalary,BootcampRecommend,ChildrenNumber,CityPopulation,CodeEventBootcamp,CountryLive,EmploymentField,EmploymentStatus,Gender,HasChildren,HoursLearning,Income,JobRoleInterest,SchoolDegree,SchoolMajor
93,32.0,Yes,Yes,No,No,,No,,"between 100,000 and 1 million",,United States of America,"arts, entertainment, sports, or media",Self-employed business owner,male,,20,67000.0,,bachelor's degree,Biology
97,26.0,Yes,Yes,Yes,No,45000.0,No,,more than 1 million,,United States of America,software development,Employed for wages,male,No,10,40000.0,,master's degree (non-professional),Music
130,41.0,Yes,Yes,Yes,Yes,75000.0,Yes,3.0,"less than 100,000",,United States of America,software development,Employed for wages,male,Yes,30,75000.0,,"some college credit, no degree",
159,26.0,Yes,Yes,No,No,,No,,"between 100,000 and 1 million",,United States of America,,Not working and not looking for work,female,,30,,Full-Stack Web Developer,"some college credit, no degree",
188,24.0,Yes,No,,Yes,,No,,"between 100,000 and 1 million",,Canada,,Not working but looking for work,female,,60,,,"some college credit, no degree",


In [148]:
def _format_pct(part, whole):
    """Return part/whole as a percentage string with two decimals, e.g. "6.10%".

    Returns "nan%" when whole is zero, so an empty selection produces a
    visible placeholder instead of raising ZeroDivisionError.
    """
    if whole == 0:
        return "nan%"
    return "{:.2f}%".format((part / whole) * 100)


# Average age of attendees, formatted to two decimals
avg_age = "{:.2f}".format(attendee_df["Age"].mean())
# Number of respondents who attended a bootcamp
attended_amt = len(attendee_df)

# Number of attendees whose SchoolDegree is one of the degree levels below
degrees = [
    "bachelor's degree",
    "master's degree (non-professional)",
    "professional degree (MBA, MD, JD, etc.)",
    "associate's degree",
    "Ph.D.",
]
attendee_dgr = int(attendee_df["SchoolDegree"].isin(degrees).sum())

# Gender breakdown among attendees who answered the Gender question.
# value_counts() is computed once, and .get(..., 0) keeps the cell working
# when a category has no respondents (plain ['male'] would raise KeyError).
gender_counts = attendee_df["Gender"].value_counts()
total_gen_resp = attendee_df["Gender"].count()
id_male = gender_counts.get("male", 0)
id_female = gender_counts.get("female", 0)
# Every non-missing answer other than 'male'/'female' is counted as non-binary
id_nonbinary = total_gen_resp - id_male - id_female

# Share of all respondents who attended a bootcamp
per_attend = _format_pct(attended_amt, total_resp)

# Share of each gender among attendees who reported a gender
per_male = _format_pct(id_male, total_gen_resp)
per_female = _format_pct(id_female, total_gen_resp)
per_nonbin = _format_pct(id_nonbinary, total_gen_resp)

# Share of attendees holding a degree (denominator is all attendees)
per_dgr = _format_pct(attendee_dgr, attended_amt)

# Average post-bootcamp salary, formatted as dollars with two decimals
avg_post_salary = "${:.2f}".format(attendee_df["BootcampPostSalary"].mean())

In [149]:
# Build a one-row summary table consolidating the statistics computed above
bc_data_output = pd.DataFrame(
    data=dict([
        ("Total Surveyed", total_resp),
        ("Bootcamp Attended", attended_amt),
        ("% Attended", per_attend),
        ("Avg Age", avg_age),
        ("% Has Degree", per_dgr),
        ("% Male", per_male),
        ("% Female", per_female),
        ("% Nonbinary", per_nonbin),
        ("Average Post Bootcamp Salary", avg_post_salary),
    ]),
    index=[0],
)

In [150]:
# Preview the summary table before it is written to disk
# (TODO: improve formatting before outputting the spreadsheet)
bc_data_output

Unnamed: 0,Total Surveyed,Bootcamp Attended,% Attended,Avg Age,% Has Degree,% Male,% Female,% Nonbinary,Average Post Bootcamp Salary
0,15620,953,6.10%,31.07,66.84%,59.54%,39.14%,1.32%,$63740.51


In [None]:
# Write the summary table to Output/output.csv without the index column.
# The Output directory is created first (no-op if it already exists) because
# to_csv fails when the target directory is missing.
os.makedirs("Output", exist_ok=True)
bc_data_output.to_csv("Output/output.csv", index=False)