In [55]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pathlib import Path
file_path = Path("Resources/2017-fCC-New-Coders-Survey-Data.csv")

data_cleaned = pd.read_csv(file_path)
data_cleaned.head()

  data_cleaned = pd.read_csv(file_path)


Unnamed: 0,Age,AttendedBootcamp,BootcampFinish,BootcampLoanYesNo,BootcampName,BootcampRecommend,ChildrenNumber,CityPopulation,CodeEventConferences,CodeEventDjangoGirls,...,YouTubeFCC,YouTubeFunFunFunction,YouTubeGoogleDev,YouTubeLearnCode,YouTubeLevelUpTuts,YouTubeMIT,YouTubeMozillaHacks,YouTubeOther,YouTubeSimplilearn,YouTubeTheNewBoston
0,27.0,0.0,,,,,,more than 1 million,,,...,,,,,,,,,,
1,34.0,0.0,,,,,,"less than 100,000",,,...,1.0,,,,,,,,,
2,21.0,0.0,,,,,,more than 1 million,,,...,,,,1.0,1.0,,,,,
3,26.0,0.0,,,,,,"between 100,000 and 1 million",,,...,1.0,1.0,,,1.0,,,,,
4,20.0,0.0,,,,,,"between 100,000 and 1 million",,,...,,,,,,,,,,


In [56]:
data_cleaned.info()
data_cleaned.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18175 entries, 0 to 18174
Columns: 136 entries, Age to YouTubeTheNewBoston
dtypes: float64(105), object(31)
memory usage: 18.9+ MB


Index(['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampLoanYesNo',
       'BootcampName', 'BootcampRecommend', 'ChildrenNumber', 'CityPopulation',
       'CodeEventConferences', 'CodeEventDjangoGirls',
       ...
       'YouTubeFCC', 'YouTubeFunFunFunction', 'YouTubeGoogleDev',
       'YouTubeLearnCode', 'YouTubeLevelUpTuts', 'YouTubeMIT',
       'YouTubeMozillaHacks', 'YouTubeOther', 'YouTubeSimplilearn',
       'YouTubeTheNewBoston'],
      dtype='object', length=136)

In [57]:
columns_to_drop=['YouTubeFCC', 'YouTubeFunFunFunction', 'YouTubeGoogleDev', 'YouTubeLearnCode', 'YouTubeLevelUpTuts', 'YouTubeMIT', 'YouTubeMozillaHacks', 'YouTubeOther', 'YouTubeSimplilearn', 'YouTubeTheNewBoston', 'YouTubeDerekBanas', 'YouTubeDevTips', 'YouTubeEngineeredTruth']
data_cleaned.drop(columns=columns_to_drop, inplace=True)

data_cleaned.columns

data_cleaned.to_csv("Newdf.csv",index=False)

In [58]:
data_cleaned.dtypes


Age                     float64
AttendedBootcamp        float64
BootcampFinish          float64
BootcampLoanYesNo       float64
BootcampName             object
                         ...   
StudentDebtOwe          float64
YouTubeCodeCourse       float64
YouTubeCodingTrain      float64
YouTubeCodingTut360     float64
YouTubeComputerphile    float64
Length: 123, dtype: object

In [59]:
# Check for missing values
missing_values = data_cleaned.isnull().sum()
print(missing_values)

Age                      2808
AttendedBootcamp          466
BootcampFinish          17106
BootcampLoanYesNo       17096
BootcampName            17226
                        ...  
StudentDebtOwe          14813
YouTubeCodeCourse       17219
YouTubeCodingTrain      17199
YouTubeCodingTut360     16778
YouTubeComputerphile    16722
Length: 123, dtype: int64


In [60]:
# Step 1: Handle missing values by removing rows with missing values
data_cleaned = data_cleaned.dropna()

In [61]:
# Step 2: Remove duplicate records
data_cleaned = data_cleaned.drop_duplicates()

In [62]:
# Step 3: Merge Columns
# Merge 'Do you financially support any dependents?' and 'Do you have children?' into 'Dependents'
def merge_dependents(row):
    # Check if both columns exist and apply logic
    if 'Do you financially support any dependents?' in row.index and 'Do you have children?' in row.index:
        if row['Do you financially support any dependents?'] == 'Yes' or row['Do you have children?'] == 'Yes':
            return 'Yes'
        else:
            return 'No'
    return np.nan  # Return NaN if columns do not exist
# Apply the function to create the 'Dependents' column
data_cleaned['Dependents'] = data_cleaned.apply(merge_dependents, axis=1)
# Merge 'Do you have student loan debt?', 'Do you have any debt?', and 'Do you have a home mortgage?' into 'Debt status category'
def merge_debt(row):
    if 'Do you have student loan debt?' in row.index and row['Do you have student loan debt?'] == 'Yes':
        return 'Student Loan'
    elif 'Do you have a home mortgage?' in row.index and row['Do you have a home mortgage?'] == 'Yes':
        return 'Mortgage'
    elif 'Do you have any debt?' in row.index and row['Do you have any debt?'] == 'Yes':
        return 'Other Debt'
    else:
        return 'No Debt'
# Apply the function to create the 'Debt status category' column
data_cleaned['Debt status category'] = data_cleaned.apply(merge_debt, axis=1)
# Merge 'Other' columns ('Other' gender, employment/school, career)
def merge_other(row):
    if 'Other' in row.index and pd.notna(row['Other']):
        return row['Other']
    elif 'Other.1' in row.index and pd.notna(row['Other.1']):
        return row['Other.1']
    elif 'Other.2' in row.index and pd.notna(row['Other.2']):
        return row['Other.2']
    return np.nan
# Apply the function to merge 'Other' columns
data_cleaned['Other'] = data_cleaned.apply(merge_other, axis=1)














In [63]:
# Step 4: Remove irrelevant columns
columns_to_remove = ['Submit Date (UTC)', 'Start Date (UTC)', 'Network ID', 'Other.1', 'Other.2', 'Other']
data_cleaned = data_cleaned.drop(columns=[col for col in columns_to_remove if col in data_cleaned.columns])
# Step 5: Handle any inconsistencies in the data (assuming none for now)
# The data is now cleaned and ready for machine learning algorithms.
# Optional: Save the cleaned data to a new CSV file
data_cleaned.to_csv('cleaned_survey_data.csv', index=False)

In [64]:
# Step 6: Remove irrelevant columns if they exist in the dataset
columns_to_remove = ['Submit Date (UTC)', 'Start Date (UTC)', 'Network ID', '#', 'Other', 'Other.1', 'Other.2']
# Check if each column exists before dropping
data_cleaned = data_cleaned.drop(columns=[col for col in columns_to_remove if col in data_cleaned.columns], errors='ignore')

In [65]:
# Step 7: Display cleaned data (or save it to a new CSV file)
print(data_cleaned.head())  # Display the first few rows
# If you want to save the cleaned data to a new file
data_cleaned.to_csv('cleaned_data.csv', index=False)

Empty DataFrame
Columns: [Age, AttendedBootcamp, BootcampFinish, BootcampLoanYesNo, BootcampName, BootcampRecommend, ChildrenNumber, CityPopulation, CodeEventConferences, CodeEventDjangoGirls, CodeEventFCC, CodeEventGameJam, CodeEventGirlDev, CodeEventHackathons, CodeEventMeetup, CodeEventNodeSchool, CodeEventNone, CodeEventOther, CodeEventRailsBridge, CodeEventRailsGirls, CodeEventStartUpWknd, CodeEventWkdBootcamps, CodeEventWomenCode, CodeEventWorkshops, CommuteTime, CountryCitizen, CountryLive, EmploymentField, EmploymentFieldOther, EmploymentStatus, EmploymentStatusOther, ExpectedEarning, FinanciallySupporting, FirstDevJob, Gender, GenderOther, HasChildren, HasDebt, HasFinancialDependents, HasHighSpdInternet, HasHomeMortgage, HasServedInMilitary, HasStudentDebt, HomeMortgageOwe, HoursLearning, ID.x, ID.y, Income, IsEthnicMinority, IsReceiveDisabilitiesBenefits, IsSoftwareDev, IsUnderEmployed, JobApplyWhen, JobInterestBackEnd, JobInterestDataEngr, JobInterestDataSci, JobInterestDevO

In [66]:
# Step 1: Label Encoding for small categories and ordinal features
label_encoder = LabelEncoder()
# Label encode Gender (if binary or small categories)
if 'Gender' in data_cleaned.columns:
    data_cleaned['Gender_Encoded'] = label_encoder.fit_transform(data_cleaned['Gender'])
# Label encode Degree Level (ordinal)
if 'Degree Level' in data_cleaned.columns:
    data_cleaned['Degree_Level_Encoded'] = label_encoder.fit_transform(data_cleaned['Degree Level'])
# Label encode Employment Status
if 'Employment Status' in data_cleaned.columns:
    data_cleaned['Employment_Status_Encoded'] = label_encoder.fit_transform(data_cleaned['Employment Status'])
# Label encode Student Loan Debt (Yes/No or binary)
if 'Do you have student loan debt?' in data_cleaned.columns:
    data_cleaned['Student_Loan_Debt_Encoded'] = label_encoder.fit_transform(data_cleaned['Do you have student loan debt?'])
# Label encode Family Responsibilities (Yes/No for dependents, children)
if 'Dependents' in data_cleaned.columns:
    data_cleaned['Dependents_Encoded'] = label_encoder.fit_transform(data_cleaned['Dependents'])
# Step 2: One-Hot Encoding for unordered categorical variables
# One-hot encode Country of Residence
if 'Country of Residence' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Country of Residence'], prefix='Country')
# One-hot encode Citizenship
if 'Citizenship' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Citizenship'], prefix='Citizenship')
# One-hot encode Job Roles
if 'Job Roles' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Job Roles'], prefix='Job_Role')
# One-hot encode Field of Study
if 'Field of Study' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Field of Study'], prefix='Field_Study')
# One-hot encode Learning Preferences
if 'Learning Preferences' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Learning Preferences'], prefix='Learning_Pref')
# One-hot encode Coding Events/Workshops
if 'Coding Events/Workshops' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Coding Events/Workshops'], prefix='Coding_Event')
# One-hot encode Employment Type Preferences
if 'Employment Type Preferences' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['Employment Type Preferences'], prefix='Employment_Type')
# Optional: Step 3: Handling High-Cardinality Features
# For very high-cardinality categorical features like 'Country', if dimensionality becomes an issue,
# consider using binary encoding from the 'category_encoders' library.
# Example: Using binary encoding for 'Country' feature with many categories
# Install the category_encoders package if needed:
# !pip install category_encoders
# from category_encoders import BinaryEncoder
# encoder = BinaryEncoder(cols=['Country'])
# data_cleaned = encoder.fit_transform(data_cleaned)
# Save the encoded dataset to a new CSV file
data_cleaned.to_csv('encoded_survey_data.csv', index=False)


In [67]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Load the dataset (replace with your actual file path)
data_cleaned = pd.read_csv('../clean-data/2017-fCC-New-Coders-Survey-Data.csv')
# Step 1: Identify Numerical Columns
# Select only numerical columns (int64 and float64)
numerical_columns = data_cleaned.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Step 2: Exclude binary or categorical columns from scaling
# Define the binary or categorical columns that should not be scaled
exclude_columns = ['Dependents', 'Gender', 'Employment Status']  # Add any other binary or categorical columns here
# Identify the numerical columns to scale by excluding the binary/categorical columns
columns_to_scale = [col for col in numerical_columns if col not in exclude_columns]
print("Columns to scale: ", columns_to_scale)
# Step 3: Normalize the selected columns using MinMaxScaler
scaler = MinMaxScaler()
data_cleaned[columns_to_scale] = scaler.fit_transform(data_cleaned[columns_to_scale])
# Step 4: Standardize the selected columns using StandardScaler (if you want to standardize instead of normalize)
# scaler = StandardScaler()
# data_cleaned[columns_to_scale] = scaler.fit_transform(data_cleaned[columns_to_scale])
# The data is now normalized (or standardized if you uncomment the StandardScaler)
# Optional: Save the final processed data
data_cleaned.to_csv('final_processed_data.csv', index=False)
# Check the first few rows of the scaled data
print(data_cleaned.head())

  data_cleaned = pd.read_csv('../clean-data/2017-fCC-New-Coders-Survey-Data.csv')


Columns to scale:  ['Age', 'AttendedBootcamp', 'BootcampFinish', 'BootcampLoanYesNo', 'BootcampRecommend', 'ChildrenNumber', 'CodeEventConferences', 'CodeEventDjangoGirls', 'CodeEventFCC', 'CodeEventGameJam', 'CodeEventGirlDev', 'CodeEventHackathons', 'CodeEventMeetup', 'CodeEventNodeSchool', 'CodeEventNone', 'CodeEventRailsBridge', 'CodeEventRailsGirls', 'CodeEventStartUpWknd', 'CodeEventWkdBootcamps', 'CodeEventWomenCode', 'CodeEventWorkshops', 'ExpectedEarning', 'FinanciallySupporting', 'FirstDevJob', 'HasChildren', 'HasDebt', 'HasFinancialDependents', 'HasHighSpdInternet', 'HasHomeMortgage', 'HasServedInMilitary', 'HasStudentDebt', 'HomeMortgageOwe', 'HoursLearning', 'Income', 'IsEthnicMinority', 'IsReceiveDisabilitiesBenefits', 'IsSoftwareDev', 'IsUnderEmployed', 'JobInterestBackEnd', 'JobInterestDataEngr', 'JobInterestDataSci', 'JobInterestDevOps', 'JobInterestFrontEnd', 'JobInterestFullStack', 'JobInterestGameDev', 'JobInterestInfoSec', 'JobInterestMobile', 'JobInterestProjMngr'

In [68]:
# Check for missing values in 'AttendedBootcamp' and 'BootcampFinish'
print(data_cleaned['AttendedBootcamp'].isnull().sum())
print(data_cleaned['BootcampFinish'].isnull().sum())

466
17106


In [69]:
# Drop rows where 'AttendedBootcamp' or 'BootcampFinish' have NaN values
data_cleaned = data_cleaned.dropna(subset=['AttendedBootcamp', 'BootcampFinish'])

In [70]:
# Define features (X) and targets ('AttendedBootcamp' and 'BootcampFinish') again after removing NaNs
X = data_cleaned.drop(columns=['AttendedBootcamp', 'BootcampFinish'])  # Features
y_attend = data_cleaned['AttendedBootcamp']  # Target 1: Attend Bootcamp
y_finish = data_cleaned['BootcampFinish']  # Target 2: Finish Bootcamp

In [71]:
from sklearn.model_selection import train_test_split
# Split data for 'AttendedBootcamp' prediction with stratified sampling
X_train_attend, X_test_attend, y_train_attend, y_test_attend = train_test_split(X, y_attend, test_size=0.3, stratify=y_attend, random_state=42)

In [72]:
# Split data for 'BootcampFinish' prediction with stratified sampling
X_train_finish, X_test_finish, y_train_finish, y_test_finish = train_test_split(X, y_finish, test_size=0.3, stratify=y_finish, random_state=42)

In [73]:
train_attend = pd.concat([X_train_attend, y_train_attend], axis=1)
test_attend = pd.concat([X_test_attend, y_test_attend], axis=1)
train_attend['Set'] = 'Train'
test_attend['Set'] = 'Test'
combined_attend = pd.concat([train_attend, test_attend])
combined_attend.to_csv('attended_bootcamp_data.csv', index=False)

In [74]:
# For 'BootcampFinish' target
train_finish = pd.concat([X_train_finish, y_train_finish], axis=1)
test_finish = pd.concat([X_test_finish, y_test_finish], axis=1)
train_finish['Set'] = 'Train'
test_finish['Set'] = 'Test'
combined_finish = pd.concat([train_finish, test_finish])
combined_finish.to_csv('bootcamp_finish_data.csv', index=False)

In [75]:
# Define file paths for each member (you can modify these paths as needed)
path_ayana = "datasets/ayana/"
path_roberta = "datasets/roberta/"
path_dom = "datasets/dom/"
path_phillip = "datasets/phillip/"


In [76]:
import os

In [77]:
# Create directories if they don't exist
os.makedirs(path_ayana, exist_ok=True)
os.makedirs(path_roberta, exist_ok=True)
os.makedirs(path_dom, exist_ok=True)
os.makedirs(path_phillip, exist_ok=True)

In [78]:
# Ensure the paths exist (if necessary)
# For Ayana: Logistic Regression & Decision Trees (AttendedBootcamp and BootcampFinish)
# Export Ayana's training and test datasets for AttendedBootcamp
X_train_attend.to_csv(path_ayana + "X_train_attend.csv", index=False)
y_train_attend.to_csv(path_ayana + "y_train_attend.csv", index=False)
X_test_attend.to_csv(path_ayana + "X_test_attend.csv", index=False)
y_test_attend.to_csv(path_ayana + "y_test_attend.csv", index=False)

In [79]:
# Export Ayana's training and test datasets for BootcampFinish
X_train_finish.to_csv(path_ayana + "X_train_finish.csv", index=False)
y_train_finish.to_csv(path_ayana + "y_train_finish.csv", index=False)
X_test_finish.to_csv(path_ayana + "X_test_finish.csv", index=False)
y_test_finish.to_csv(path_ayana + "y_test_finish.csv", index=False)

In [80]:
from sklearn.tree import DecisionTreeClassifier

In [81]:
# For Roberta: Random Forest
X_train_attend.to_csv(path_roberta + "X_train_attend.csv", index=False)
y_train_attend.to_csv(path_roberta + "y_train_attend.csv", index=False)
X_test_attend.to_csv(path_roberta + "X_test_attend.csv", index=False)
y_test_attend.to_csv(path_roberta + "y_test_attend.csv", index=False)
X_train_finish.to_csv(path_roberta + "X_train_finish.csv", index=False)
y_train_finish.to_csv(path_roberta + "y_train_finish.csv", index=False)
X_test_finish.to_csv(path_roberta + "X_test_finish.csv", index=False)
y_test_finish.to_csv(path_roberta + "y_test_finish.csv", index=False)

In [82]:
# For Dom: SVM & KNN
X_train_attend.to_csv(path_dom + "X_train_attend.csv", index=False)
y_train_attend.to_csv(path_dom + "y_train_attend.csv", index=False)
X_test_attend.to_csv(path_dom + "X_test_attend.csv", index=False)
y_test_attend.to_csv(path_dom + "y_test_attend.csv", index=False)
X_train_finish.to_csv(path_dom + "X_train_finish.csv", index=False)
y_train_finish.to_csv(path_dom + "y_train_finish.csv", index=False)
X_test_finish.to_csv(path_dom + "X_test_finish.csv", index=False)
y_test_finish.to_csv(path_dom + "y_test_finish.csv", index=False)


In [83]:
# For Phillip: XGBoost & LightGBM
X_train_attend.to_csv(path_phillip + "X_train_attend.csv", index=False)
y_train_attend.to_csv(path_phillip + "y_train_attend.csv", index=False)
X_test_attend.to_csv(path_phillip + "X_test_attend.csv", index=False)
y_test_attend.to_csv(path_phillip + "y_test_attend.csv", index=False)
X_train_finish.to_csv(path_phillip + "X_train_finish.csv", index=False)
y_train_finish.to_csv(path_phillip + "y_train_finish.csv", index=False)
X_test_finish.to_csv(path_phillip + "X_test_finish.csv", index=False)
y_test_finish.to_csv(path_phillip + "y_test_finish.csv", index=False)

In [84]:
# you can export the clean dataset for each member as well
data_cleaned.to_csv(path_ayana + "clean_dataset.csv", index=False)
data_cleaned.to_csv(path_roberta + "clean_dataset.csv", index=False)
data_cleaned.to_csv(path_dom + "clean_dataset.csv", index=False)
data_cleaned.to_csv(path_phillip + "clean_dataset.csv", index=False)

# Missing Values

In [88]:
# Find the percentage of null values in each column
X_train_attend.isna().sum()/len(X_train_attend)
X_train_finish.isna().sum()/len(X_train_finish)
#X_train.isna().sum()/len(X_train)

Age                    0.159091
BootcampLoanYesNo      0.006684
BootcampName           0.129679
BootcampRecommend      0.012032
ChildrenNumber         0.871658
                         ...   
YouTubeMIT             0.851604
YouTubeMozillaHacks    0.963904
YouTubeOther           0.925134
YouTubeSimplilearn     0.983957
YouTubeTheNewBoston    0.897059
Length: 134, dtype: float64

In [None]:
# Explore each column with missing values to determine the best fill strategy
# First the job column
X_train['<Choose Column>'].value_counts()