In [10]:
# Input (raw survey CSV) and output (processed pickle) locations.
DATA_PATH = '../data/raw/survey_results_public.csv'
EXPORT_PATH = '../data/processed/1_processed_df.pkl'

# Column name -> {textual answer: numeric stand-in}.
# These open-ended answers are swapped for numbers so the columns can be
# cast to a float dtype further down.
REPLACE_DICT = {
    "YearsCode": {
        'Less than 1 year': 0,
        'More than 50 years': 51,
    },
    "YearsCodePro": {
        'Less than 1 year': 0,
        'More than 50 years': 51,
    },
    "Age1stCode": {
        'Younger than 5 years': 4,
        'Older than 85': 86,
    },
}

In [2]:
import logging
import os
import pickle
import re

import numpy as np
import pandas as pd

# Functions

In [18]:
def split_answers(data_series, delimiter = ";"):
    """
    Split multi-answer strings into lists of single answers.

    Parameters:
    * data_series (pd.Series): String series with answers
    * delimiter (string): Separator between the answers of one cell. It is
                          always matched literally, never as a regular
                          expression.
                          Defaults to ";"

    Returns: (pd.Series): The original series, untouched, if no value
    contains the delimiter. Otherwise a new series (same index and name) in
    which every string is replaced by the list of its answers and every
    missing value by an empty list.
    """
    # Literal search (regex=False) so delimiters such as "." or "|" are not
    # interpreted as patterns; na=False counts missing values as "no delimiter".
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)
    if not has_delimiter.any():
        return data_series

    # Series.str.split treats multi-character patterns as regular expressions,
    # so escape the delimiter to keep the split literal as well.
    split_series = data_series.str.split(re.escape(delimiter))

    # Cells that could not be split come back as NaN; replace them with an
    # empty list so every cell of the result holds a list.
    is_missing = split_series.isna()
    answers = [
        [] if missing else value
        for value, missing in zip(split_series, is_missing)
    ]
    return pd.Series(
        answers,
        index=data_series.index,
        name=data_series.name,
        dtype=object,
    )

# Preprocessing
## preprocess data

In [5]:
# Load the raw survey export; raw_df is kept as loaded so later cells can
# compare it against the processed frame.
raw_df = pd.read_csv(DATA_PATH)
# All preprocessing below is applied to this copy, not to raw_df.
df = raw_df.copy()

### 1 - replace values and parse

In [11]:
# Swap the textual answers for their numeric stand-ins, then cast the
# whole column to float32.
for column_name in REPLACE_DICT:
    numeric_values = df[column_name].replace(REPLACE_DICT[column_name])
    df[column_name] = numeric_values.astype(np.float32)
    

### 2 - split multiple answers

In [19]:
# Run every object-dtype column through split_answers; columns without a
# delimiter are handed back unchanged by that function.
obj_cols = list(df.select_dtypes(include="object").columns)
for column_name in obj_cols:
    df[column_name] = split_answers(df[column_name])

In [20]:
# List the columns that were passed through split_answers above.
print(obj_cols)

['MainBranch', 'Hobbyist', 'CompFreq', 'Country', 'CurrencyDesc', 'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith', 'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors', 'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith', 'MiscTechDesireNextYear', 'MiscTechWorkedWith', 'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps', 'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch', 'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms', 'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites', 'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear', 'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount', 'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength', 'Trans', 'UndergradMajor', 'WebframeDesireNextYear', 'WebframeWorkedWith', 'WelcomeChange']


## Visually verify results

In [26]:
# Spot-check one random row: raw string vs. the split list for DevType.
i = df.sample(1).index[0]
print(i)
# sample() returns an index *label*, so look the row up with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
print(raw_df['DevType'].loc[i])
print(df['DevType'].loc[i])

45904
Developer, back-end;Developer, full-stack;Developer, QA or test
['Developer, back-end', 'Developer, full-stack', 'Developer, QA or test']


In [32]:
# Spot-check one random row: raw answer vs. the parsed float for YearsCodePro.
i = df.sample(1).index[0]
# sample() returns an index *label*, so look the row up with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
print(raw_df['YearsCodePro'].loc[i])
print(df['YearsCodePro'].loc[i])

12
12.0


### 3 - export data

In [33]:
# Make sure the target folder exists before writing: to_pickle does not
# create missing directories, so a fresh checkout would otherwise fail here.
export_dir = os.path.dirname(EXPORT_PATH)
if export_dir:
    os.makedirs(export_dir, exist_ok=True)
df.to_pickle(EXPORT_PATH)

list