In [1]:
# Constants
# Input survey CSV and output pickle locations, relative to the notebook.
DATA_PATH   = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_df.pkl"

# Textual answers of the experience columns and the numbers they are recoded to.
_YEARS_TEXT_TO_NUMBER = {'Less than 1 year': 0, 'More than 50 years': 51}

# column name -> {old value: new value}; every column gets its own copy of the mapping.
REPLACE_DICT = {
    column: dict(_YEARS_TEXT_TO_NUMBER)
    for column in ('YearsCodePro', 'YearsCode')
}


In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle

--------------------------

# Functions

In [3]:
def split_answers(data_series, delimiter):
    """
    Split delimiter-separated answers into lists of individual answers.

    INPUT:
        data_series (pd.Series): one column of string answers; missing
            values are allowed
        delimiter (str): separator between the answers inside one cell
            (e.g. ";"); it is matched literally, never as a regular expression

    RETURNS:
        pd.Series: if no value contains the delimiter, the original series is
        returned as is (same object). Otherwise a new series in which every
        value is a list of answers; values for which splitting produced no
        list (e.g. missing values) become an empty list.
    """
    def is_splitable(pd_series, delimiter):
        """Return a boolean mask marking the values that contain the delimiter."""
        # regex=False: characters such as "." or "|" must not act as regex
        # metacharacters, otherwise they would "match" every string.
        # na=False: missing values count as "no delimiter", keeping the mask boolean.
        return pd_series.str.contains(delimiter, regex=False, na=False)

    def split_answer(pd_series, delimiter):
        """Split every string value on the literal delimiter."""
        # NOTE(review): the `regex` keyword of Series.str.split needs pandas >= 1.4.
        return pd_series.str.split(delimiter, regex=False)

    #-------------------------------

    # Check if multiple answers exist - if none: return original
    if not is_splitable(data_series, delimiter).any():
        return data_series

    # Else split each value to a list
    modified_answers = split_answer(data_series, delimiter)

    # Replace everything that did not become a list (null values) with a fresh
    # empty list. Done element-wise so it does not rely on index alignment.
    return modified_answers.apply(
        lambda answer: answer if isinstance(answer, list) else []
    )

# Processing 

## Preprocess Data 

In [4]:
# Read the survey export once and keep it untouched for later comparison.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# All preprocessing below is applied to an independent deep copy.
df = raw_df.copy(deep=True)

### 1.  Replace Values and parse

In [5]:
# Recode the textual answers listed in REPLACE_DICT, then cast each column to float32.
for col in REPLACE_DICT:
    replacement = REPLACE_DICT[col]
    df[col] = df[col].replace(to_replace=replacement).astype(dtype=np.float32)

### 2. Split multiple answers

In [6]:
# Every object-dtype column is a candidate for multi-answer splitting;
# split_answers hands back columns without the delimiter unchanged.
object_cols = list(df.select_dtypes(include="object").columns)
for col in object_cols:
    df[col] = split_answers(data_series=df[col], delimiter=";")

------------------

## Visually verify results 

In [13]:
# Spot-check one randomly drawn row (a different one on every run):
# raw answer first, split answer second.
# `.index[0]` is an index *label*, so the row is looked up with label-based
# `.loc`; positional `.iloc` only agrees while the index is the default 0..n-1 range.
i = df.sample().index[0]
print(raw_df["DevType"].loc[i])
print(df["DevType"].loc[i])

Developer, full-stack;DevOps specialist
['Developer, full-stack', 'DevOps specialist']


In [11]:
# Spot-check one randomly drawn row (a different one on every run):
# raw value first, recoded float value second.
# `.index[0]` is an index *label*, so the row is looked up with label-based
# `.loc`; positional `.iloc` only agrees while the index is the default 0..n-1 range.
i = df.sample(1).index[0]
print(raw_df['YearsCodePro'].loc[i])
print(df['YearsCodePro'].loc[i])

10
10.0


# Export Data 

In [9]:
# Persist the preprocessed frame as a pickle at EXPORT_PATH.
df.to_pickle(path=EXPORT_PATH)

----------------------