In [54]:
# Notebook configuration: input/output locations and value recodings
DATA_PATH = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_df.pkl"

# Both experience columns share the same two textual answers; each one is
# mapped to a number so the columns can later be cast to a numeric dtype.
REPLACE_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCodePro', 'YearsCode')
}

In [3]:
# Needed packages
import pandas as pd
import numpy as np
import logging
import pickle

---
## Functions

In [18]:
def split_answers(data_series, delimiter=';'):
    """
    Split strings holding several delimiter-separated answers into lists
    of strings, each representing a single answer.

    Parameters:
    * data_series (pd.Series): String series with answers
    * delimiter (string): Literal separator between answers (it is NOT
      interpreted as a regular expression), defaults to ';'

    Returns:
    * (pd.Series) the original series, unchanged, if no value contains the
      delimiter; otherwise a new series in which every string is replaced
      by its list of answers and every missing / non-string value by an
      empty list
    """

    # Literal match (regex=False) so delimiters such as '.' or '|' are not
    # read as regex metacharacters that match every value; na=False makes
    # missing values count as "not splittable" instead of propagating NaN.
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)

    # No value holds multiple answers - return the original series
    if not has_delimiter.any():
        return data_series

    def to_answer_list(answer):
        """Split one answer string; anything that is not a string becomes []"""
        if isinstance(answer, str):
            return answer.split(delimiter)
        return []

    # Single pass: split strings and turn NAs into empty lists
    return data_series.map(to_answer_list)

---
## Processing

In [4]:
# Load the raw survey file, then work on an independent copy so that
# raw_df keeps the values exactly as they were read.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df = raw_df.copy(deep=True)

### 1. Replace values and parse

In [9]:
# Map the textual answers listed in REPLACE_DICT to numbers, then cast
# each affected column to 32-bit floats
for col, replacement in REPLACE_DICT.items():
    df[col] = (
        df[col]
        .replace(to_replace=replacement)
        .astype("float32")
    )

### 2. Split multiple answers

In [31]:
# Run every object-dtype column through split_answers; columns in which no
# value contains the delimiter are handed back unchanged by that function.
object_cols = (
    df.select_dtypes(include=["object"])
    .columns
    .tolist()
)
for col in object_cols:
    df[col] = split_answers(data_series=df[col])

---
## Visually verify results

In [40]:
# Spot-check one random row: raw value first, processed value second.
# sample(...).index[0] is an index *label*, so the row is looked up with
# the label-based .loc; positional .iloc only agrees with it while the
# frame still carries its default 0..n-1 index.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(df.loc[i, 'LanguageHaveWorkedWith'])

C#;HTML/CSS;JavaScript;SQL
['C#', 'HTML/CSS', 'JavaScript', 'SQL']


In [44]:
# Spot-check one random row: raw value first, processed value second.
# sample(...).index[0] is an index *label*, so the row is looked up with
# the label-based .loc; positional .iloc only agrees with it while the
# frame still carries its default 0..n-1 index.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'DevType'])
print(df.loc[i, 'DevType'])

Other (please specify):
['Other (please specify):']


In [53]:
# Spot-check one random row: raw value first, processed value second.
# sample(...).index[0] is an index *label*, so the row is looked up with
# the label-based .loc; positional .iloc only agrees with it while the
# frame still carries its default 0..n-1 index.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'YearsCodePro'])
print(df.loc[i, 'YearsCodePro'])

5
5.0


---
## Export Data

In [58]:
# Write the processed frame to EXPORT_PATH as a pickle file
df.to_pickle(path=EXPORT_PATH)