# **Data Preprocessing**

In [1]:
# Constants
# Location of the raw CSV input and of the pickle written at the end.
DATA_PATH = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_df.pkl"

# Column name -> {textual answer: numeric stand-in}. Both listed columns
# share the same two substitutions; each column gets its own inner dict.
REPLACE_DICT = {
    column: {"Less than 1 year": 0, "More than 50 years": 51}
    for column in ("YearsCode", "YearsCodePro")
}

In [2]:
# Load Packages
import pandas as pd 
import numpy as np 
import logging
import pickle

## **Functions**

In [3]:
def split_answers(data_series, delimiter=';'):
    """Split delimiter-separated answers of a column into lists.

    Parameters
    ----------
    data_series : pd.Series
        Column of answers. Must support the ``.str`` accessor
        (e.g. an object-dtype column of strings and missing values).
    delimiter : str, default ';'
        Separator between answers. It is always treated as a literal
        string, never as a regular expression.

    Returns
    -------
    pd.Series
        ``data_series`` itself, unchanged, when no value contains the
        delimiter. Otherwise a new Series (same index and name) in which
        every string is replaced by the list of its parts and every
        missing / non-string value is replaced by an empty list.
    """

    def to_answer_list(answer):
        # Missing values (NaN/None) and any other non-string entries carry
        # no answers, so they become an empty list.
        if isinstance(answer, str):
            return answer.split(delimiter)
        return []

    # regex=False: delimiters such as '|' or '.' must match literally,
    # otherwise they would match (almost) every string.
    # na=False: missing values count as "not splittable" so the mask is
    # purely boolean.
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)
    if not has_delimiter.any():
        return data_series

    return data_series.map(to_answer_list)
        

# **Processing**

## Processing Data - Main

In [4]:
# Read the raw CSV once; all processing below happens on an independent copy
# so `raw_df` stays available for comparison.
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.copy(deep=True)

### 1. Replace Values and Parse

In [5]:
# For each column listed in REPLACE_DICT, swap the textual answers for their
# numeric stand-ins and store the column as float32.
for col, replacement in REPLACE_DICT.items():
    df[col] = (
        df[col]
        .replace(replacement)
        .astype(np.float32)
    )

### 2. Split Multiple Answers

In [6]:
# Run every object-dtype column through split_answers and write the result
# back into the same column.
object_col = list(df.select_dtypes(include='object').columns)
for col in object_col:
    df[col] = split_answers(df[col])

---------------------------------------

## Testing & Visual Verification of Results

In [23]:
# Spot-check one random row: print the raw value next to the processed one.
# `sample(1).index[0]` is an index *label*, so the row is looked up with
# `.loc` (label-based) instead of `.iloc` (position-based); the two only
# coincide while the frame keeps its default 0..n-1 index.
i = df.sample(1).index[0]
print("Original Data of (LanguageHaveWorkedWith): ")
print(raw_df['LanguageHaveWorkedWith'].loc[i])
print("----------------------")
print('proceed Data of (LanguageHaveWorkedWith): ')
print(df['LanguageHaveWorkedWith'].loc[i])

Original Data of (LanguageHaveWorkedWith): 
Bash/Shell;HTML/CSS;Java;JavaScript;Kotlin;Node.js;PHP;Python;SQL;TypeScript
----------------------
proceed Data of (LanguageHaveWorkedWith): 
['Bash/Shell', 'HTML/CSS', 'Java', 'JavaScript', 'Kotlin', 'Node.js', 'PHP', 'Python', 'SQL', 'TypeScript']


In [27]:
# Spot-check one random row of DevType: raw value vs. processed value.
# The sampled value is an index *label*, hence `.loc` rather than the
# positional `.iloc`.
d = df.sample(1).index[0]
print('Original Data of (DevType): ')
print(raw_df['DevType'].loc[d])
print("----------------------------")
print("Proceed Data of (DevType): ")
print(df['DevType'].loc[d])

Original Data of (DevType): 
Developer, full-stack;Developer, QA or test;Developer, game or graphics
----------------------------
Proceed Data of (DevType): 
['Developer, full-stack', 'Developer, QA or test', 'Developer, game or graphics']


In [56]:
# Spot-check one random row of YearsCodePro: raw value vs. processed value.
# The sampled value is an index *label*, hence `.loc` rather than the
# positional `.iloc`.
y = df.sample(1).index[0]
print('Original Data of (YearsCodePro): ')
print(raw_df['YearsCodePro'].loc[y])
print("----------------------------")
print("Proceed Data of (YearsCodePro): ")
print(df['YearsCodePro'].loc[y])

Original Data of (YearsCodePro): 
2
----------------------------
Proceed Data of (YearsCodePro): 
2.0


# **EXPORT DATA**

In [57]:
# Persist the processed frame as a pickle file at EXPORT_PATH.
df.to_pickle(path=EXPORT_PATH)