In [1]:
# imports
import pandas as pd
import numpy as np
import pickle

In [6]:
RAW_DATA_PATH = '../data/raw/survey_results_public.csv'

# Constants:
PROCESSED_DATA_PATH = '../data/processed'
TARGET_COL = 'DevType'

with open('../data/variables/possible_predictors.pkl', 'rb') as f:
    PREDICTOR_COLS = pickle.load(f)

### Helper functions:

In [7]:
# functions:
def check_multiple_answers(df, col):
    return df[col].str.contains(';').any() == True

def split_answers(df, col):
    return df[col].str.split(';')      

In [8]:
df_raw = pd.read_csv(RAW_DATA_PATH).loc[:, PREDICTOR_COLS ]
df_raw.sample(3, random_state=42).T


Unnamed: 0,76172,30324,25768
EdLevel,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Some college/university study without earning ...
YearsCode,5,11,4
YearsCodePro,7,9,Less than 1 year
DevType,"Developer, front-end;Developer, desktop or ent...","Developer, mobile;Academic researcher","Developer, desktop or enterprise applications;..."
OrgSize,100 to 499 employees,20 to 99 employees,20 to 99 employees
LanguageHaveWorkedWith,C;C#;PHP,Objective-C;Python;Swift,C#;HTML/CSS;SQL
LanguageWantToWorkWith,C;C#;HTML/CSS;JavaScript;TypeScript,Objective-C;Python;Swift,Bash/Shell;C#;F#;JavaScript;PowerShell;Python;SQL
DatabaseHaveWorkedWith,Firebase,SQLite,Microsoft SQL Server;MySQL
DatabaseWantToWorkWith,DynamoDB;Elasticsearch;Firebase;MongoDB;MySQL;...,SQLite,MariaDB;Microsoft SQL Server;Oracle
PlatformHaveWorkedWith,AWS;DigitalOcean,,


# Preprocessing:

## Data Type conversion
- Data type conversion: `YearsCode` , `YearsCodePro` from object to int16
    
    Issues: 'Less than 1 year' ,'More than 50 years' , nan

In [9]:
df_processed = df_raw.copy()
# replacing 'Less than 1 year' with 0

df_processed.loc[df_processed['YearsCode'] == 'Less than 1 year', 'YearsCode'] = '0'
df_processed.loc[df_processed['YearsCodePro'] == 'Less than 1 year', 'YearsCodePro'] = '0'

# replacing 'More than 50 years' with 51
df_processed.loc[df_processed['YearsCode'] == 'More than 50 years', 'YearsCode'] = '51'
df_processed.loc[df_processed['YearsCodePro'] == 'More than 50 years', 'YearsCodePro'] = '51'


# converting to numeric
df_processed['YearsCode'] = df_processed['YearsCode'].astype(np.float32)
df_processed['YearsCodePro'] = df_processed['YearsCodePro'].astype(np.float32)

In [10]:
df_processed.dtypes

EdLevel                          object
YearsCode                       float32
YearsCodePro                    float32
DevType                          object
OrgSize                          object
LanguageHaveWorkedWith           object
LanguageWantToWorkWith           object
DatabaseHaveWorkedWith           object
DatabaseWantToWorkWith           object
PlatformHaveWorkedWith           object
PlatformWantToWorkWith           object
WebframeHaveWorkedWith           object
WebframeWantToWorkWith           object
MiscTechHaveWorkedWith           object
MiscTechWantToWorkWith           object
ToolsTechHaveWorkedWith          object
ToolsTechWantToWorkWith          object
NEWCollabToolsHaveWorkedWith     object
NEWCollabToolsWantToWorkWith     object
OpSys                            object
dtype: object

## Needed preprocessing:
-  Split multiple answers `;`

In [14]:
cat_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    print(f'Column {col} :')
    if check_multiple_answers(df_processed, col):
        print('Contains multiple answers')
        modified_series = split_answers(df_processed, col)
        mask_null = modified_series.isnull()
        modified_series.loc[mask_null] = modified_series.loc[mask_null].apply(lambda x: [])
        df_processed[col] = modified_series
        print('Splited')
    else:
        print('No multiple answers')
    print(f'\n','---'*5)
print('Done')


Column EdLevel :
No multiple answers

 ---------------
Column DevType :
No multiple answers

 ---------------
Column OrgSize :
No multiple answers

 ---------------
Column LanguageHaveWorkedWith :
No multiple answers

 ---------------
Column LanguageWantToWorkWith :
No multiple answers

 ---------------
Column DatabaseHaveWorkedWith :
No multiple answers

 ---------------
Column DatabaseWantToWorkWith :
No multiple answers

 ---------------
Column PlatformHaveWorkedWith :
No multiple answers

 ---------------
Column PlatformWantToWorkWith :
No multiple answers

 ---------------
Column WebframeHaveWorkedWith :
No multiple answers

 ---------------
Column WebframeWantToWorkWith :
No multiple answers

 ---------------
Column MiscTechHaveWorkedWith :
No multiple answers

 ---------------
Column MiscTechWantToWorkWith :
No multiple answers

 ---------------
Column ToolsTechHaveWorkedWith :
No multiple answers

 ---------------
Column ToolsTechWantToWorkWith :
No multiple answers

 ---------

##### Verifying results

In [15]:
index = df_processed.sample(3).index
display(
    df_raw.loc[index, :] , 
    df_processed.loc[index, :]
)

Unnamed: 0,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,PlatformHaveWorkedWith,PlatformWantToWorkWith,WebframeHaveWorkedWith,WebframeWantToWorkWith,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSys
61918,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",25,20.0,"Developer, desktop or enterprise applications;...",100 to 499 employees,HTML/CSS;JavaScript;Python;Ruby,Elixir;Go;HTML/CSS;Julia;Ruby,PostgreSQL;Redis,Cassandra;Elasticsearch;PostgreSQL;Redis,AWS;DigitalOcean,AWS;DigitalOcean;Google Cloud Platform,Flask;Vue.js,Flask;Svelte;Vue.js,,,Docker;Git,Docker;Git;Kubernetes;Terraform,Vim,IPython/Jupyter;Neovim;Vim,MacOS
12027,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10,,Student,,C;C++;Java;Python,C;C++;Python,,,,,,,,,Git,Git,IntelliJ;Vim;Visual Studio;Visual Studio Code,Vim;Visual Studio Code,Windows
68485,Some college/university study without earning ...,6,,,,C;C#;Delphi;HTML/CSS;Java;Matlab;Python;SQL,C;C#;Matlab;Python,MySQL,,,,,,.NET Framework;NumPy;Qt,,Git;Unity 3D,Git;Unity 3D,Eclipse;Notepad++;Visual Studio;Visual Studio ...,Eclipse;Notepad++;Visual Studio;Visual Studio ...,Windows


Unnamed: 0,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,PlatformHaveWorkedWith,PlatformWantToWorkWith,WebframeHaveWorkedWith,WebframeWantToWorkWith,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSys
61918,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",25.0,20.0,"[Developer, desktop or enterprise applications...",100 to 499 employees,"[HTML/CSS, JavaScript, Python, Ruby]","[Elixir, Go, HTML/CSS, Julia, Ruby]","[PostgreSQL, Redis]","[Cassandra, Elasticsearch, PostgreSQL, Redis]","[AWS, DigitalOcean]","[AWS, DigitalOcean, Google Cloud Platform]","[Flask, Vue.js]","[Flask, Svelte, Vue.js]",[],[],"[Docker, Git]","[Docker, Git, Kubernetes, Terraform]",[Vim],"[IPython/Jupyter, Neovim, Vim]",MacOS
12027,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10.0,,[Student],,"[C, C++, Java, Python]","[C, C++, Python]",[],[],[],[],[],[],[],[],[Git],[Git],"[IntelliJ, Vim, Visual Studio, Visual Studio C...","[Vim, Visual Studio Code]",Windows
68485,Some college/university study without earning ...,6.0,,[],,"[C, C#, Delphi, HTML/CSS, Java, Matlab, Python...","[C, C#, Matlab, Python]",[MySQL],[],[],[],[],[],"[.NET Framework, NumPy, Qt]",[],"[Git, Unity 3D]","[Git, Unity 3D]","[Eclipse, Notepad++, Visual Studio, Visual Stu...","[Eclipse, Notepad++, Visual Studio, Visual Stu...",Windows


### Export Data

In [16]:
df_processed.to_pickle(f'{PROCESSED_DATA_PATH}/1_preprocessed_df.pkl')