In [184]:
import re
import numpy as np
import pandas as pd
import pickle

In [185]:
pd.options.display.max_columns=None
pd.options.display.max_rows=100

In [186]:
df_unido = pd.read_csv('../datos/datos_unidos_raw.csv', index_col = 0)

## Limpieza

In [187]:
def limpiar_espacios(columna):  
    try:

        patron_espacios = "\s*,\S+"
        return re.sub(patron_espacios, ", ", columna)

    except:
        return np.nan

In [188]:
def separar_comas(columna):
    try:
        patron_comas = '(,\s)a'
        return re.sub(patron_comas, "; a", columna)
    except:
        return np.nan

In [189]:
df_unido['Q24'] = df_unido['Q24'].apply(separar_comas)

In [190]:
def separar_comas2(columna):
    try:
        patron = r'(\(.*),(\s.*),(\s.*\))'
        return re.sub(patron, r'\1;\2;\3', columna)
    except:
        return np.nan


In [191]:
df_unido['Q9'] = df_unido['Q9'].apply(separar_comas2)

## Limpieza: Eliminamos columnas

In [192]:
def eliminar_etiqueta (dataframe, *lista_columna):
    #función para eliminar las columnas en las que no nos vamos a centrar en el estudio, por etiqueta/nombre de columna, 
    #los parámetros que coge son dataframe y una lista de nombres de las columnas de ese dataframe
    for columna in dataframe.columns:
        for elemento  in lista_columna:
            if elemento == columna:
                #esta función itera por nuestra lista_columna (el arg) y comprueba si el elemento (la etiqueta a eliminar)
                # se encuentra en cada columna, y, de ser así, la eliminará usando el método .drop()
                dataframe.drop([columna],axis = 1, inplace = True)
    return dataframe.columns

In [193]:
# eliminamos columnas sueltas
eliminar_etiqueta(df_unido, *['level_0', 'index', 'time', 'Q4', 'Q11', 'Q12', 'Q13', 'Q16', 'Q17', 'Q20', 'Q21'])

Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q23', 'Q25',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=249)

In [194]:
# función para eliminar las columnas en las que no nos vamos a centrar en el estudio, por rango 
def eliminar_indices (dataframe, columna1, columna2):
    #creamos un diccionario de los índices por columna de nuestro df
    diccionario = {}
    #los nombres de las columnas serán nuestras keys para poder acceder a los índices usando los nombres de las columnas (los parámetros)
    for indice, col in enumerate(df_unido.columns):
        diccionario.update({col: indice})
    print(diccionario)
    #usamos el diccionario para obtener los índices en el rango 
    if columna1 in diccionario:
        indice1 = diccionario.get(columna1)
    else: 
        print('Columna1 no está en el diccionario')
    if columna2 in diccionario:
    #usamos el +1 para poder acceder al último índice
        indice2 = diccionario.get(columna2) + 1
    else: 
        print('Columna2 no está en el diccionario')
        #usamos los índices para borrar el rango de columnas y así eliminiar varias a la vez
    for elemento in dataframe.columns[indice1:indice2]:    
        dataframe.drop([elemento],axis = 1, inplace = True)
    return dataframe.columns

Con la función nos vamos deshaciendo en rangos de las columnas que no nos aportan información con valor. 

In [195]:
eliminar_indices(df_unido, 'Q23', 'Q35')

{'age': 0, 'gender': 1, 'Q3': 2, 'Q5': 3, 'Q6': 4, 'Q8': 5, 'Q15': 6, 'Q22': 7, 'Q23': 8, 'Q25': 9, 'Q26': 10, 'Q33': 11, 'Q35': 12, 'Q41': 13, 'Q7': 14, 'Q9': 15, 'Q14': 16, 'Q24': 17, 'Q32': 18, 'Q34': 19, 'Q10_Part_1': 20, 'Q10_Part_2': 21, 'Q10_Part_3': 22, 'Q10_Part_4': 23, 'Q10_Part_5': 24, 'Q10_Part_6': 25, 'Q10_Part_7': 26, 'Q10_Part_8': 27, 'Q10_Part_9': 28, 'Q10_Part_10': 29, 'Q10_Part_11': 30, 'Q10_Part_12': 31, 'Q10_Part_13': 32, 'Q10_Part_14': 33, 'Q10_Part_15': 34, 'Q10_Part_16': 35, 'Q10_OTHER': 36, 'Q18_Part_1': 37, 'Q18_Part_2': 38, 'Q18_Part_3': 39, 'Q18_Part_4': 40, 'Q18_Part_5': 41, 'Q18_Part_6': 42, 'Q18_OTHER': 43, 'Q19_Part_1': 44, 'Q19_Part_2': 45, 'Q19_Part_3': 46, 'Q19_Part_4': 47, 'Q19_Part_5': 48, 'Q19_OTHER': 49, 'Q27_A_Part_1': 50, 'Q27_A_Part_2': 51, 'Q27_A_Part_3': 52, 'Q27_A_Part_4': 53, 'Q27_A_Part_5': 54, 'Q27_A_Part_6': 55, 'Q27_A_Part_7': 56, 'Q27_A_Part_8': 57, 'Q27_A_Part_9': 58, 'Q27_A_Part_10': 59, 'Q27_A_Part_11': 60, 'Q27_A_OTHER': 61, 'Q28': 

Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=244)

In [196]:
eliminar_indices(df_unido, 'Q18_Part_1', 'Q38_A_OTHER')

{'age': 0, 'gender': 1, 'Q3': 2, 'Q5': 3, 'Q6': 4, 'Q8': 5, 'Q15': 6, 'Q22': 7, 'Q41': 8, 'Q7': 9, 'Q9': 10, 'Q14': 11, 'Q24': 12, 'Q32': 13, 'Q34': 14, 'Q10_Part_1': 15, 'Q10_Part_2': 16, 'Q10_Part_3': 17, 'Q10_Part_4': 18, 'Q10_Part_5': 19, 'Q10_Part_6': 20, 'Q10_Part_7': 21, 'Q10_Part_8': 22, 'Q10_Part_9': 23, 'Q10_Part_10': 24, 'Q10_Part_11': 25, 'Q10_Part_12': 26, 'Q10_Part_13': 27, 'Q10_Part_14': 28, 'Q10_Part_15': 29, 'Q10_Part_16': 30, 'Q10_OTHER': 31, 'Q18_Part_1': 32, 'Q18_Part_2': 33, 'Q18_Part_3': 34, 'Q18_Part_4': 35, 'Q18_Part_5': 36, 'Q18_Part_6': 37, 'Q18_OTHER': 38, 'Q19_Part_1': 39, 'Q19_Part_2': 40, 'Q19_Part_3': 41, 'Q19_Part_4': 42, 'Q19_Part_5': 43, 'Q19_OTHER': 44, 'Q27_A_Part_1': 45, 'Q27_A_Part_2': 46, 'Q27_A_Part_3': 47, 'Q27_A_Part_4': 48, 'Q27_A_Part_5': 49, 'Q27_A_Part_6': 50, 'Q27_A_Part_7': 51, 'Q27_A_Part_8': 52, 'Q27_A_Part_9': 53, 'Q27_A_Part_10': 54, 'Q27_A_Part_11': 55, 'Q27_A_OTHER': 56, 'Q28': 57, 'Q29_A_Part_1': 58, 'Q29_A_Part_2': 59, 'Q29_A_Part

Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=167)

In [197]:
eliminar_indices(df_unido, 'Q40_Part_1', 'Q38_B_OTHER')

{'age': 0, 'gender': 1, 'Q3': 2, 'Q5': 3, 'Q6': 4, 'Q8': 5, 'Q15': 6, 'Q22': 7, 'Q41': 8, 'Q7': 9, 'Q9': 10, 'Q14': 11, 'Q24': 12, 'Q32': 13, 'Q34': 14, 'Q10_Part_1': 15, 'Q10_Part_2': 16, 'Q10_Part_3': 17, 'Q10_Part_4': 18, 'Q10_Part_5': 19, 'Q10_Part_6': 20, 'Q10_Part_7': 21, 'Q10_Part_8': 22, 'Q10_Part_9': 23, 'Q10_Part_10': 24, 'Q10_Part_11': 25, 'Q10_Part_12': 26, 'Q10_Part_13': 27, 'Q10_Part_14': 28, 'Q10_Part_15': 29, 'Q10_Part_16': 30, 'Q10_OTHER': 31, 'Q39_Part_1': 32, 'Q39_Part_2': 33, 'Q39_Part_3': 34, 'Q39_Part_4': 35, 'Q39_Part_5': 36, 'Q39_Part_6': 37, 'Q39_Part_7': 38, 'Q39_Part_8': 39, 'Q39_Part_9': 40, 'Q39_OTHER': 41, 'Q40_Part_1': 42, 'Q40_Part_2': 43, 'Q40_Part_3': 44, 'Q40_Part_4': 45, 'Q40_Part_5': 46, 'Q40_Part_6': 47, 'Q40_Part_7': 48, 'Q40_Part_8': 49, 'Q40_Part_9': 50, 'Q40_Part_10': 51, 'Q40_Part_11': 52, 'Q40_OTHER': 53, 'Q42_Part_1': 54, 'Q42_Part_2': 55, 'Q42_Part_3': 56, 'Q42_Part_4': 57, 'Q42_Part_5': 58, 'Q42_Part_6': 59, 'Q42_Part_7': 60, 'Q42_Part_8':

Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       'Q9', 'Q14', 'Q24', 'Q32', 'Q34', 'Q10_Part_1', 'Q10_Part_2',
       'Q10_Part_3', 'Q10_Part_4', 'Q10_Part_5', 'Q10_Part_6', 'Q10_Part_7',
       'Q10_Part_8', 'Q10_Part_9', 'Q10_Part_10', 'Q10_Part_11', 'Q10_Part_12',
       'Q10_Part_13', 'Q10_Part_14', 'Q10_Part_15', 'Q10_Part_16', 'Q10_OTHER',
       'Q39_Part_1', 'Q39_Part_2', 'Q39_Part_3', 'Q39_Part_4', 'Q39_Part_5',
       'Q39_Part_6', 'Q39_Part_7', 'Q39_Part_8', 'Q39_Part_9', 'Q39_OTHER'],
      dtype='object')

In [198]:
for i in df_unido.columns:
    print(i)

age
gender
Q3
Q5
Q6
Q8
Q15
Q22
Q41
Q7
Q9
Q14
Q24
Q32
Q34
Q10_Part_1
Q10_Part_2
Q10_Part_3
Q10_Part_4
Q10_Part_5
Q10_Part_6
Q10_Part_7
Q10_Part_8
Q10_Part_9
Q10_Part_10
Q10_Part_11
Q10_Part_12
Q10_Part_13
Q10_Part_14
Q10_Part_15
Q10_Part_16
Q10_OTHER
Q39_Part_1
Q39_Part_2
Q39_Part_3
Q39_Part_4
Q39_Part_5
Q39_Part_6
Q39_Part_7
Q39_Part_8
Q39_Part_9
Q39_OTHER


## Cambiar nombres de las columnas

In [199]:
df_limpio = df_unido.copy()

In [200]:
diccionario_nombres = {'age':'age',
 'gender':'gender',
 'Q3':'country',
 'Q5':'job_title',
 'Q6':'years_programming',
 'Q7':'dev_language',
 'Q8':'first_language_rec',
 'Q9':'IDE',
 'Q10_Part_1':'notebooks_KaggleNotebooks',
 'Q10_Part_2':'notebooks_ColabNotebooks',
 'Q10_Part_3':'notebooks_AzureNotebooks',
 'Q10_Part_4':'notebooks_Paperspace/Gradient',
 'Q10_Part_5':'notebooks_Binder/JupyterHub',
 'Q10_Part_6':'notebooks_CodeOcean',
 'Q10_Part_7':'notebooks_IBMWatsonStudio',
 'Q10_Part_8':'notebooks_AmazonSagemakerStudioNotebooks',
 'Q10_Part_9':'notebooks_AmazonEMRNotebooks',
 'Q10_Part_10':'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)',
 'Q10_Part_11':'notebooks_GoogleCloudDatalab',
 'Q10_Part_12':'notebooks_DatabricksCollaborativeNotebooks',
 'Q10_Part_13':'notebooks_Zeppelin/ZeplNotebooks',
 'Q10_Part_14':'notebooks_DeepnoteNotebooks',
 'Q10_Part_15':'notebooks_ObservableNotebooks',
 'Q10_Part_16':'notebooks_None',
 'Q10_OTHER':'notebooks_Other',
 'Q14':'visualisation',
 'Q41':'primary_data_tool',
 'Q15':'ML',
 'Q22':'size_DA_dept',
 'Q24':'work_activities',
 'Q32':'big_data',
 'Q34':'BI_tools',
 'Q39_Part_1':'sharing_PlotlyDash',
 'Q39_Part_2':'sharing_Streamlit',
 'Q39_Part_3':'sharing_NBViewer',
 'Q39_Part_4':'sharing_GitHub',
 'Q39_Part_5':'sharing_Personalblog',
 'Q39_Part_6':'sharing_Kaggle',
 'Q39_Part_7':'sharing_Colab',
 'Q39_Part_8':'sharing_Shiny',
 'Q39_Part_9':'sharing_does_not_share',
 'Q39_OTHER':'sharing_Other'}

In [201]:
df_limpio.rename(columns=diccionario_nombres, inplace=True)
df_limpio.head(2)

Unnamed: 0,age,gender,country,job_title,years_programming,first_language_rec,ML,size_DA_dept,primary_data_tool,dev_language,IDE,visualisation,work_activities,big_data,BI_tools,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-54,Man,India,Other,5-10 years,Python,5-10 years,3-4,"Local development environments (RStudio, Jupyt...","Python, R",Vim / Emacs,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",None of these activities are an important par...,PostgreSQL,,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,
1,50-54,Man,Indonesia,Program/Project Manager,20+ years,Python,< 1 year,1-2,"Advanced statistical software (SPSS, SAS, etc.)","SQL, C, C++, Java","Notepad++,Jupyter Notebook",Matplotlib,Build and/or run the data infrastructure that...,,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,


## Creamos columnas de categorias

In [202]:
lista_nombres_nuevos = ['age', 'gender', 'country', 'job_title', 'years_programming', 'dev_language', 'first_language_rec', 'IDE', 'notebooks', 'visualisation', 'ML', 'size_DA_dept', 'work_activities', 'big_data', 'BI_tools', 'sharing']

In [203]:
dicc_continentes = {'Europe': ['Greece', 'Belgium', 'Poland', 'Italy', 'Spain', 'United Kingdom of Great Britain and Northern Ireland', 'France',
                            'Switzerland', 'Sweden', 'Netherlands', 'Ukraine', 'Romania', 'Austria', 'Belarus', 'Ireland',
                            'Portugal', 'Denmark', 'Germany', 'Norway', 'Czech Republic'],
                    'Asia & Oceania': ['Australia', 'India', 'Indonesia', 'Pakistan', 'Russia', 'Turkey', 'Japan', 'Singapore', 'China', 'Iran, Islamic Republic of...', 
                             'Viet Nam', 'Israel', 'Bangladesh', 'Saudi Arabia', 'Taiwan', 'Hong Kong (S.A.R.)', 'South Korea', 'Philippines', 'Sri Lanka', 
                             'United Arab Emirates', 'Malaysia', 'Thailand', 'Nepal', 'Kazakhstan', 'Iraq'],
                    'America': ['Mexico', 'Brazil', 'United States of America',
                                'Peru', 'Argentina', 'Colombia', 'Canada', 'Chile', 'Ecuador'],
                    'Africa':['Uganda', 'Ghana','Algeria', 'Tunisia', 'South Africa', 'Nigeria', 'Kenya','Egypt', 'Ethiopia', 'Morocco'],
                    'Other' : ['Other', 'I do not wish to disclose my location']}

In [204]:
def categoria(col, dicc):
    try:
        for key, value in dicc.items():
                if col in value:
                    return key
                else:
                    pass
    except:
        pass
            

In [205]:
df_limpio['continent'] = df_limpio.apply(lambda x: categoria(x['country'], dicc_continentes), axis=1)

In [206]:
dicc_experiencia = {'Sin experiencia':['I have never written code'],
                    'Junior':['< 1 years', '1-3 years'],
                    'Senior': ['3-5 years', '5-10 years', '10-20 years', '20+ years']}

In [207]:
df_limpio['programming_experience'] = df_limpio.apply(lambda x: categoria(x['years_programming'], dicc_experiencia), axis=1)

In [208]:
dicc_dept_size = {'small':['0', '1-2', '3-4'],
                  'medium':['5-9', '10-14'],
                  'large':['15-19', '20+']}

In [209]:
df_limpio['dept size'] = df_limpio.apply(lambda x: categoria(x['size_DA_dept'], dicc_dept_size), axis=1)

In [210]:
df_limpio['ML'].value_counts()

< 1 year                                 9163
1-2 years                                4675
I do not use machine learning methods    3889
2-3 years                                2305
3-4 years                                1171
5-10 years                               1033
4-5 years                                 945
10-20 years                               362
20 or more years                          211
Name: ML, dtype: int64

In [211]:
df_limpio['age'].value_counts()

25-29    4931
18-21    4901
22-24    4694
30-34    3441
35-39    2504
40-44    1890
45-49    1375
50-54     964
55-59     592
60-69     553
70+       128
Name: age, dtype: int64

In [212]:
dicc_ML = {'None': ['I do not use machine learning methods'],
           '< 2 years': ['< 1 year', '1-2 years'],
           '2-5 years': ['2-3 years', '3-4 years', '4-5 years'],
           '5+ years': ['5-10 years', '10-20 years', '20 or more years']}

In [213]:
df_limpio['ML_experience'] = df_limpio.apply(lambda x: categoria(x['ML'], dicc_ML), axis=1)

In [214]:
dicc_edad = {'18-29':['18-21', '22-24', '25-29'],
             '30-39':['30-34', '35-39'],
             '40-49':['40-44', '45-49'],
             '50-59':['50-54', '55-59'],
             '60+':['60-69', '70+']}

In [215]:
df_limpio['age'] = df_limpio.apply(lambda x: categoria(x['age'], dicc_edad), axis=1)

In [216]:
df_limpio.head(3)

Unnamed: 0,age,gender,country,job_title,years_programming,first_language_rec,ML,size_DA_dept,primary_data_tool,dev_language,IDE,visualisation,work_activities,big_data,BI_tools,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,continent,programming_experience,dept size,ML_experience
0,50-59,Man,India,Other,5-10 years,Python,5-10 years,3-4,"Local development environments (RStudio, Jupyt...","Python, R",Vim / Emacs,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",None of these activities are an important par...,PostgreSQL,,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,,Asia & Oceania,Senior,small,5+ years
1,50-59,Man,Indonesia,Program/Project Manager,20+ years,Python,< 1 year,1-2,"Advanced statistical software (SPSS, SAS, etc.)","SQL, C, C++, Java","Notepad++,Jupyter Notebook",Matplotlib,Build and/or run the data infrastructure that...,,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,Asia & Oceania,Senior,small,< 2 years
2,18-29,Man,Pakistan,Software Engineer,1-3 years,Python,I do not use machine learning methods,0,"Basic statistical software (Microsoft Excel, G...","Python, C++, Java","PyCharm ,Jupyter Notebook, Other",Matplotlib,None of these activities are an important par...,"MySQL , MongoDB",,Kaggle Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,I do not share my work publicly,,Asia & Oceania,Junior,small,


## Reordenamos las columnas

In [217]:
df_limpio.columns

Index(['age', 'gender', 'country', 'job_title', 'years_programming',
       'first_language_rec', 'ML', 'size_DA_dept', 'primary_data_tool',
       'dev_language', 'IDE', 'visualisation', 'work_activities', 'big_data',
       'BI_tools', 'notebooks_KaggleNotebooks', 'notebooks_ColabNotebooks',
       'notebooks_AzureNotebooks', 'notebooks_Paperspace/Gradient',
       'notebooks_Binder/JupyterHub', 'notebooks_CodeOcean',
       'notebooks_IBMWatsonStudio', 'notebooks_AmazonSagemakerStudioNotebooks',
       'notebooks_AmazonEMRNotebooks',
       'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)',
       'notebooks_GoogleCloudDatalab',
       'notebooks_DatabricksCollaborativeNotebooks',
       'notebooks_Zeppelin/ZeplNotebooks', 'notebooks_DeepnoteNotebooks',
       'notebooks_ObservableNotebooks', 'notebooks_None', 'notebooks_Other',
       'sharing_PlotlyDash', 'sharing_Streamlit', 'sharing_NBViewer',
       'sharing_GitHub', 'sharing_Personalblog', 'sharing_Kaggle',
       'sharing

In [218]:
nuevo_orden = ['age',
 'gender',
 'country',
 'continent',
 'job_title',
 'work_activities',
  'size_DA_dept',
  'dept size',
 'years_programming',
 'programming_experience',
 'first_language_rec',
 'dev_language',
 'primary_data_tool',
 'IDE',
 'big_data',
 'BI_tools',
 'visualisation',
 'ML',
 'ML_experience',
 'notebooks_KaggleNotebooks',
 'notebooks_ColabNotebooks',
 'notebooks_AzureNotebooks',
 'notebooks_Paperspace/Gradient',
 'notebooks_Binder/JupyterHub',
 'notebooks_CodeOcean',
 'notebooks_IBMWatsonStudio',
 'notebooks_AmazonSagemakerStudioNotebooks',
 'notebooks_AmazonEMRNotebooks',
 'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)',
 'notebooks_GoogleCloudDatalab',
 'notebooks_DatabricksCollaborativeNotebooks',
 'notebooks_Zeppelin/ZeplNotebooks',
 'notebooks_DeepnoteNotebooks',
 'notebooks_ObservableNotebooks',
 'notebooks_None',
 'notebooks_Other',
 'sharing_PlotlyDash',
 'sharing_Streamlit',
 'sharing_NBViewer',
 'sharing_GitHub',
 'sharing_Personalblog',
 'sharing_Kaggle',
 'sharing_Colab',
 'sharing_Shiny',
 'sharing_does_not_share',
 'sharing_Other'
 ]

In [219]:
df_limpio = df_limpio.reindex(columns=nuevo_orden)
df_limpio.head(1)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,first_language_rec,dev_language,primary_data_tool,IDE,big_data,BI_tools,visualisation,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,Python,"Python, R","Local development environments (RStudio, Jupyt...",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,


In [220]:
df_limpio.to_csv('../datos/datos_limpios.csv')

# Juntamos las columnas partidas

In [221]:
df_limpio = pd.read_csv('../datos/datos_limpios.csv', index_col=0)

In [222]:
for i, col in enumerate(df_junto.columns):
    print(i, col)

0 age
1 gender
2 country
3 continent
4 job_title
5 work_activities
6 size_DA_dept
7 dept size
8 years_programming
9 programming_experience
10 primary_data_tool
11 first_language_rec
12 dev_language
13 IDE
14 big_data
15 BI_tools
16 visualisation
17 notebooks
18 sharing
19 ML
20 ML_experience


In [223]:
def juntar_columnas(dataframe, columna_inicio, columna_final, nueva_columna):
    
    diccionario = {}

    for i, f in dataframe.loc[:, columna_inicio:columna_final].iterrows():
        diccionario[i] = []
        
        for e in f:
            if type(e) == str:
                diccionario[i].append(e)
    
    dataframe[nueva_columna] = diccionario.values()
    dataframe[nueva_columna] = dataframe[nueva_columna].apply(lambda y: np.nan if len(y)==0 else ','.join(y))

In [224]:
juntar_columnas(df_limpio, 'notebooks_KaggleNotebooks', 'notebooks_Other', 'notebooks')

In [225]:
juntar_columnas(df_limpio, 'sharing_PlotlyDash', 'sharing_Other', 'sharing')

In [226]:
print(df_limpio.columns.tolist())

['age', 'gender', 'country', 'continent', 'job_title', 'work_activities', 'size_DA_dept', 'dept size', 'years_programming', 'programming_experience', 'first_language_rec', 'dev_language', 'primary_data_tool', 'IDE', 'big_data', 'BI_tools', 'visualisation', 'ML', 'ML_experience', 'notebooks_KaggleNotebooks', 'notebooks_ColabNotebooks', 'notebooks_AzureNotebooks', 'notebooks_Paperspace/Gradient', 'notebooks_Binder/JupyterHub', 'notebooks_CodeOcean', 'notebooks_IBMWatsonStudio', 'notebooks_AmazonSagemakerStudioNotebooks', 'notebooks_AmazonEMRNotebooks', 'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)', 'notebooks_GoogleCloudDatalab', 'notebooks_DatabricksCollaborativeNotebooks', 'notebooks_Zeppelin/ZeplNotebooks', 'notebooks_DeepnoteNotebooks', 'notebooks_ObservableNotebooks', 'notebooks_None', 'notebooks_Other', 'sharing_PlotlyDash', 'sharing_Streamlit', 'sharing_NBViewer', 'sharing_GitHub', 'sharing_Personalblog', 'sharing_Kaggle', 'sharing_Colab', 'sharing_Shiny', 'sharing_does_no

In [227]:
nuevo_orden2 = ['age',
 'gender',
 'country',
 'continent',
 'job_title',
 'work_activities',
  'size_DA_dept',
  'dept size',
 'years_programming',
 'programming_experience',
 'primary_data_tool',
 'first_language_rec',
 'dev_language',
 'IDE',
 'big_data',
 'BI_tools',
 'visualisation',
 'notebooks', 
 'sharing',
 'ML',
 'ML_experience',
 'notebooks_KaggleNotebooks',
 'notebooks_ColabNotebooks',
 'notebooks_AzureNotebooks',
 'notebooks_Paperspace/Gradient',
 'notebooks_Binder/JupyterHub',
 'notebooks_CodeOcean',
 'notebooks_IBMWatsonStudio',
 'notebooks_AmazonSagemakerStudioNotebooks',
 'notebooks_AmazonEMRNotebooks',
 'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)',
 'notebooks_GoogleCloudDatalab',
 'notebooks_DatabricksCollaborativeNotebooks',
 'notebooks_Zeppelin/ZeplNotebooks',
 'notebooks_DeepnoteNotebooks',
 'notebooks_ObservableNotebooks',
 'notebooks_None',
 'notebooks_Other',
 'sharing_PlotlyDash',
 'sharing_Streamlit',
 'sharing_NBViewer',
 'sharing_GitHub',
 'sharing_Personalblog',
 'sharing_Kaggle',
 'sharing_Colab',
 'sharing_Shiny',
 'sharing_does_not_share',
 'sharing_Other'
 ]

In [228]:
df_limpio = df_limpio.reindex(columns=nuevo_orden2)
df_limpio.head(1)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,


In [229]:
df_limpio.to_csv('../datos/datos_limpios.csv')

In [230]:
df_junto = df_limpio.drop(columns = ['notebooks_KaggleNotebooks', 
                          'notebooks_ColabNotebooks', 
                          'notebooks_AzureNotebooks', 
                          'notebooks_Paperspace/Gradient', 
                          'notebooks_Binder/JupyterHub', 
                          'notebooks_CodeOcean', 
                          'notebooks_IBMWatsonStudio', 
                          'notebooks_AmazonSagemakerStudioNotebooks', 
                          'notebooks_AmazonEMRNotebooks', 
                          'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)', 
                          'notebooks_GoogleCloudDatalab', 
                          'notebooks_DatabricksCollaborativeNotebooks', 
                          'notebooks_Zeppelin/ZeplNotebooks', 
                          'notebooks_DeepnoteNotebooks', 
                          'notebooks_ObservableNotebooks', 
                          'notebooks_None', 
                          'notebooks_Other', 
                          'sharing_PlotlyDash', 
                          'sharing_Streamlit', 
                          'sharing_NBViewer', 
                          'sharing_GitHub', 
                          'sharing_Personalblog', 
                          'sharing_Kaggle', 
                          'sharing_Colab', 
                          'sharing_Shiny', 
                          'sharing_does_not_share', 
                          'sharing_Other'], axis=1)

In [231]:
df_junto.head(1)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years


In [232]:
df_junto.replace(r' ,', ',', regex=True)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib,Seaborn,Ggplot / ggplot2,Shiny,Lea...",Colab Notebooks,"GitHub, Kaggle",5-10 years,5+ years
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years
2,18-29,Man,Pakistan,Asia & Oceania,Software Engineer,None of these activities are an important par...,0,small,1-3 years,Junior,"Basic statistical software (Microsoft Excel, G...",Python,"Python, C++, Java","PyCharm,Jupyter Notebook, Other","MySQL, MongoDB",,Matplotlib,Kaggle Notebooks,I do not share my work publicly,I do not use machine learning methods,
3,40-49,Man,Mexico,America,Research Scientist,Do research that advances the state of the ar...,0,small,20+ years,Senior,"Local development environments (RStudio, Jupyt...",Python,Python,"Spyder,Jupyter Notebook",,,Matplotlib,Colab Notebooks,,5-10 years,5+ years
4,40-49,Man,India,Asia & Oceania,Other,Analyze and understand data to influence produ...,5-9,medium,< 1 years,Junior,"Local development environments (RStudio, Jupyt...",Python,"Python, C, MATLAB","Spyder,MATLAB,Jupyter Notebook",,Microsoft Power BI,"Matplotlib,Seaborn,Ggplot / ggplot2",Google Cloud Datalab,I do not share my work publicly,10-20 years,5+ years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25968,30-39,Man,Egypt,Africa,Data Analyst,Analyze and understand data to influence produ...,20+,large,1-3 years,Junior,"Business intelligence software (Salesforce, Ta...",Python,"Python, SQL, Javascript, Bash","Jupyter (JupyterLab; Jupyter Notebooks; etc),V...","PostgreSQL, SQLite, MongoDB, Microsoft SQL Se...",Microsoft Power BI,"Matplotlib,Seaborn","Colab Notebooks, Amazon Sagemaker Studio Noteb...","GitHub, Colab",1-2 years,< 2 years
25969,18-29,Man,China,Asia & Oceania,Student,,,,1-3 years,Junior,,Python,Python,PyCharm,,,,,,1-2 years,< 2 years
25970,50-59,Man,Sweden,Europe,Research Scientist,None of these activities are an important par...,20+,large,I have never written code,Sin experiencia,"Basic statistical software (Microsoft Excel, G...",,,,,,,,,,
25971,40-49,Man,United States of America,America,Data Scientist,,20+,large,5-10 years,Senior,,Python,"Python, SQL","Notepad++,Jupyter Notebook",,,"Matplotlib,Seaborn",,,4-5 years,2-5 years


In [233]:
df_junto.to_csv('../datos/datos_limpios_juntados.csv')

## Dividimos las columnas multirespuestas

In [234]:
df_divididos = pd.read_csv('../datos/datos_limpios.csv', index_col=0)

In [235]:
#esta función crea un dicionario con el número de la columna como keys y una lista de los valores únicos de la columna como values
def dict_respuestas(dataframe):
    
    dic_respuestas = {}
    
    for col in dataframe.columns:
        lista_nueva = []
        #ignoramos la columna index
        if col == "index":
            pass
        #comprobamos si los valores únicos son más de 20 - si es una pregunta multirespuesta 
        elif len(dataframe[col].unique()) > 22:
            #crea una lista de los valores únicos
                lista_unicos = list(dataframe[col].unique())
                for sublist in lista_unicos:
                    #cada lista dentro de los valores únicos se divide por las comas 
                    try:
                        lista_nueva.extend(sublist.split(","))
                    except:
                        pass
                    #lista de todas las posibles respuestas individuales
                    #la convertimos en set para eliminar los duplicados 
                    set_unicos = set(lista_nueva)
                    #lo volvemos a convertir en lista para poder usarla cómodamente
                    lista_nueva = list(set_unicos)
        else:
            #si no es multirespuesta podemos usar el unique para ver los valores únicos 
            lista_nueva = list(dataframe[col].unique())
        #list comprehension para quitar los nulos    
        lista_sin_nan = [item for item in lista_nueva if not (pd.isnull(item)) == True]
        #metemos la lista sin nulos en un diccionario donde el key es el número de la columna y los values con la lista de valores únicos lista_sin_nan
        dic_respuestas[col] = lista_sin_nan

    return dic_respuestas


In [236]:
diccionario_respuestas = dict_respuestas(df_divididos)

In [237]:
lista_columnas_dividir = ['dev_language', 'IDE', 'visualisation', 'work_activities', 'big_data', 'BI_tools']

In [238]:
def buscar(columna, string):
    try:
        if string in columna:
            return 1
        else:
            return 0
    except:
        return np.nan

In [239]:
def dividir_columnas(dataframe, lista_columnas, **dicc_respuestas):
    for col in lista_columnas:
        lista_nombres = []
        lista_respuestas = dicc_respuestas[col]
        for elemento in lista_respuestas:
            nombre_columna = f"{col}_{elemento.strip()}"
            lista_nombres.append(nombre_columna)
            dataframe[nombre_columna] = dataframe.apply(lambda df: buscar(df[col], elemento), axis=1)

In [240]:
dividir_columnas(df_divididos, lista_columnas_dividir, **diccionario_respuestas)

In [241]:
df_divididos.sample()

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Java,dev_language_C,dev_language_Other,dev_language_R,dev_language_Julia,dev_language_C++,dev_language_MATLAB,dev_language_Swift,dev_language_Bash,dev_language_Javascript,dev_language_None,dev_language_Python,dev_language_SQL,IDE_MATLAB,IDE_RStudio,IDE_Other,IDE_None,IDE_Spyder,IDE_Visual Studio,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_Notepad++,IDE_PyCharm,IDE_Vim / Emacs,IDE_Visual Studio Code (VSCode),IDE_Sublime Text,IDE_Jupyter Notebook,visualisation_Altair,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Leaflet / Folium,visualisation_Other,visualisation_Shiny,visualisation_Bokeh,visualisation_D3 js,visualisation_Matplotlib,visualisation_Geoplotlib,visualisation_None,visualisation_Seaborn,work_activities_Other,work_activities_Experimentation and iteration to improve existing ML models,work_activities_Build and/or run a machine learning service that operationally improves my product or workflows,work_activities_Do research that advances the state of the art of machine learning,work_activities_None of these activities are an important part of my role at work,work_activities_Build and/or run the data infrastructure that my business uses for storing; analyzing; and operationalizing data,work_activities_Analyze and understand data to influence product or business decisions,work_activities_Build prototypes to explore applying machine learning to new areas,big_data_Other,big_data_Google Cloud SQL,big_data_MongoDB,big_data_Microsoft SQL Server,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Redshift,big_data_MySQL,big_data_Amazon RDS,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Google Cloud Firestore,big_data_Google Cloud BigTable,big_data_Google Cloud BigQuery,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Oracle Database,big_data_None,big_data_SQLite,big_data_Amazon DynamoDB,big_data_PostgreSQL,big_data_Amazon Aurora,BI_tools_TIBCO Spotfire,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_Other,BI_tools_Sisense,BI_tools_Salesforce,BI_tools_Microsoft Power BI,BI_tools_Domo,BI_tools_SAP Analytics Cloud,BI_tools_Tableau,BI_tools_Tableau CRM,BI_tools_Looker,BI_tools_Google Data Studio,BI_tools_Microsoft Azure Synapse,BI_tools_Qlik,BI_tools_None,BI_tools_Amazon QuickSight
4163,18-29,Man,Spain,Europe,Student,,,,< 1 years,Junior,"Basic statistical software (Microsoft Excel, G...",Python,Python,"Spyder,Jupyter Notebook",,,Matplotlib,,,< 1 year,< 2 years,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [242]:
lista_activities_col = []
for col in df_divididos.columns:
    if 'work_activities' in col:
        lista_activities_col.append(col)
lista_activities_col

['work_activities',
 'work_activities_Other',
 'work_activities_Experimentation and iteration to improve existing ML models',
 'work_activities_Build and/or run a machine learning service that operationally improves my product or workflows',
 'work_activities_Do research that advances the state of the art of machine learning',
 'work_activities_None of these activities are an important part of my role at work',
 'work_activities_Build and/or run the data infrastructure that my business uses for storing; analyzing; and operationalizing data',
 'work_activities_Analyze and understand data to influence product or business decisions',
 'work_activities_Build prototypes to explore applying machine learning to new areas']

In [243]:
diccionario_nombres_activities = {'work_activities_Analyze and understand data to influence product or business decisions':'activities_analyze_data',
                                'work_activities_Experimentation and iteration to improve existing ML models':'activities_improve_ML',
                                'work_activities_Build prototypes to explore applying machine learning to new areas':'activities_ML_prototypes',
                                'work_activities_None of these activities are an important part of my role at work':'activities_None',
                                'work_activities_Build and/or run the data infrastructure that my business uses for storing; analyzing; and operationalizing data':'activities_data_infrastructure',
                                'work_activities_Build and/or run a machine learning service that operationally improves my product or workflows':'activities_run_ML',
                                'work_activities_Other':'activities_Other',
                                'work_activities_Do research that advances the state of the art of machine learning':'activities_ML_research'}

In [244]:
df_divididos.rename(columns = diccionario_nombres_activities, inplace=True)

In [245]:
df_divididos.to_csv('../datos/datos_limpios_divididos.csv')

## Reemplazamos valores por 1.0s

In [246]:
df_unos = pd.read_csv('../datos/datos_limpios_divididos.csv', index_col=0)

In [247]:
df_unos.head(2)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Java,dev_language_C,dev_language_Other,dev_language_R,dev_language_Julia,dev_language_C++,dev_language_MATLAB,dev_language_Swift,dev_language_Bash,dev_language_Javascript,dev_language_None,dev_language_Python,dev_language_SQL,IDE_MATLAB,IDE_RStudio,IDE_Other,IDE_None,IDE_Spyder,IDE_Visual Studio,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_Notepad++,IDE_PyCharm,IDE_Vim / Emacs,IDE_Visual Studio Code (VSCode),IDE_Sublime Text,IDE_Jupyter Notebook,visualisation_Altair,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Leaflet / Folium,visualisation_Other,visualisation_Shiny,visualisation_Bokeh,visualisation_D3 js,visualisation_Matplotlib,visualisation_Geoplotlib,visualisation_None,visualisation_Seaborn,activities_Other,activities_improve_ML,activities_run_ML,activities_ML_research,activities_None,activities_data_infrastructure,activities_analyze_data,activities_ML_prototypes,big_data_Other,big_data_Google Cloud SQL,big_data_MongoDB,big_data_Microsoft SQL Server,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Redshift,big_data_MySQL,big_data_Amazon RDS,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Google Cloud Firestore,big_data_Google Cloud BigTable,big_data_Google Cloud BigQuery,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Oracle Database,big_data_None,big_data_SQLite,big_data_Amazon DynamoDB,big_data_PostgreSQL,big_data_Amazon Aurora,BI_tools_TIBCO Spotfire,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_Other,BI_tools_Sisense,BI_tools_Salesforce,BI_tools_Microsoft Power BI,BI_tools_Domo,BI_tools_SAP Analytics Cloud,BI_tools_Tableau,BI_tools_Tableau CRM,BI_tools_Looker,BI_tools_Google Data Studio,BI_tools_Microsoft Azure Synapse,BI_tools_Qlik,BI_tools_None,BI_tools_Amazon QuickSight
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [248]:
def strip_col(columna):
    try:
        return columna.strip()
    except:
        return columna

In [249]:
for key, value in diccionario_respuestas.items():
    if 'sharing_' in key or 'notebooks_' in key:
        diccionario_respuestas[key] = value[0].strip()

In [250]:
for key, value in diccionario_respuestas.items():
    if 'sharing_' in key or 'notebooks_' in key:
        df_unos[key] = df_unos[key].apply(strip_col)
        df_unos[key].replace(to_replace = value, value = 1.0, inplace=True)

In [251]:
df_unos['notebooks_KaggleNotebooks'].unique()

array([nan,  1.])

In [252]:
df_unos.head(2)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Java,dev_language_C,dev_language_Other,dev_language_R,dev_language_Julia,dev_language_C++,dev_language_MATLAB,dev_language_Swift,dev_language_Bash,dev_language_Javascript,dev_language_None,dev_language_Python,dev_language_SQL,IDE_MATLAB,IDE_RStudio,IDE_Other,IDE_None,IDE_Spyder,IDE_Visual Studio,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_Notepad++,IDE_PyCharm,IDE_Vim / Emacs,IDE_Visual Studio Code (VSCode),IDE_Sublime Text,IDE_Jupyter Notebook,visualisation_Altair,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Leaflet / Folium,visualisation_Other,visualisation_Shiny,visualisation_Bokeh,visualisation_D3 js,visualisation_Matplotlib,visualisation_Geoplotlib,visualisation_None,visualisation_Seaborn,activities_Other,activities_improve_ML,activities_run_ML,activities_ML_research,activities_None,activities_data_infrastructure,activities_analyze_data,activities_ML_prototypes,big_data_Other,big_data_Google Cloud SQL,big_data_MongoDB,big_data_Microsoft SQL Server,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Redshift,big_data_MySQL,big_data_Amazon RDS,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Google Cloud Firestore,big_data_Google Cloud BigTable,big_data_Google Cloud BigQuery,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Oracle Database,big_data_None,big_data_SQLite,big_data_Amazon DynamoDB,big_data_PostgreSQL,big_data_Amazon Aurora,BI_tools_TIBCO Spotfire,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_Other,BI_tools_Sisense,BI_tools_Salesforce,BI_tools_Microsoft Power BI,BI_tools_Domo,BI_tools_SAP Analytics Cloud,BI_tools_Tableau,BI_tools_Tableau CRM,BI_tools_Looker,BI_tools_Google Data Studio,BI_tools_Microsoft Azure Synapse,BI_tools_Qlik,BI_tools_None,BI_tools_Amazon QuickSight
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,1.0,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [253]:
df_unos.to_csv('../datos/datos_divididos_unos.csv')

## Reemplazamos 1.0s por valores

In [254]:
df_sin_unos = pd.read_csv('../datos/datos_limpios_divididos.csv', index_col=0)

In [255]:
lista_columnas_reemplazar = ['dev_language_', 'IDE_', 'visualisation_', 'activities_', 'big_data_', 'BI_tools_']

In [256]:
for key in diccionario_respuestas.keys():
    for i, x in enumerate(diccionario_respuestas[key]):
        if type(diccionario_respuestas[key]) == list:
            try:
                diccionario_respuestas[key][i] = diccionario_respuestas[key][i].strip()
            except:
                pass
        else:
            pass

In [257]:
for col in df_sin_unos.columns:
    for i in lista_columnas_reemplazar:
        if i in col:
            df_sin_unos[col] = df_sin_unos[col].astype('object')

In [258]:
def reemplazar_unos(col, string):
    if col == 1.0:
            return string
    else:
        return col

In [259]:
def reemplazar_prefix(col, string):
    try:
        return col.replace(string, "")
    except:
        return ""

In [260]:
test = reemplazar_unos(1.0, 'dev_language_R')
test

'dev_language_R'

In [261]:
for col in df_sin_unos.columns:
    for i in lista_columnas_reemplazar:
        if i in col:
            df_sin_unos[col] = df_sin_unos.apply(lambda x: reemplazar_unos(x[col], col), axis=1)


In [262]:
for col in df_sin_unos.columns:
    for i in lista_columnas_reemplazar:
        if i in col:
            df_sin_unos[col] = df_sin_unos.apply(lambda x: reemplazar_prefix(x[col], i), axis=1)

In [263]:
df_sin_unos.sample(5)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Java,dev_language_C,dev_language_Other,dev_language_R,dev_language_Julia,dev_language_C++,dev_language_MATLAB,dev_language_Swift,dev_language_Bash,dev_language_Javascript,dev_language_None,dev_language_Python,dev_language_SQL,IDE_MATLAB,IDE_RStudio,IDE_Other,IDE_None,IDE_Spyder,IDE_Visual Studio,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_Notepad++,IDE_PyCharm,IDE_Vim / Emacs,IDE_Visual Studio Code (VSCode),IDE_Sublime Text,IDE_Jupyter Notebook,visualisation_Altair,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Leaflet / Folium,visualisation_Other,visualisation_Shiny,visualisation_Bokeh,visualisation_D3 js,visualisation_Matplotlib,visualisation_Geoplotlib,visualisation_None,visualisation_Seaborn,activities_Other,activities_improve_ML,activities_run_ML,activities_ML_research,activities_None,activities_data_infrastructure,activities_analyze_data,activities_ML_prototypes,big_data_Other,big_data_Google Cloud SQL,big_data_MongoDB,big_data_Microsoft SQL Server,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Redshift,big_data_MySQL,big_data_Amazon RDS,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Google Cloud Firestore,big_data_Google Cloud BigTable,big_data_Google Cloud BigQuery,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Oracle Database,big_data_None,big_data_SQLite,big_data_Amazon DynamoDB,big_data_PostgreSQL,big_data_Amazon Aurora,BI_tools_TIBCO Spotfire,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_Other,BI_tools_Sisense,BI_tools_Salesforce,BI_tools_Microsoft Power BI,BI_tools_Domo,BI_tools_SAP Analytics Cloud,BI_tools_Tableau,BI_tools_Tableau CRM,BI_tools_Looker,BI_tools_Google Data Studio,BI_tools_Microsoft Azure Synapse,BI_tools_Qlik,BI_tools_None,BI_tools_Amazon QuickSight
13290,18-29,Woman,India,Asia & Oceania,Student,,,,< 1 years,Junior,,Python,C,Jupyter (JupyterLab; Jupyter Notebooks; etc),,,Matplotlib,,,< 1 year,< 2 years,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C,,,,,,,,,,,,,,,,,,Jupyter (JupyterLab; Jupyter Notebooks; etc),,,,,,Jupyter Notebook,,,,,,,,,Matplotlib,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
23502,40-49,Man,Taiwan,Asia & Oceania,Software Engineer,Analyze and understand data to influence produ...,1-2,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, SQL, C, C++, Java, Javascript, Bash","Notepad++,Jupyter Notebook",MySQL,"Tableau, Qlik","Matplotlib ,Ggplot / ggplot2 ,Geoplotlib","Colab Notebooks,Google Cloud Notebooks (AI Pla...",GitHub,1-2 years,< 2 years,,Colab Notebooks,,,,,,,,Google Cloud Notebooks (AI Platform / Vertex AI),,,,,,,,,,,GitHub,,,,,,,Java,C,,,,C++,,,Bash,Javascript,,Python,SQL,,,,,,,,Notepad++,,,,,Jupyter Notebook,,Ggplot / ggplot2,,,,,,,Matplotlib,Geoplotlib,,,,improve_ML,run_ML,ML_research,,data_infrastructure,analyze_data,ML_prototypes,,,,,,,MySQL,,,,,,,,,,,,,,,,,,,,,,,,Tableau,,,,,Qlik,,
19744,18-29,Man,South Korea,Asia & Oceania,Student,,,,< 1 years,Junior,,Python,"Python, C++, Javascript","Visual Studio Code (VSCode) ,PyCharm ,Jupyter ...",,,"Matplotlib ,Seaborn","Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,Java,C,,,,C++,,,,Javascript,,Python,,,,,,,Visual Studio,,,PyCharm,,Visual Studio Code (VSCode),,Jupyter Notebook,,,,,,,,,Matplotlib,,,Seaborn,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
670,50-59,Man,Other,Other,Data Scientist,Analyze and understand data to influence produ...,1-2,small,10-20 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R","Jupyter (JupyterLab; Jupyter Notebooks; etc) ,...",SQLite,Tableau,"Matplotlib ,Seaborn","Colab Notebooks, IBM Watson Studio , Amazon EM...",GitHub,1-2 years,< 2 years,,Colab Notebooks,,,,,IBM Watson Studio,,Amazon EMR Notebooks,,,,,,,,,,,,GitHub,,,,,,,,,,R,,,,,,,,Python,,,RStudio,,,,Visual Studio,Jupyter (JupyterLab; Jupyter Notebooks; etc),,,,Visual Studio Code (VSCode),,Jupyter Notebook,,,,,,,,,Matplotlib,,,Seaborn,,,,,,,analyze_data,ML_prototypes,,,,,,,,,,,,,,,,,,SQLite,,,,,,,,,,,,,Tableau,,,,,,,
2428,18-29,Man,India,Asia & Oceania,Student,,,,1-3 years,Junior,"Cloud-based data software & APIs (AWS, GCP, Az...",Python,"SQL, C, C++","Jupyter (JupyterLab; Jupyter Notebooks; etc) ,...",,,,"Kaggle Notebooks, Binder / JupyterHub ,Google...",,< 1 year,< 2 years,Kaggle Notebooks,,,,Binder / JupyterHub,,,,,Google Cloud Notebooks (AI Platform / Vertex AI),Google Cloud Datalab,,,,,,,,,,,,,,,,,,C,,,,C++,,,,,,,SQL,,,,,,Visual Studio,Jupyter (JupyterLab; Jupyter Notebooks; etc),,PyCharm,,Visual Studio Code (VSCode),,Jupyter Notebook,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [264]:
diccionario_respuestas2 = dict_respuestas(df_sin_unos)

In [265]:
for key in diccionario_respuestas2.keys():
    for i, x in enumerate(diccionario_respuestas2[key]):
        if type(diccionario_respuestas2[key]) == list:
            try:
                diccionario_respuestas2[key][i] = diccionario_respuestas2[key][i].strip()
            except:
                pass
        else:
            pass

In [266]:
with open ('../datos/diccionario_respuestas2.pkl', 'wb') as f:
    pickle.dump(diccionario_respuestas2, f)

In [267]:
df_sin_unos.to_csv('../datos/datos_sin_unos.csv')

## Filtramos por Data Analysts

In [268]:
df_limpio = pd.read_csv('../datos/datos_limpios_divididos.csv', index_col=0)

In [269]:
# filtramos el nuevo dataframe unido por respuestas de personas que se identifican como 
# Business Analyst o Data Analyst
df_da_ba = df_limpio[(df_limpio ["job_title"] == "Business Analyst") | (df_limpio ["job_title"] == "Data Analyst")]

In [270]:
df_da_ba.head(3)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Java,dev_language_C,dev_language_Other,dev_language_R,dev_language_Julia,dev_language_C++,dev_language_MATLAB,dev_language_Swift,dev_language_Bash,dev_language_Javascript,dev_language_None,dev_language_Python,dev_language_SQL,IDE_MATLAB,IDE_RStudio,IDE_Other,IDE_None,IDE_Spyder,IDE_Visual Studio,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_Notepad++,IDE_PyCharm,IDE_Vim / Emacs,IDE_Visual Studio Code (VSCode),IDE_Sublime Text,IDE_Jupyter Notebook,visualisation_Altair,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Leaflet / Folium,visualisation_Other,visualisation_Shiny,visualisation_Bokeh,visualisation_D3 js,visualisation_Matplotlib,visualisation_Geoplotlib,visualisation_None,visualisation_Seaborn,activities_Other,activities_improve_ML,activities_run_ML,activities_ML_research,activities_None,activities_data_infrastructure,activities_analyze_data,activities_ML_prototypes,big_data_Other,big_data_Google Cloud SQL,big_data_MongoDB,big_data_Microsoft SQL Server,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Redshift,big_data_MySQL,big_data_Amazon RDS,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Google Cloud Firestore,big_data_Google Cloud BigTable,big_data_Google Cloud BigQuery,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Oracle Database,big_data_None,big_data_SQLite,big_data_Amazon DynamoDB,big_data_PostgreSQL,big_data_Amazon Aurora,BI_tools_TIBCO Spotfire,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_Other,BI_tools_Sisense,BI_tools_Salesforce,BI_tools_Microsoft Power BI,BI_tools_Domo,BI_tools_SAP Analytics Cloud,BI_tools_Tableau,BI_tools_Tableau CRM,BI_tools_Looker,BI_tools_Google Data Studio,BI_tools_Microsoft Azure Synapse,BI_tools_Qlik,BI_tools_None,BI_tools_Amazon QuickSight
16,50-59,Man,Belgium,Europe,Data Analyst,Analyze and understand data to influence produ...,5-9,medium,20+ years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, SQL","Jupyter (JupyterLab; Jupyter Notebooks; etc) ,...",,,"Matplotlib ,Seaborn ,Plotly / Plotly Express","Kaggle Notebooks,Colab Notebooks, Databricks ...",,1-2 years,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,Databricks Collaborative Notebooks,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32,18-29,Nonbinary,United States of America,America,Data Analyst,,3-4,small,< 1 years,Junior,,R,R,RStudio,,,Ggplot / ggplot2,Google Cloud Datalab,,I do not use machine learning methods,,,,,,,,,,,,Google Cloud Datalab,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33,30-39,Woman,Egypt,Africa,Data Analyst,Analyze and understand data to influence produ...,0,small,3-5 years,Senior,"Basic statistical software (Microsoft Excel, G...",R,Python,"Notepad++,Jupyter Notebook",,,,"Kaggle Notebooks,Colab Notebooks",I do not share my work publicly,I do not use machine learning methods,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,I do not share my work publicly,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [271]:
# guardamos los datos unidos con las columnas eliminadas en un archivo de csv
df_da_ba.to_csv('../datos/datos_da_ba.csv')