In [1]:
import pandas as pd
import sys
sys.path.append('../')
from src import soporte as sp
import soporte_variables as sv

In [2]:
# Notebook display settings: render every column of wide frames and cap
# printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [3]:
# Load the merged raw data; the first CSV column is used as the index.

df_unido = pd.read_csv('../datos/datos_unidos_raw.csv', index_col = 0)

## Limpieza

In [4]:
# Create the cleaning helper (sp.Limpieza), constructed from the merged dataframe.

lm = sp.Limpieza(df_unido)

In [5]:
# Clean up the spaces that precede commas in column Q24 by applying
# lm.limpiar_espacios to every value of the column.

df_unido['Q24'] = df_unido['Q24'].apply(lm.limpiar_espacios)

In [6]:
# Replace the commas that come before a's in column Q24 with semicolons
# (lm.comas_as), which will let us split the column further on.
# NOTE(review): the exact pattern is defined in sp.Limpieza.comas_as, which is
# not visible from this notebook.

df_unido['Q24'] = df_unido['Q24'].apply(lm.comas_as)

In [7]:
# Replace the commas inside parentheses in column Q9 with semicolons
# (lm.comas_parentesis), which will let us split the column further on.

df_unido['Q9'] = df_unido['Q9'].apply(lm.comas_parentesis)

## Limpieza: Eliminamos columnas

In [8]:
# Drop the columns for the variables we decided are not relevant to this analysis.

lm.eliminar_columnas(
    df_unido,
    'level_0', 'index', 'time',
    'Q4', 'Q11', 'Q12', 'Q13', 'Q16', 'Q17', 'Q20', 'Q21',
)

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q23', 'Q25',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=249)


In [9]:
# Drop the run of columns starting at 'Q23' and ending at 'Q35'. The helper
# prints the remaining columns: 249 before this call, 244 after (stored outputs).
lm.eliminar_rango_col(df_unido, 'Q23', 'Q35')

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son: Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=244)


In [10]:
# Drop the run of columns from 'Q18_Part_1' to 'Q38_A_OTHER'; the stored output
# shows the frame shrinking from 244 to 167 columns.
lm.eliminar_rango_col(df_unido, 'Q18_Part_1', 'Q38_A_OTHER')

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son: Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       ...
       'Q38_B_Part_3', 'Q38_B_Part_4', 'Q38_B_Part_5', 'Q38_B_Part_6',
       'Q38_B_Part_7', 'Q38_B_Part_8', 'Q38_B_Part_9', 'Q38_B_Part_10',
       'Q38_B_Part_11', 'Q38_B_OTHER'],
      dtype='object', length=167)


In [11]:
# Drop the run of columns from 'Q40_Part_1' to 'Q38_B_OTHER'; afterwards only
# the demographic/single-answer columns plus the Q10_* and Q39_* groups remain
# (see the printed index).
lm.eliminar_rango_col(df_unido, 'Q40_Part_1', 'Q38_B_OTHER')

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son: Index(['age', 'gender', 'Q3', 'Q5', 'Q6', 'Q8', 'Q15', 'Q22', 'Q41', 'Q7',
       'Q9', 'Q14', 'Q24', 'Q32', 'Q34', 'Q10_Part_1', 'Q10_Part_2',
       'Q10_Part_3', 'Q10_Part_4', 'Q10_Part_5', 'Q10_Part_6', 'Q10_Part_7',
       'Q10_Part_8', 'Q10_Part_9', 'Q10_Part_10', 'Q10_Part_11', 'Q10_Part_12',
       'Q10_Part_13', 'Q10_Part_14', 'Q10_Part_15', 'Q10_Part_16', 'Q10_OTHER',
       'Q39_Part_1', 'Q39_Part_2', 'Q39_Part_3', 'Q39_Part_4', 'Q39_Part_5',
       'Q39_Part_6', 'Q39_Part_7', 'Q39_Part_8', 'Q39_Part_9', 'Q39_OTHER'],
      dtype='object')


## Cambiar nombres de las columnas

In [12]:
# Work on a copy of the dataframe from here on, to keep the following code simpler.

df_limpio = df_unido.copy()

In [13]:
# Swap the question codes for the readable names defined in
# sv.diccionario_nombres, then preview the first two rows.
df_limpio = df_limpio.rename(columns=sv.diccionario_nombres)
df_limpio.head(2)

Unnamed: 0,age,gender,country,job_title,years_programming,first_language_rec,ML,size_DA_dept,primary_data_tool,dev_language,IDE,visualisation,work_activities,big_data,BI_tools,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-54,Man,India,Other,5-10 years,Python,5-10 years,3-4,"Local development environments (RStudio, Jupyt...","Python, R",Vim / Emacs,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",None of these activities are an important par...,PostgreSQL,,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,
1,50-54,Man,Indonesia,Program/Project Manager,20+ years,Python,< 1 year,1-2,"Advanced statistical software (SPSS, SAS, etc.)","SQL, C, C++, Java","Notepad++,Jupyter Notebook",Matplotlib,Build and/or run the data infrastructure that...,,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,


## Creamos columnas de categorias

In [14]:
# Add the category columns driven by sv.dicc_columnas; the preview in the next
# cell shows 'continent', 'programming_experience', 'dept_size' and
# 'ML_experience' appended to the frame.
# NOTE(review): in that stored preview the 'age' column is empty for every row
# and the new columns carry no values — confirm the mappings in
# sv.dicc_columnas match the raw labels.
lm.col_categorias(df_limpio, **sv.dicc_columnas)

In [15]:
# Preview the first three rows after adding the category columns.
df_limpio.head(3)

Unnamed: 0,age,gender,country,job_title,years_programming,first_language_rec,ML,size_DA_dept,primary_data_tool,dev_language,IDE,visualisation,work_activities,big_data,BI_tools,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,continent,programming_experience,dept_size,ML_experience
0,,Man,India,Other,5-10 years,Python,5-10 years,3-4,"Local development environments (RStudio, Jupyt...","Python, R",Vim / Emacs,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",None of these activities are an important par...,PostgreSQL,,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,,,,,
1,,Man,Indonesia,Program/Project Manager,20+ years,Python,< 1 year,1-2,"Advanced statistical software (SPSS, SAS, etc.)","SQL, C, C++, Java","Notepad++,Jupyter Notebook",Matplotlib,Build and/or run the data infrastructure that...,,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,Man,Pakistan,Software Engineer,1-3 years,Python,I do not use machine learning methods,0,"Basic statistical software (Microsoft Excel, G...","Python, C++, Java","PyCharm ,Jupyter Notebook, Other",Matplotlib,None of these activities are an important par...,"MySQL , MongoDB",,Kaggle Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,I do not share my work publicly,,,,,


## Reordenamos las columnas

In [16]:
# Inspect the current column labels before reordering them.
df_limpio.columns

Index(['age', 'gender', 'country', 'job_title', 'years_programming',
       'first_language_rec', 'ML', 'size_DA_dept', 'primary_data_tool',
       'dev_language', 'IDE', 'visualisation', 'work_activities', 'big_data',
       'BI_tools', 'notebooks_KaggleNotebooks', 'notebooks_ColabNotebooks',
       'notebooks_AzureNotebooks', 'notebooks_Paperspace/Gradient',
       'notebooks_Binder/JupyterHub', 'notebooks_CodeOcean',
       'notebooks_IBMWatsonStudio', 'notebooks_AmazonSagemakerStudioNotebooks',
       'notebooks_AmazonEMRNotebooks',
       'notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)',
       'notebooks_GoogleCloudDatalab',
       'notebooks_DatabricksCollaborativeNotebooks',
       'notebooks_Zeppelin/ZeplNotebooks', 'notebooks_DeepnoteNotebooks',
       'notebooks_ObservableNotebooks', 'notebooks_None', 'notebooks_Other',
       'sharing_PlotlyDash', 'sharing_Streamlit', 'sharing_NBViewer',
       'sharing_GitHub', 'sharing_Personalblog', 'sharing_Kaggle',
       'sharing

In [17]:
# Reorder the columns following sv.nuevo_orden and preview the first row.
# NOTE(review): the stored output shows an all-empty 'dept size' column (with a
# space) where the previous preview had 'dept_size' (with an underscore). This
# suggests reindex created a new empty column for a label that did not exist
# and dropped the original — confirm the labels in sv.nuevo_orden match the
# names created by col_categorias.
df_limpio = df_limpio.reindex(columns=sv.nuevo_orden)
df_limpio.head(1)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,first_language_rec,dev_language,primary_data_tool,IDE,big_data,BI_tools,visualisation,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,,Man,India,,Other,None of these activities are an important par...,3-4,,5-10 years,,Python,"Python, R","Local development environments (RStudio, Jupyt...",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",5-10 years,,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,


In [18]:
# NOTE(review): this export is disabled, yet the next section reloads
# '../datos/datos_limpios.csv'. On a fresh run that file must already exist on
# disk, so re-enable this line if the reload is meant to read this frame.
# df_limpio.to_csv('../datos/datos_limpios.csv')

## Juntamos las columnas partidas

In [19]:
# Reload the cleaned data from disk, replacing the in-memory df_limpio.
# NOTE(review): the column listing printed by the next cell already contains
# 'notebooks' and 'sharing', so the file on disk appears to come from a later
# save, not from the cells above. Confirm the intended run order; a
# Restart & Run All may not reproduce this state.
df_limpio = pd.read_csv('../datos/datos_limpios.csv', index_col=0)

In [20]:
# Print each column label next to its positional index.
for posicion in range(len(df_limpio.columns)):
    print(posicion, df_limpio.columns[posicion])

0 age
1 gender
2 country
3 continent
4 job_title
5 work_activities
6 size_DA_dept
7 dept size
8 years_programming
9 programming_experience
10 primary_data_tool
11 first_language_rec
12 dev_language
13 IDE
14 big_data
15 BI_tools
16 visualisation
17 notebooks
18 sharing
19 ML
20 ML_experience
21 notebooks_KaggleNotebooks
22 notebooks_ColabNotebooks
23 notebooks_AzureNotebooks
24 notebooks_Paperspace/Gradient
25 notebooks_Binder/JupyterHub
26 notebooks_CodeOcean
27 notebooks_IBMWatsonStudio
28 notebooks_AmazonSagemakerStudioNotebooks
29 notebooks_AmazonEMRNotebooks
30 notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI)
31 notebooks_GoogleCloudDatalab
32 notebooks_DatabricksCollaborativeNotebooks
33 notebooks_Zeppelin/ZeplNotebooks
34 notebooks_DeepnoteNotebooks
35 notebooks_ObservableNotebooks
36 notebooks_None
37 notebooks_Other
38 sharing_PlotlyDash
39 sharing_Streamlit
40 sharing_NBViewer
41 sharing_GitHub
42 sharing_Personalblog
43 sharing_Kaggle
44 sharing_Colab
45 sharing_Shiny
46 

In [21]:
# Combine the per-option columns from 'notebooks_KaggleNotebooks' to
# 'notebooks_Other' into the single 'notebooks' column (presumably joining the
# options each respondent selected — see sp.Limpieza.juntar_columnas).
lm.juntar_columnas(df_limpio, 'notebooks_KaggleNotebooks', 'notebooks_Other', 'notebooks')

In [22]:
# Same as above for the sharing platforms: combine 'sharing_PlotlyDash' to
# 'sharing_Other' into the single 'sharing' column.
lm.juntar_columnas(df_limpio, 'sharing_PlotlyDash', 'sharing_Other', 'sharing')

In [23]:
# Reorder the columns following sv.nuevo_orden2 (the combined 'notebooks' and
# 'sharing' columns appear after 'visualisation' in the output) and preview
# the first row.
df_limpio = df_limpio.reindex(columns=sv.nuevo_orden2)
df_limpio.head(1)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,


In [24]:
# Remove the single space that precedes a comma in every string cell. Spaces
# after a comma are left untouched (the output still shows "GitHub, Kaggle").
df_limpio = df_limpio.replace(to_replace=r' ,', value=',', regex=True)
df_limpio.head(2)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib,Seaborn,Ggplot / ggplot2,Shiny,Lea...",Colab Notebooks,"GitHub, Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,


In [25]:
# Disabled export of the frame after the 'notebooks'/'sharing' columns were
# combined. NOTE(review): '../datos/datos_limpios.csv' is read back by other
# cells in this notebook; presumably the copy on disk was written from here.
# df_limpio.to_csv('../datos/datos_limpios.csv')

In [26]:
# The per-option notebook columns are now redundant with 'notebooks'; drop the
# run from 'notebooks_KaggleNotebooks' to 'notebooks_Other'.
lm.eliminar_rango_col(df_limpio, 'notebooks_KaggleNotebooks', 'notebooks_Other')

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son: Index(['age', 'gender', 'country', 'continent', 'job_title', 'work_activities',
       'size_DA_dept', 'dept size', 'years_programming',
       'programming_experience', 'primary_data_tool', 'first_language_rec',
       'dev_language', 'IDE', 'big_data', 'BI_tools', 'visualisation',
       'notebooks', 'sharing', 'ML', 'ML_experience', 'sharing_PlotlyDash',
       'sharing_Streamlit', 'sharing_NBViewer', 'sharing_GitHub',
       'sharing_Personalblog', 'sharing_Kaggle', 'sharing_Colab',
       'sharing_Shiny', 'sharing_does_not_share', 'sharing_Other'],
      dtype='object')


In [27]:
# Likewise drop the per-option sharing columns, leaving only the combined
# 'sharing' column (see the printed index).
lm.eliminar_rango_col(df_limpio, 'sharing_PlotlyDash', 'sharing_Other')

Se han eliminado las columnas indicadas. Ahora las columnas del dataframe son: Index(['age', 'gender', 'country', 'continent', 'job_title', 'work_activities',
       'size_DA_dept', 'dept size', 'years_programming',
       'programming_experience', 'primary_data_tool', 'first_language_rec',
       'dev_language', 'IDE', 'big_data', 'BI_tools', 'visualisation',
       'notebooks', 'sharing', 'ML', 'ML_experience'],
      dtype='object')


In [28]:
# df_limpio.to_csv('../datos/datos_limpios_juntados.csv')

## Dividimos las columnas multirrespuesta

In [52]:
# Load the cleaned data from disk as the starting point for splitting the
# multi-answer columns; this file still carries the per-option notebooks_* and
# sharing_* columns (see the sample output below).
# NOTE(review): this cell's execution counter (52) is out of sequence with the
# surrounding cells, i.e. it was re-run later — verify with Restart & Run All.
df_divididos = pd.read_csv('../datos/datos_limpios.csv', index_col=0)

In [30]:
# Create the exploration helper (sp.Exploracion) from df_limpio.
# NOTE(review): it is built from df_limpio while the split below operates on
# df_divididos (re-read from CSV) — confirm both describe the same data.
ex = sp.Exploracion(df_limpio)

In [31]:
# Build the answers dictionary; it is unpacked as keyword arguments into
# lm.dividir_columnas in the next cell.
diccionario_respuestas = ex.dict_respuestas()

In [53]:
# Split the multi-answer columns named in sv.lista_columnas_dividir into one
# column per option (e.g. 'dev_language_Python', 'IDE_RStudio' in the sample
# below), holding 1.0/0.0 flags according to the stored output.
lm.dividir_columnas(df_divididos, sv.lista_columnas_dividir, **diccionario_respuestas)

In [54]:
# Spot-check one row of the split frame. The seed is fixed so the displayed
# row stays the same across Restart & Run All instead of changing every run.
df_divididos.sample(random_state=42)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Javascript,dev_language_SQL,dev_language_Swift,dev_language_Other,dev_language_R,dev_language_MATLAB,dev_language_C,dev_language_C++,dev_language_Bash,dev_language_Python,dev_language_Java,dev_language_None,dev_language_Julia,IDE_RStudio,IDE_Vim / Emacs,IDE_Spyder,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_MATLAB,IDE_Jupyter Notebook,IDE_Visual Studio Code (VSCode),IDE_Notepad++,IDE_Visual Studio,IDE_PyCharm,IDE_None,IDE_Other,IDE_Sublime Text,visualisation_Leaflet / Folium,visualisation_Matplotlib,visualisation_Shiny,visualisation_D3 js,visualisation_Bokeh,visualisation_Seaborn,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Altair,visualisation_None,visualisation_Other,visualisation_Geoplotlib,work_activities_Do research that advances the state of the art of machine learning,work_activities_None of these activities are an important part of my role at work,work_activities_Other,work_activities_Experimentation and iteration to improve existing ML models,work_activities_Build prototypes to explore applying 
machine learning to new areas,work_activities_Analyze and understand data to influence product or business decisions,work_activities_Build and/or run a machine learning service that operationally improves my product or workflows,work_activities_Build and/or run the data infrastructure that my business uses for storing; analyzing; and operationalizing data,big_data_Google Cloud Firestore,big_data_Amazon DynamoDB,big_data_Oracle Database,big_data_Snowflake,big_data_Microsoft Azure SQL Database,big_data_Amazon RDS,big_data_MySQL,big_data_Google Cloud BigTable,big_data_Amazon Redshift,big_data_PostgreSQL,big_data_MongoDB,big_data_Google Cloud SQL,big_data_Google Cloud BigQuery,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Aurora,big_data_Microsoft SQL Server,big_data_None,big_data_SQLite,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Other,BI_tools_Salesforce,BI_tools_Tableau CRM,BI_tools_SAP Analytics Cloud,BI_tools_Sisense,BI_tools_Tableau,BI_tools_TIBCO Spotfire,BI_tools_Microsoft Azure Synapse,BI_tools_Microsoft Power BI,BI_tools_Looker,BI_tools_Domo,BI_tools_Amazon QuickSight,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_None,BI_tools_Other,BI_tools_Google Data Studio,BI_tools_Qlik
8855,30-39,Man,India,Asia & Oceania,DBA/Database Engineer,,,,5-10 years,Senior,,SQL,"Python, SQL, Javascript, Julia, Bash","Jupyter (JupyterLab; Jupyter Notebooks; etc) ,...",,,,"Kaggle Notebooks,Colab Notebooks,Azure Notebo...",,,,Kaggle Notebooks,Colab Notebooks,Azure Notebooks,Paperspace / Gradient,Binder / JupyterHub,Code Ocean,,Amazon Sagemaker Studio Notebooks,Amazon EMR Notebooks,Google Cloud Notebooks (AI Platform / Vertex AI),Google Cloud Datalab,Databricks Collaborative Notebooks,Zeppelin / Zepl Notebooks,Deepnote Notebooks,Observable Notebooks,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [55]:
# Shorten the long 'work_activities_*' labels to the 'activities_*' names
# defined in sv.diccionario_nombres_activities.
df_divididos = df_divididos.rename(columns=sv.diccionario_nombres_activities)

In [35]:
# Independent copy used for the flag-encoded variant, so df_divididos itself is
# not modified by the conversion loop below.
df_unos = df_divididos.copy()

In [36]:
# Preview the copy before converting the notebooks_*/sharing_* text columns.
df_unos.head(3)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Javascript,dev_language_SQL,dev_language_Swift,dev_language_Other,dev_language_R,dev_language_MATLAB,dev_language_C,dev_language_C++,dev_language_Bash,dev_language_Python,dev_language_Java,dev_language_None,dev_language_Julia,IDE_RStudio,IDE_Vim / Emacs,IDE_Spyder,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_MATLAB,IDE_Jupyter Notebook,IDE_Visual Studio Code (VSCode),IDE_Notepad++,IDE_Visual Studio,IDE_PyCharm,IDE_None,IDE_Other,IDE_Sublime Text,visualisation_Leaflet / Folium,visualisation_Matplotlib,visualisation_Shiny,visualisation_D3 js,visualisation_Bokeh,visualisation_Seaborn,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Altair,visualisation_None,visualisation_Other,visualisation_Geoplotlib,activities_ML_research,activities_None,activities_Other,activities_improve_ML,activities_ML_prototypes,activities_analyze_data,activities_run_ML,activities_data_infrastructure,big_data_Google Cloud Firestore,big_data_Amazon DynamoDB,big_data_Oracle Database,big_data_Snowflake,big_data_Microsoft Azure SQL 
Database,big_data_Amazon RDS,big_data_MySQL,big_data_Google Cloud BigTable,big_data_Amazon Redshift,big_data_PostgreSQL,big_data_MongoDB,big_data_Google Cloud SQL,big_data_Google Cloud BigQuery,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Aurora,big_data_Microsoft SQL Server,big_data_None,big_data_SQLite,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Other,BI_tools_Salesforce,BI_tools_Tableau CRM,BI_tools_SAP Analytics Cloud,BI_tools_Sisense,BI_tools_Tableau,BI_tools_TIBCO Spotfire,BI_tools_Microsoft Azure Synapse,BI_tools_Microsoft Power BI,BI_tools_Looker,BI_tools_Domo,BI_tools_Amazon QuickSight,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_None,BI_tools_Other,BI_tools_Google Data Studio,BI_tools_Qlik
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,18-29,Man,Pakistan,Asia & Oceania,Software Engineer,None of these activities are an important par...,0,small,1-3 years,Junior,"Basic statistical software (Microsoft Excel, G...",Python,"Python, C++, Java","PyCharm ,Jupyter Notebook, Other","MySQL , MongoDB",,Matplotlib,Kaggle Notebooks,I do not share my work publicly,I do not use machine learning methods,,Kaggle Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,I do not share my work publicly,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [37]:
# Convert every per-option answer column (label containing 'sharing_' or
# 'notebooks_') from its answer text into the flag returned by lm.unos_zeros.
for col in df_unos.columns:
    # Same substring test as before, evaluated once per column.
    if not any(prefijo in col for prefijo in ('sharing_', 'notebooks_')):
        continue

    # Only string entries are answer labels; NaN (no answer) is skipped.
    # The labels are collected before the column is overwritten.
    etiquetas = [valor.strip() for valor in df_unos[col].unique() if isinstance(valor, str)]

    for etiqueta in etiquetas:
        # Element-wise Series.apply replaces the former row-wise
        # DataFrame.apply(axis=1): the helper only ever received this column's
        # value, so building a full row Series per record was wasted work.
        df_unos[col] = df_unos[col].apply(
            lambda valor, etiqueta=etiqueta: lm.unos_zeros(valor, etiqueta)
        )

In [38]:
# Sanity check of the conversion: the stored output shows the column now holds
# only 1.0 and NaN (no 0.0 values).
df_unos['notebooks_ColabNotebooks'].unique()

array([ 1., nan])

In [39]:
# Preview the first two rows after the flag conversion.
df_unos.head(2)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Javascript,dev_language_SQL,dev_language_Swift,dev_language_Other,dev_language_R,dev_language_MATLAB,dev_language_C,dev_language_C++,dev_language_Bash,dev_language_Python,dev_language_Java,dev_language_None,dev_language_Julia,IDE_RStudio,IDE_Vim / Emacs,IDE_Spyder,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_MATLAB,IDE_Jupyter Notebook,IDE_Visual Studio Code (VSCode),IDE_Notepad++,IDE_Visual Studio,IDE_PyCharm,IDE_None,IDE_Other,IDE_Sublime Text,visualisation_Leaflet / Folium,visualisation_Matplotlib,visualisation_Shiny,visualisation_D3 js,visualisation_Bokeh,visualisation_Seaborn,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Altair,visualisation_None,visualisation_Other,visualisation_Geoplotlib,activities_ML_research,activities_None,activities_Other,activities_improve_ML,activities_ML_prototypes,activities_analyze_data,activities_run_ML,activities_data_infrastructure,big_data_Google Cloud Firestore,big_data_Amazon DynamoDB,big_data_Oracle Database,big_data_Snowflake,big_data_Microsoft Azure SQL 
Database,big_data_Amazon RDS,big_data_MySQL,big_data_Google Cloud BigTable,big_data_Amazon Redshift,big_data_PostgreSQL,big_data_MongoDB,big_data_Google Cloud SQL,big_data_Google Cloud BigQuery,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Aurora,big_data_Microsoft SQL Server,big_data_None,big_data_SQLite,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Other,BI_tools_Salesforce,BI_tools_Tableau CRM,BI_tools_SAP Analytics Cloud,BI_tools_Sisense,BI_tools_Tableau,BI_tools_TIBCO Spotfire,BI_tools_Microsoft Azure Synapse,BI_tools_Microsoft Power BI,BI_tools_Looker,BI_tools_Domo,BI_tools_Amazon QuickSight,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_None,BI_tools_Other,BI_tools_Google Data Studio,BI_tools_Qlik
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,1.0,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [40]:
# Persist the flag-encoded frame (index included) for later use.
df_unos.to_csv('../datos/datos_divididos_unos.csv')

## Reemplazamos 1.0s por valores

In [56]:
# In the columns listed in sv.lista_columnas_reemplazar, replace the 1.0 flags
# with the option's own label (e.g. 'Python', 'R', 'data_infrastructure' in the
# stored output); 0.0 entries are left as they are.
lm.unos_valores(df_divididos, *sv.lista_columnas_reemplazar)

In [57]:
# Preview the frame after replacing the flags with labels.
# NOTE(review): the stored output shows five rows (0-4) although this cell
# asks for two, so the output is stale — re-run before sharing.
df_divididos.head(2)

Unnamed: 0,age,gender,country,continent,job_title,work_activities,size_DA_dept,dept size,years_programming,programming_experience,primary_data_tool,first_language_rec,dev_language,IDE,big_data,BI_tools,visualisation,notebooks,sharing,ML,ML_experience,notebooks_KaggleNotebooks,notebooks_ColabNotebooks,notebooks_AzureNotebooks,notebooks_Paperspace/Gradient,notebooks_Binder/JupyterHub,notebooks_CodeOcean,notebooks_IBMWatsonStudio,notebooks_AmazonSagemakerStudioNotebooks,notebooks_AmazonEMRNotebooks,notebooks_GoogleCloudNotebooks(AIPlatform/VertexAI),notebooks_GoogleCloudDatalab,notebooks_DatabricksCollaborativeNotebooks,notebooks_Zeppelin/ZeplNotebooks,notebooks_DeepnoteNotebooks,notebooks_ObservableNotebooks,notebooks_None,notebooks_Other,sharing_PlotlyDash,sharing_Streamlit,sharing_NBViewer,sharing_GitHub,sharing_Personalblog,sharing_Kaggle,sharing_Colab,sharing_Shiny,sharing_does_not_share,sharing_Other,dev_language_Javascript,dev_language_SQL,dev_language_Swift,dev_language_Other,dev_language_R,dev_language_MATLAB,dev_language_C,dev_language_C++,dev_language_Bash,dev_language_Python,dev_language_Java,dev_language_None,dev_language_Julia,IDE_RStudio,IDE_Vim / Emacs,IDE_Spyder,IDE_Jupyter (JupyterLab; Jupyter Notebooks; etc),IDE_MATLAB,IDE_Jupyter Notebook,IDE_Visual Studio Code (VSCode),IDE_Notepad++,IDE_Visual Studio,IDE_PyCharm,IDE_None,IDE_Other,IDE_Sublime Text,visualisation_Leaflet / Folium,visualisation_Matplotlib,visualisation_Shiny,visualisation_D3 js,visualisation_Bokeh,visualisation_Seaborn,visualisation_Ggplot / ggplot2,visualisation_Plotly / Plotly Express,visualisation_Altair,visualisation_None,visualisation_Other,visualisation_Geoplotlib,activities_ML_research,activities_None,activities_Other,activities_improve_ML,activities_ML_prototypes,activities_analyze_data,activities_run_ML,activities_data_infrastructure,big_data_Google Cloud Firestore,big_data_Amazon DynamoDB,big_data_Oracle Database,big_data_Snowflake,big_data_Microsoft Azure SQL 
Database,big_data_Amazon RDS,big_data_MySQL,big_data_Google Cloud BigTable,big_data_Amazon Redshift,big_data_PostgreSQL,big_data_MongoDB,big_data_Google Cloud SQL,big_data_Google Cloud BigQuery,big_data_Microsoft Azure Cosmos DB,big_data_Amazon Aurora,big_data_Microsoft SQL Server,big_data_None,big_data_SQLite,big_data_IBM Db2,big_data_Google Cloud Spanner,big_data_Other,BI_tools_Salesforce,BI_tools_Tableau CRM,BI_tools_SAP Analytics Cloud,BI_tools_Sisense,BI_tools_Tableau,BI_tools_TIBCO Spotfire,BI_tools_Microsoft Azure Synapse,BI_tools_Microsoft Power BI,BI_tools_Looker,BI_tools_Domo,BI_tools_Amazon QuickSight,BI_tools_Alteryx,BI_tools_Thoughtspot,BI_tools_None,BI_tools_Other,BI_tools_Google Data Studio,BI_tools_Qlik
0,50-59,Man,India,Asia & Oceania,Other,None of these activities are an important par...,3-4,small,5-10 years,Senior,"Local development environments (RStudio, Jupyt...",Python,"Python, R",Vim / Emacs,PostgreSQL,,"Matplotlib ,Seaborn ,Ggplot / ggplot2 ,Shiny ...",Colab Notebooks,"GitHub , Kaggle",5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,GitHub,,Kaggle,,,,,0.0,0.0,0.0,0.0,R,0.0,0.0,0.0,0.0,Python,0.0,0.0,0.0,0.0,Vim / Emacs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Leaflet / Folium,Matplotlib,Shiny,0.0,0.0,Seaborn,Ggplot / ggplot2,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PostgreSQL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
1,50-59,Man,Indonesia,Asia & Oceania,Program/Project Manager,Build and/or run the data infrastructure that...,1-2,small,20+ years,Senior,"Advanced statistical software (SPSS, SAS, etc.)",Python,"SQL, C, C++, Java","Notepad++,Jupyter Notebook",,,Matplotlib,"Kaggle Notebooks,Colab Notebooks",,< 1 year,< 2 years,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,SQL,0.0,0.0,0.0,0.0,C,C++,0.0,0.0,Java,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jupyter Notebook,0.0,Notepad++,0.0,0.0,0.0,0.0,0.0,0.0,Matplotlib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,data_infrastructure,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,18-29,Man,Pakistan,Asia & Oceania,Software Engineer,None of these activities are an important par...,0,small,1-3 years,Junior,"Basic statistical software (Microsoft Excel, G...",Python,"Python, C++, Java","PyCharm ,Jupyter Notebook, Other","MySQL , MongoDB",,Matplotlib,Kaggle Notebooks,I do not share my work publicly,I do not use machine learning methods,,Kaggle Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,I do not share my work publicly,,0.0,0.0,0.0,0.0,0.0,0.0,C,C++,0.0,Python,Java,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jupyter Notebook,0.0,0.0,0.0,PyCharm,0.0,Other,0.0,0.0,Matplotlib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MySQL,0.0,0.0,0.0,MongoDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
3,40-49,Man,Mexico,America,Research Scientist,Do research that advances the state of the ar...,0,small,20+ years,Senior,"Local development environments (RStudio, Jupyt...",Python,Python,"Spyder,Jupyter Notebook",,,Matplotlib,Colab Notebooks,,5-10 years,5+ years,,Colab Notebooks,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Python,0.0,0.0,0.0,0.0,0.0,Spyder,0.0,0.0,Jupyter Notebook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matplotlib,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ML_research,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,40-49,Man,India,Asia & Oceania,Other,Analyze and understand data to influence produ...,5-9,medium,< 1 years,Junior,"Local development environments (RStudio, Jupyt...",Python,"Python, C, MATLAB","Spyder,MATLAB ,Jupyter Notebook",,Microsoft Power BI,"Matplotlib ,Seaborn ,Ggplot / ggplot2",Google Cloud Datalab,I do not share my work publicly,10-20 years,5+ years,,,,,,,,,,,Google Cloud Datalab,,,,,,,,,,,,,,,I do not share my work publicly,,0.0,0.0,0.0,0.0,0.0,MATLAB,C,0.0,0.0,Python,0.0,0.0,0.0,0.0,0.0,Spyder,0.0,MATLAB,Jupyter Notebook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matplotlib,0.0,0.0,0.0,Seaborn,Ggplot / ggplot2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ML_prototypes,analyze_data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Microsoft Power BI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25968,30-39,Man,Egypt,Africa,Data Analyst,Analyze and understand data to influence produ...,20+,large,1-3 years,Junior,"Business intelligence software (Salesforce, Ta...",Python,"Python, SQL, Javascript, Bash","Jupyter (JupyterLab; Jupyter Notebooks; etc) ,...","PostgreSQL , SQLite , MongoDB , Microsoft SQL...",Microsoft Power BI,"Matplotlib ,Seaborn","Colab Notebooks, Amazon Sagemaker Studio Noteb...","GitHub , Colab",1-2 years,< 2 years,,Colab Notebooks,,,,,,Amazon Sagemaker Studio Notebooks,Amazon EMR Notebooks,,,Databricks Collaborative Notebooks,,,,,,,,,GitHub,,,Colab,,,,Javascript,SQL,0.0,0.0,0.0,0.0,0.0,0.0,Bash,Python,Java,0.0,0.0,0.0,Vim / Emacs,0.0,Jupyter (JupyterLab; Jupyter Notebooks; etc),0.0,Jupyter Notebook,Visual Studio Code (VSCode),0.0,Visual Studio,0.0,0.0,0.0,0.0,0.0,Matplotlib,0.0,0.0,0.0,Seaborn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,analyze_data,0.0,data_infrastructure,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Amazon Redshift,PostgreSQL,MongoDB,0.0,0.0,0.0,0.0,Microsoft SQL Server,0.0,SQLite,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Microsoft Power BI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25969,18-29,Man,China,Asia & Oceania,Student,,,,1-3 years,Junior,,Python,Python,PyCharm,,,,,,1-2 years,< 2 years,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Python,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PyCharm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25970,50-59,Man,Sweden,Europe,Research Scientist,None of these activities are an important par...,20+,large,I have never written code,Sin experiencia,"Basic statistical software (Microsoft Excel, G...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25971,40-49,Man,United States of America,America,Data Scientist,,20+,large,5-10 years,Senior,,Python,"Python, SQL","Notepad++,Jupyter Notebook",,,"Matplotlib ,Seaborn",,,4-5 years,2-5 years,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,SQL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Python,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jupyter Notebook,0.0,Notepad++,0.0,0.0,0.0,0.0,0.0,0.0,Matplotlib,0.0,0.0,0.0,Seaborn,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Export step, currently disabled: uncommenting the line below would write
# `df_divididos` to ../datos/datos_sin_unos.csv.
# NOTE(review): the table displayed above carries an "Unnamed: 0" column, which
# usually indicates an index that was written to CSV and read back without
# index_col=0 -- consider index=False here (or index_col=0 when reloading);
# confirm before enabling this export.
#df_divididos.to_csv('../datos/datos_sin_unos.csv')