In [1]:
import pickle
from scipy import stats 
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import warnings
import sidetable
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from geopy.geocoders import Nominatim

plt.rcParams["figure.figsize"] = (10,8)

pd.options.display.max_columns = None

In [2]:
df = pd.read_csv('data/encuesta_analistas.csv', index_col=0)
df.sample()

Unnamed: 0,edad,genero,pais,formacion,ocupacion,lenguajes_programacion,lenguajes_recomendados,IDE,plataforma_computacion,visualizacion,sector,tamaño_empresa,equipo,actividades,salario,productos_big_data,herramienta_data_analytics,notebooks,plataformas_nube,uso_plataformas,formacion_data,fuentes_online
5057,45-49,Man,Japan,No formal education past high school,Data Analyst,"Python, SQL",Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc) , Vim / Emacs,Jupyter Notebook","A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)",Matplotlib,Computers/Technology,"1000-9,999 employees",5-9,"Analyze and understand data to influence product or business decisions, Build and/or r...","80,000-89,999",,"Cloud-based data software & APIs (AWS, GCP, Azure, etc.)","Google Cloud Datalab, Amazon Sagemaker Studio Notebooks , Kaggle Notebooks",Google Cloud Platform (GCP),"Amazon Elastic Compute Cloud (EC2) , Google Cloud Compute Engine",,"Twitter (data science influencers), Kaggle (notebooks, forums, etc)"


In [3]:
df_visualizacion = pd.DataFrame(df['visualizacion'])
df_visualizacion

Unnamed: 0,visualizacion
16,"Matplotlib ,Seaborn ,Plotly / Plotly Express"
32,Ggplot / ggplot2
33,
46,"Matplotlib ,Plotly / Plotly Express"
52,"Plotly / Plotly Express ,Ggplot / ggplot2 ,Shiny"
...,...
25954,Matplotlib
25959,"Matplotlib ,Seaborn , Other"
25965,"Matplotlib ,Seaborn ,Plotly / Plotly Express"
25966,"Matplotlib ,Seaborn ,Ggplot / ggplot2"


In [4]:
def exploracion(df):

    print(f"El dataframe tiene {df.shape[0]} filas y {df.shape[1]} columnas")  
    print("---")
    print(df.info())
    print("---")
    columnas = [df.columns[num] for num in range(len(df.columns))]
    print(f"Los nombres de las columnas son {columnas}")
    print("---")
    print(f"A continuación se muestran los principales estadísticos numéricos del dataframe {df}:")
    print(df.describe().T)
    print(f"A continuación se muestran los principales estadísticos categóricos del dataframe {df}:")
    print(df.describe(include = "object").T)
    print("---")
    print("Los porcentajes de valores nulos por cada columna son:")
    print(df.isnull().sum() * 100/df.shape[0])
    print("---")
    valores_unicos = [len(df[col].value_counts())for col in columnas]
    valores_unicos_por_columna = list(zip(columnas, valores_unicos))
    print("Estos son los valores únicos por cada columna")
    print(valores_unicos_por_columna)
    print("---")
    print("Los valores duplicados por cada columna son:")
    print(df.duplicated().sum())

In [5]:
df

Unnamed: 0,edad,genero,pais,formacion,ocupacion,lenguajes_programacion,lenguajes_recomendados,IDE,plataforma_computacion,visualizacion,sector,tamaño_empresa,equipo,actividades,salario,productos_big_data,herramienta_data_analytics,notebooks,plataformas_nube,uso_plataformas,formacion_data,fuentes_online
16,50-54,Man,Belgium,Bachelor’s degree,Data Analyst,"Python, SQL",Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc) ,Jupyter Notebook",A laptop,"Matplotlib ,Seaborn ,Plotly / Plotly Express",Energy/Mining,"1000-9,999 employees",5-9,Analyze and understand data to influence product or business decisions,"2,000-2,999",,"Local development environments (RStudio, JupyterLab, etc.)","Databricks Collaborative Notebooks , Kaggle Notebooks, Colab Notebooks",,,"Coursera, Kaggle Learn Courses, Cloud-certification programs (direct from AWS, Azure, ...","Kaggle (notebooks, forums, etc), Blogs (Towards Data Science, Analytics Vidhya, etc)"
32,22-24,Nonbinary,United States of America,Some college/university study without earning a bachelor’s degree,Data Analyst,R,R,RStudio,A personal computer / desktop,Ggplot / ggplot2,Manufacturing/Fabrication,250-999 employees,3-4,,,,,Google Cloud Datalab,,,,
33,30-34,Woman,Egypt,Bachelor’s degree,Data Analyst,Python,R,"Notepad++,Jupyter Notebook",A laptop,,Non-profit/Service,0-49 employees,0,Analyze and understand data to influence product or business decisions,"7,500-9,999",,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)","Kaggle Notebooks, Colab Notebooks",,,"Other, DataCamp, Kaggle Learn Courses, Coursera, University Courses (resulting in a un...","Email newsletters (Data Elixir, O'Reilly Data & AI, etc), Kaggle (notebooks, forums, e..."
46,22-24,Man,China,Some college/university study without earning a bachelor’s degree,Data Analyst,"Python, SQL, C, C++, Java, Javascript, MATLAB",Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc) ,Visual Studio ,Visual Studio Code (VSCod...",A personal computer / desktop,"Matplotlib ,Plotly / Plotly Express",Computers/Technology,0-49 employees,0,"Analyze and understand data to influence product or business decisions, Build and/or r...",$0-999,,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)","Kaggle Notebooks, Azure Notebooks",,,"Coursera, Kaggle Learn Courses, edX","Twitter (data science influencers), Kaggle (notebooks, forums, etc), YouTube (Kaggle Y..."
52,40-44,Man,South Africa,Master’s degree,Data Analyst,"R, SQL",R,"RStudio ,Jupyter Notebook",A laptop,"Plotly / Plotly Express ,Ggplot / ggplot2 ,Shiny",Manufacturing/Fabrication,250-999 employees,1-2,Analyze and understand data to influence product or business decisions,"25,000-29,999",,"Local development environments (RStudio, JupyterLab, etc.)",IBM Watson Studio,IBM Cloud / Red Hat,,"Coursera, Udemy","YouTube (Kaggle YouTube, Cloud AI Adventures, etc), Blogs (Towards Data Science, Analy..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25954,25-29,Man,India,Professional doctorate,Data Analyst,Python,Python,Jupyter Notebook,A laptop,Matplotlib,Online Service/Internet-based Services,250-999 employees,0,"Analyze and understand data to influence product or business decisions, Do research th...",$0-999,,,Binder / JupyterHub,,,Coursera,
25959,18-21,Man,India,Bachelor’s degree,Data Analyst,"Python, SQL",Python,"Visual Studio ,Visual Studio Code (VSCode) ,Jupyter Notebook",A personal computer / desktop,"Matplotlib ,Seaborn , Other",Academics/Education,0-49 employees,0,Analyze and understand data to influence product or business decisions,$0-999,,"Business intelligence software (Salesforce, Tableau, Spotfire, etc.)","Kaggle Notebooks, Google Cloud Notebooks (AI Platform / Vertex AI) , Colab Notebooks",,,"Udacity, Cloud-certification programs (direct from AWS, Azure, GCP, or similar), Cours...","Email newsletters (Data Elixir, O'Reilly Data & AI, etc), YouTube (Kaggle YouTube, Clo..."
25965,18-21,Man,India,Master’s degree,Data Analyst,"Python, SQL, C",Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc) ,Visual Studio Code (VSCode) ,PyCharm , S...",A personal computer / desktop,"Matplotlib ,Seaborn ,Plotly / Plotly Express",,,,,,,,"Kaggle Notebooks, Colab Notebooks",,,,
25966,30-34,Man,India,Bachelor’s degree,Data Analyst,"Python, C",Python,"PyCharm ,Jupyter Notebook","A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)","Matplotlib ,Seaborn ,Ggplot / ggplot2",Manufacturing/Fabrication,"10,000 or more employees",20+,Experimentation and iteration to improve existing ML models,"3,000-3,999",,,Google Cloud Datalab,,,,


In [6]:
df_visualizacion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2301 entries, 16 to 25968
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   visualizacion  2108 non-null   object
dtypes: object(1)
memory usage: 36.0+ KB


In [7]:
exploracion(df_visualizacion)

El dataframe tiene 2301 filas y 1 columnas
---
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2301 entries, 16 to 25968
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   visualizacion  2108 non-null   object
dtypes: object(1)
memory usage: 36.0+ KB
None
---
Los nombres de las columnas son ['visualizacion']
---
A continuación se muestran los principales estadísticos numéricos del dataframe                                            visualizacion
16         Matplotlib ,Seaborn ,Plotly / Plotly Express 
32                                     Ggplot / ggplot2 
33                                                  None
46                  Matplotlib ,Plotly / Plotly Express 
52     Plotly / Plotly Express ,Ggplot / ggplot2 ,Shiny 
...                                                  ...
25954                                        Matplotlib 
25959                        Matplotlib ,Seaborn , Other
25965      Matplotl

In [8]:
df.dtypes

edad                          object
genero                        object
pais                          object
formacion                     object
ocupacion                     object
lenguajes_programacion        object
lenguajes_recomendados        object
IDE                           object
plataforma_computacion        object
visualizacion                 object
sector                        object
tamaño_empresa                object
equipo                        object
actividades                   object
salario                       object
productos_big_data            object
herramienta_data_analytics    object
notebooks                     object
plataformas_nube              object
uso_plataformas               object
formacion_data                object
fuentes_online                object
dtype: object

In [9]:
df_visualizacion

Unnamed: 0,visualizacion
16,"Matplotlib ,Seaborn ,Plotly / Plotly Express"
32,Ggplot / ggplot2
33,
46,"Matplotlib ,Plotly / Plotly Express"
52,"Plotly / Plotly Express ,Ggplot / ggplot2 ,Shiny"
...,...
25954,Matplotlib
25959,"Matplotlib ,Seaborn , Other"
25965,"Matplotlib ,Seaborn ,Plotly / Plotly Express"
25966,"Matplotlib ,Seaborn ,Ggplot / ggplot2"


In [10]:
df_visualizacion['visualizacion'] = df_visualizacion['visualizacion'].str.replace('/', ' ')

In [11]:
df_visualizacion

Unnamed: 0,visualizacion
16,"Matplotlib ,Seaborn ,Plotly Plotly Express"
32,Ggplot ggplot2
33,
46,"Matplotlib ,Plotly Plotly Express"
52,"Plotly Plotly Express ,Ggplot ggplot2 ,Shiny"
...,...
25954,Matplotlib
25959,"Matplotlib ,Seaborn , Other"
25965,"Matplotlib ,Seaborn ,Plotly Plotly Express"
25966,"Matplotlib ,Seaborn ,Ggplot ggplot2"


In [15]:
df_notebooks = pd.DataFrame(df['notebooks'])
df_notebooks

Unnamed: 0,notebooks
16,"Databricks Collaborative Notebooks , Kaggle Notebooks, Colab Notebooks"
32,Google Cloud Datalab
33,"Kaggle Notebooks, Colab Notebooks"
46,"Kaggle Notebooks, Azure Notebooks"
52,IBM Watson Studio
...,...
25954,Binder / JupyterHub
25959,"Kaggle Notebooks, Google Cloud Notebooks (AI Platform / Vertex AI) , Colab Notebooks"
25965,"Kaggle Notebooks, Colab Notebooks"
25966,Google Cloud Datalab


In [16]:
df_sum_not = df_notebooks[df_notebooks['notebooks'] == 'Colab Notebooks']

In [19]:
df_sum_not.value_counts()

notebooks      
Colab Notebooks    237
dtype: int64

In [27]:
df_spliteado

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
16,Matplotlib,Seaborn,Plotly Plotly Express,,,,,,,
32,Ggplot ggplot2,,,,,,,,,
33,,,,,,,,,,
46,Matplotlib,Plotly Plotly Express,,,,,,,,
52,Plotly Plotly Express,Ggplot ggplot2,Shiny,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
25954,Matplotlib,,,,,,,,,
25959,Matplotlib,Seaborn,Other,,,,,,,
25965,Matplotlib,Seaborn,Plotly Plotly Express,,,,,,,
25966,Matplotlib,Seaborn,Ggplot ggplot2,,,,,,,


In [45]:
df_split_vis = df["visualizacion"].str.split(",").apply(pd.Series)
df_split_vis


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
16,Matplotlib,Seaborn,Plotly / Plotly Express,,,,,,,
32,Ggplot / ggplot2,,,,,,,,,
33,,,,,,,,,,
46,Matplotlib,Plotly / Plotly Express,,,,,,,,
52,Plotly / Plotly Express,Ggplot / ggplot2,Shiny,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
25954,Matplotlib,,,,,,,,,
25959,Matplotlib,Seaborn,Other,,,,,,,
25965,Matplotlib,Seaborn,Plotly / Plotly Express,,,,,,,
25966,Matplotlib,Seaborn,Ggplot / ggplot2,,,,,,,


In [52]:
df_split_vis.value_counts()

0             1         2                         3                  4       5       6        7       8            9                
 Matplotlib   Seaborn   Plotly / Plotly Express   Ggplot / ggplot2   Shiny   D3 js   Altair   Bokeh   Geoplotlib   Leaflet / Folium     5
dtype: int64

In [51]:
df_borrar = df_split_vis.dropna([2],how='all', inplace=True)

TypeError: supplying multiple axes to axis is no longer supported.

In [35]:
df_split_not = df["notebooks"].str.split(",").apply(pd.Series)
df_split_not

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
16,Databricks Collaborative Notebooks,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,
32,Google Cloud Datalab,,,,,,,,,,,,,,,
33,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,
46,Kaggle Notebooks,Azure Notebooks,,,,,,,,,,,,,,
52,IBM Watson Studio,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25954,Binder / JupyterHub,,,,,,,,,,,,,,,
25959,Kaggle Notebooks,Google Cloud Notebooks (AI Platform / Vertex AI),Colab Notebooks,,,,,,,,,,,,,
25965,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,,
25966,Google Cloud Datalab,,,,,,,,,,,,,,,


In [36]:
df_split_not.stb.missing()

Unnamed: 0,missing,total,percent
15,2300,2301,99.956541
9,2297,2301,99.826163
10,2297,2301,99.826163
11,2297,2301,99.826163
12,2297,2301,99.826163
13,2297,2301,99.826163
14,2297,2301,99.826163
8,2295,2301,99.739244
7,2294,2301,99.695784
6,2291,2301,99.565406


In [34]:
df_spliteado.stb.missing()

Unnamed: 0,missing,total,percent
9,2296,2301,99.782703
8,2295,2301,99.739244
7,2291,2301,99.565406
6,2280,2301,99.087353
5,2256,2301,98.044329
4,2171,2301,94.350282
3,1967,2301,85.484572
2,1533,2301,66.623207
1,893,2301,38.809213
0,193,2301,8.387658


In [37]:
import re

columnas_colab = df_split_not.filter(regex=re.compile('Colab Notebooks', flags=re.IGNORECASE)).columns

In [38]:
columnas_colab

Int64Index([], dtype='int64')

In [41]:
valor_buscado = 'Colab Notebooks'

for columna in columnas_colab:
    if df_split_not[columna].eq(valor_buscado).any():
        print(f"El valor {valor_buscado} se encuentra en la columna {columna}.")

In [43]:
import re

valor_buscado = 'Colab Notebooks'
patron_columnas = 'Colab Notebooks'
conteo_total= 0

for columna in df_split_not:
    conteo_columna = df_split_not[columna].eq(valor_buscado).sum()
    conteo_total += conteo_columna
    print(f"La columna {columna} contiene {conteo_columna} ocurrencias del valor {valor_buscado}.")

print(f"En total, se encontraron {conteo_total} ocurrencias del valor")

La columna 0 contiene 243 ocurrencias del valor Colab Notebooks.
La columna 1 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 2 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 3 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 4 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 5 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 6 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 7 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 8 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 9 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 10 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 11 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 12 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 13 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 14 contiene 0 ocurrencias del valor Colab Notebooks.
La columna 15 contiene 0 ocurrencias del valor C

In [44]:
valor_buscado = 'Kaggle Notebooks'
patron_columnas = 'Kaggle Notebooks'
conteo_total= 0

for columna in df_split_not:
    conteo_columna = df_split_not[columna].eq(valor_buscado).sum()
    conteo_total += conteo_columna
    print(f"La columna {columna} contiene {conteo_columna} ocurrencias del valor {valor_buscado}.")

print(f"En total, se encontraron {conteo_total} ocurrencias del valor")

La columna 0 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 1 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 2 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 3 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 4 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 5 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 6 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 7 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 8 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 9 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 10 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 11 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 12 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 13 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 14 contiene 0 ocurrencias del valor Kaggle Notebooks.
La columna 15 contiene 0 ocurrencia