# dtale Setup

In [1]:
import dtale
import pandas as pd

# Load the survey responses that D-Tale will serve
survey_csv = '../../data/normalized_normalized_survey.csv'
df = pd.read_csv(survey_csv)

# Start a D-Tale instance for the frame on port 4000 and print the URL to open in a browser
d = dtale.show(df, port=4000)
dtale_url = d._main_url
print(dtale_url)

http://HwaRonZ:4000/dtale/main/1


In [2]:
# End the D-Tale session: shut down the instance `d` started by dtale.show() in the setup cell
d.kill()

2024-12-12 23:41:52,647 - INFO     - Executing shutdown...
2024-12-12 23:41:52,650 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer


# Simple EDA Questions

## General Data Exploration

#### 1. What is the distribution of age among respondents?

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Enable offline mode for Plotly in Jupyter
init_notebook_mode(connected=True)

# Your dataset
df = pd.read_csv('../../data/normalized_normalized_survey.csv')

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is your age (# years)?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

# Display the chart
iplot(figure)

#### 2. What is the gender distribution?

In [2]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is your gender?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 3. What is the distribution of respondents across different countries?

In [3]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'In which country do you currently reside?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

## Education and Professional Background

#### 4. What are the highest levels of formal education attained by respondents?

In [4]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Enable Plotly in offline mode
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'

# Filter and prepare the data for plotting: count each non-null answer
# (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Create the bar chart
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 5. What are the most common job titles among respondents?

In [5]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'Select the title most similar to your current role (or most recent title if retired)'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 6. What is the distribution of company sizes where respondents are employed?

In [6]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is the size of the company where you are employed?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 7. How many individuals are responsible for data science workloads in different companies?

In [7]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label.
# Deriving the labels from it keeps them identical to the column (no stray '??').
column = 'Approximately how many individuals are responsible for data science workloads at your place of business?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

## Machine Learning Integration & Usage

#### 8. What percentage of respondents' employers incorporate machine learning methods?

In [8]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'Does your current employer incorporate machine learning methods into their business?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
# Share of respondents per answer, in percent (kept on `chart` for inspection)
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 9. What is the distribution of the number of years respondents have used machine learning methods?

In [9]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'For how many years have you used machine learning methods?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 10. What programming languages are recommended by respondents for aspiring data scientists to learn first?

In [10]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label.
# Deriving the labels from it keeps them identical to the column (no stray '??').
column = 'What programming language would you recommend an aspiring data scientist to learn first?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

## Compensation & Spending

#### 11. What is the distribution of current yearly compensation for respondents?

In [11]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is your current yearly compensation (approximate $USD)?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 12. How much have respondents spent on machine learning and/or cloud computing products in the past 5 years?

In [12]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label.
# Deriving the labels from it keeps them identical to the column (no stray '??').
column = 'Approximately how much money have you spent on machine learning and/or cloud computing products at your work in the past 5 years?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 13. What is the relationship between yearly compensation and money spent on machine learning/cloud computing?

In [16]:
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import re

# Switch Plotly to offline rendering inside the notebook
init_notebook_mode(connected=True)

# Column headers (the survey question texts) of the two answers compared in this cell
compensation_column = 'What is your current yearly compensation (approximate $USD)?'
spending_column = 'Approximately how much money have you spent on machine learning and/or cloud computing products at your work in the past 5 years?'

# Read a fresh copy of the survey responses
df = pd.read_csv('../../data/normalized_normalized_survey.csv')

# Function to convert range strings to average values
def convert_to_avg(value):
    """Return the midpoint of a numeric range string such as '10,000-19,999'.

    The string must start with a ``low-high`` range. Each bound may carry any
    number of thousands separators (or none), an optional leading ``$`` and
    surrounding spaces, so '0-999' and '1,000,000-1,999,999' are handled as
    well as '10,000-19,999'. Text after the range is ignored.

    Args:
        value: A cell value; typically a range string, but any object is accepted.

    Returns:
        float: ``(low + high) / 2`` when ``value`` is a string starting with a range.
        Otherwise ``value`` itself, unchanged (non-strings such as NaN or
        numbers, and strings that do not start with a range).
    """
    if not isinstance(value, str):
        return value
    # Each bound is a digit followed by any mix of digits and commas, so the
    # whole number is captured regardless of how many comma groups it has.
    match = re.match(r'\s*\$?\s*(\d[\d,]*)\s*-\s*\$?\s*(\d[\d,]*)', value)
    if match is None:
        return value
    low = int(match.group(1).replace(',', ''))
    high = int(match.group(2).replace(',', ''))
    return (low + high) / 2

# Work on a copy holding only the columns this chart needs, so the shared `df`
# keeps every respondent and the original answer strings for the cells that follow.
hover_column = 'What is your gender?'
comp_spend = df[[compensation_column, spending_column, hover_column]].copy()

# Apply the conversion function, then coerce to numbers: anything that is still
# not numeric after convert_to_avg becomes NaN.
for numeric_column in (compensation_column, spending_column):
    comp_spend[numeric_column] = pd.to_numeric(
        comp_spend[numeric_column].apply(convert_to_avg), errors='coerce'
    )

# Filter after coercion so only rows with two usable numbers remain
comp_spend = comp_spend.dropna(subset=[compensation_column, spending_column])

# Create the scatter plot
scatter = go.Scatter(
    x=comp_spend[compensation_column],
    y=comp_spend[spending_column],
    mode='markers',
    marker=dict(size=10, color='blue', opacity=0.6),
    text=comp_spend[hover_column],  # Optional: add gender or other column for hover text
    name='Compensation vs. Spending'
)

# Create the figure
figure = go.Figure(data=[scatter], layout=go.Layout({
    'title': {'text': 'Relationship between Yearly Compensation and Money Spent on ML/Cloud Computing'},
    'xaxis': {'title': {'text': 'Yearly Compensation (USD)'}},
    'yaxis': {'title': {'text': 'Money Spent on ML/Cloud Computing (USD)'}},
}))

# Display the chart in the Jupyter notebook
iplot(figure)

## Tools & Technologies

#### 14. What are the primary tools used for data analysis at work or school?

In [17]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What is the primary tool that you use at work or school to analyze data?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 15. How long have respondents been writing code to analyze data?

In [18]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'How long have you been writing code to analyze data (at work or at school)?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts)
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 16. What data visualization libraries or tools are most commonly used?

In [19]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'What data visualization libraries or tools do you use on a regular basis?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts).
# Each distinct cell value is counted as a whole.
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 17. Which machine learning algorithms and frameworks are used regularly?

In [20]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'Which of the following ML algorithms do you use on a regular basis?'

# Count each non-null answer (Series.value_counts replaces the deprecated pd.value_counts).
# Each distinct cell value is counted as a whole.
s = df[column].dropna()
chart = s.value_counts().to_frame(name='data')
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent answers first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

## Additional

#### 18. What are the favorite media sources for data science topics?

In [21]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Render Plotly figures inline in the notebook
init_notebook_mode(connected=True)

# Survey question to summarise; the text is both the column name and the chart label
column = 'Who/what are your favorite media sources that report on data science topics?'

# Split every non-null answer on whitespace and count the individual words
# (Series.value_counts replaces the deprecated pd.value_counts).
# NOTE(review): multi-word source names are therefore counted word by word — confirm this is intended.
s = df[column].dropna()
words = s.str.split(expand=True).stack()
chart = words.value_counts().to_frame(name='data').sort_index()
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent words first, ties broken by label; plot at most 100 bars
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart[:100]

# Build the bars from this cell's own counts (not from `charts` left by an earlier cell)
charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]
figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': column + ' Word Value Counts'},
    'xaxis': {'title': {'text': column}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

iplot(figure)

#### 19. On which platforms have respondents completed data science courses?

In [22]:
# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Enable offline mode for Plotly so the figure renders inside the notebook
init_notebook_mode(connected=True)

if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):
    df = df.to_frame(index=False)

# remove any pre-existing indices for ease of use in the D-Tale code, but this is not required
df = df.reset_index().drop('index', axis=1, errors='ignore')
df.columns = [str(c) for c in df.columns]  # update columns to strings in case they are numbers

# Answers to the question, with missing responses removed
s = df['On which platforms have you begun or completed data science courses?'].dropna()

# Word-level counts: every answer is split on whitespace and each resulting token is tallied.
# NOTE(review): a platform whose name contains spaces is counted as several separate words;
# if the column stores multi-word platforms, split on the actual delimiter instead - confirm against the data.
chart = s.str.split(expand=True).stack().value_counts()
chart = chart.to_frame(name='data').sort_index()
chart['percent'] = (chart['data'] / chart['data'].sum()) * 100
chart.index.name = 'labels'
# Most frequent first; ties are broken alphabetically by label
chart = chart.reset_index().sort_values(['data', 'labels'], ascending=[False, True])
chart = chart.head(100)  # cap the plot at the 100 most frequent words

charts = [go.Bar(x=chart['labels'].values, y=chart['data'].values, name='Frequency')]

# The D-Tale export advises stripping 'id' from traces before calling iplot.
# This must run after 'charts' is built in this cell; running it earlier would
# silently act on traces left over from a previously executed cell.
for trace in charts:
    trace.pop('id', None)

figure = go.Figure(data=charts, layout=go.Layout({
    'barmode': 'group',
    'legend': {'orientation': 'h'},
    'title': {'text': 'On which platforms have you begun or completed data science courses? Word Value Counts'},
    'xaxis': {'title': {'text': 'On which platforms have you begun or completed data science courses?'}},
    'yaxis': {'title': {'text': 'Frequency'}}
}))

# Display the chart
iplot(figure)

# Advanced EDA Questions

## Demographic Analysis

#### Age Distribution by Gender: How does the age distribution vary between different genders?

In [23]:
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Switch Plotly to offline rendering for this notebook
init_notebook_mode(connected=True)

# D-Tale boilerplate: turn an index-like 'df' into a plain frame
if isinstance(df, (pd.DatetimeIndex, pd.MultiIndex)):
    df = df.to_frame(index=False)

# Flatten any existing index and force string column labels (optional, kept from the D-Tale export)
df = df.reset_index().drop('index', axis=1, errors='ignore')
df.columns = [str(c) for c in df.columns]

# Keep only rows answered 'Female' or 'Male' and expose the age column under a short name
gender_mask = df['What is your gender?'].isin(['Female', 'Male'])
chart_data = df[gender_mask].rename(columns={'What is your age (# years)?': 'age'})

# Number of respondents for every (gender, age) combination
chart_data_count = (
    chart_data
    .groupby(['What is your gender?', 'age'])
    .size()
    .reset_index(name='count')
)

# Per-gender slices of the counts table
female_data = chart_data_count.loc[chart_data_count['What is your gender?'] == 'Female']
male_data = chart_data_count.loc[chart_data_count['What is your gender?'] == 'Male']

# One bar trace per gender: age on x, respondent count on y
charts = [
    go.Bar(x=subset['age'], y=subset['count'], name=label)
    for label, subset in (('Female', female_data), ('Male', male_data))
]

# Grouped bars with a horizontal legend
layout = go.Layout(
    barmode='group',
    legend={'orientation': 'h', 'y': -0.3},
    title={'text': 'Age Distribution by Gender'},
    xaxis={'title': {'text': 'Age'}},
    yaxis={'title': {'text': 'Count'}, 'type': 'linear'},
)
figure = go.Figure(data=charts, layout=layout)

iplot(figure)