In [1]:
# URL of the Stack Overflow Developer Survey 2023 zip archive.
DATA_URL = r'https://cdn.stackoverflow.co/files/jo7n4k8s/production/49915bfd46d0902c3564fd9a06b509d08a20488c.zip/stack-overflow-developer-survey-2023.zip'
# Relative directory handed to download_and_unzip as the download/extraction target.
DATA_PATH = '../data/raw/'
# File name of the survey results CSV that is read from DATA_PATH.
SURVEY_FILENAME = 'survey_results_public.csv'

In [2]:
import pandas as pd 
import urllib.request
import zipfile
import os

# Allow up to 10,000 rows to be rendered before pandas truncates the output.
pd.set_option("display.max_rows", 10000)

### Functions

In [3]:
def download_and_unzip(url, extract_path):
    """Download a zip archive from ``url`` and extract it into ``extract_path``.

    The archive is stored inside ``extract_path`` under the last
    "/"-separated segment of ``url`` and is left on disk after extraction.

    Args:
        url: Address of the zip archive, as a string accepted by
            ``urllib.request.urlopen``.
        extract_path: Directory that receives both the archive and its
            extracted contents. It is created if it does not exist yet.

    Returns:
        The archive's file name (the last "/"-separated segment of ``url``).
    """
    # Local import: shutil is not part of the notebook's import cell.
    import shutil

    zip_name = url.split("/")[-1]
    zip_path = os.path.join(extract_path, zip_name)

    # A fresh checkout may not contain the target directory yet.
    os.makedirs(extract_path or ".", exist_ok=True)

    # Stream the response to disk in chunks rather than loading the whole
    # archive into memory with a single read().
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as file:
        shutil.copyfileobj(response, file)

    with zipfile.ZipFile(zip_path, "r") as zip_file:
        zip_file.extractall(extract_path)

    return zip_name

## Explore

In [4]:
# Fetch the survey archive and unpack it into the raw-data directory.
zip_name = download_and_unzip(url=DATA_URL, extract_path=DATA_PATH)

In [5]:
# Path of the survey results CSV under DATA_PATH.
results_path = os.path.join(DATA_PATH, SURVEY_FILENAME)

In [6]:
# Load the survey results into a DataFrame and display its (rows, columns) shape.
raw_df = pd.read_csv(results_path)
raw_df.shape

(89184, 84)

In [7]:
# Display one randomly chosen response.
# Observations: multi-select answers are ';'-separated and need to be split;
# the survey schema is needed to understand the columns.
# A fixed seed keeps the displayed row identical on every Restart & Run All.
raw_df.sample(1, random_state=42).iloc[0]

ResponseId                                                                          6854
Q120                                                                             I agree
MainBranch                                                I am a developer by profession
Age                                                                      25-34 years old
Employment                                                           Employed, full-time
RemoteWork                                          Hybrid (some remote, some in-person)
CodingActivities                                Hobby;Contribute to open-source projects
EdLevel                                     Bachelor’s degree (B.A., B.S., B.Eng., etc.)
LearnCode                              Books / Physical media;On the job training;Oth...
LearnCodeOnline                        Formal documentation provided by the owner of ...
LearnCodeCoursesCert                                                                 NaN
YearsCode            

In [8]:
# Print the column names, non-null counts and dtypes of the data frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89184 entries, 0 to 89183
Data columns (total 84 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ResponseId                           89184 non-null  int64  
 1   Q120                                 89184 non-null  object 
 2   MainBranch                           89184 non-null  object 
 3   Age                                  89184 non-null  object 
 4   Employment                           87898 non-null  object 
 5   RemoteWork                           73810 non-null  object 
 6   CodingActivities                     73764 non-null  object 
 7   EdLevel                              87973 non-null  object 
 8   LearnCode                            87663 non-null  object 
 9   LearnCodeOnline                      70084 non-null  object 
 10  LearnCodeCoursesCert                 37076 non-null  object 
 11  YearsCode                   

In [9]:
# Get summary statistics for the numerical columns.
# NOTE(review): CompTotal shows a mean around 1e42 and a max of 5e46, which
# points to extreme outliers -- confirm and handle before using this column.
raw_df.describe()

Unnamed: 0,ResponseId,CompTotal,WorkExp,ConvertedCompYearly
count,89184.0,48225.0,43579.0,48019.0
mean,44592.5,1.036807e+42,11.405126,103110.1
std,25745.347541,2.276847e+44,9.051989,681418.8
min,1.0,0.0,0.0,1.0
25%,22296.75,63000.0,5.0,43907.0
50%,44592.5,115000.0,9.0,74963.0
75%,66888.25,230000.0,16.0,121641.0
max,89184.0,5e+46,50.0,74351430.0


In [10]:
# Columns stored as object dtype although they look numeric: list their
# distinct values to see which entries prevent a numeric conversion.
questionable_cols = ['YearsCodePro', 'YearsCode']

for col in questionable_cols:
    # One print call emits the same four lines: name, values, rule, blank line.
    print(
        col,
        raw_df[col].unique().tolist(),
        '--------------------------',
        '',
        sep='\n',
    )

YearsCodePro
[nan, '9', '23', '7', '4', '21', '3', '15', 'Less than 1 year', '10', '2', '6', '14', '5', '19', '13', '16', '28', '1', '30', '11', '8', '25', '32', '24', '40', '17', '45', '29', '12', '31', '20', '18', '50', '27', '43', '22', '26', '38', '33', '44', '35', '34', '37', '42', '41', 'More than 50 years', '47', '36', '39', '48', '46', '49']
--------------------------

YearsCode
[nan, '18', '27', '12', '6', '21', '4', '5', '20', '14', '10', '15', '11', '3', '24', '8', '13', 'Less than 1 year', '16', '33', '22', '30', '32', '7', '35', '28', '40', '17', '29', '19', 'More than 50 years', '9', '38', '26', '34', '25', '2', '45', '23', '31', '43', '1', '48', '41', '50', '39', '42', '37', '36', '44', '46', '49', '47']
--------------------------

