In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re  
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")
%matplotlib inline
plt.rcParams['figure.figsize'] = [8,5]
plt.rcParams['font.size'] =14
plt.rcParams['font.weight']= 'bold'
sns.set()

In [2]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets 
import torchvision.transforms as tt

In [3]:
# Load the Stack Overflow 2021 survey answers; the path is resolved relative
# to the directory the kernel was started in.
df = pd.read_csv(
    filepath_or_buffer='Downloads/stack-overflow-developer-survey-2021/survey_results_public.csv'
)

In [4]:
# Show one randomly drawn respondent as a Series (no seed is passed, so the
# row differs from run to run).
df.sample(n=1).iloc[0]

ResponseId                                                                  70549
MainBranch                                            I code primarily as a hobby
Employment                                                    I prefer not to say
Country                                                  United States of America
US_State                                                                     Iowa
UK_Country                                                                    NaN
EdLevel                                                            Something else
Age1stCode                                                           5 - 10 years
LearnCode                       Other online resources (ex: videos, blogs, etc...
YearsCode                                                                       7
YearsCodePro                                                                  NaN
DevType                                                                       NaN
OrgSize         

In [5]:
# Columns whose ';'-separated answers will be turned into lists:
# the positional slice 16:30 of the frame's columns, plus 'DevType'.
to_split = [*df.columns[16:30], 'DevType']

In [6]:
# NaN cannot be split on ';', so every missing answer is replaced by the
# placeholder string "missing" before the split is attempted.
# NOTE(review): no column subset is given, so the placeholder is written into
# every column that has NaN; any numeric column among them becomes object
# dtype. Confirm that this is intended.
df = df.fillna(value="missing")

In [7]:
# Dry run on the first column of to_split: split on ';' and look at one
# randomly chosen result without modifying df.
(
    df[to_split[0]]
    .map(lambda answer: answer.split(';'))
    .sample(n=1)
    .iloc[0]
)

['C#', 'HTML/CSS', 'JavaScript', 'SQL']

In [8]:
# Turn every ';'-delimited answer string in the to_split columns into a list.
# The isinstance guard keeps this cell idempotent: once a column already holds
# lists, re-running the cell leaves them untouched instead of raising
# AttributeError (a list has no .split method).
for column in to_split:
    df[column] = df[column].apply(
        lambda answer: answer.split(';') if isinstance(answer, str) else answer
    )

In [9]:
# Inspect one random respondent to confirm that the split columns now hold lists.
df.sample(n=1).iloc[0]

ResponseId                                                                  68770
MainBranch                                         I am a developer by profession
Employment                                                     Employed full-time
Country                                                                    Brazil
US_State                                                                  missing
UK_Country                                                                missing
EdLevel                              Bachelor’s degree (B.A., B.S., B.Eng., etc.)
Age1stCode                                                          25 - 34 years
LearnCode                       Coding Bootcamp;Other online resources (ex: vi...
YearsCode                                                                      10
YearsCodePro                                                                    7
DevType                                                     [Developer, back-end]
OrgSize         

### I think it worked well

In [10]:
# Print each column's non-null count and dtype for the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83439 entries, 0 to 83438
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   ResponseId                    83439 non-null  int64 
 1   MainBranch                    83439 non-null  object
 2   Employment                    83439 non-null  object
 3   Country                       83439 non-null  object
 4   US_State                      83439 non-null  object
 5   UK_Country                    83439 non-null  object
 6   EdLevel                       83439 non-null  object
 7   Age1stCode                    83439 non-null  object
 8   LearnCode                     83439 non-null  object
 9   YearsCode                     83439 non-null  object
 10  YearsCodePro                  83439 non-null  object
 11  DevType                       83439 non-null  object
 12  OrgSize                       83439 non-null  object
 13  Currency        

In [11]:
# YearsCode and YearsCodePro are reported as object dtype although they should
# hold numbers, so some entries must be non-numeric text.
# -------------------------------------------------------------
# Rather than eyeballing df[column].unique() (easy to misread when a column
# has many distinct values), list only the values that are neither a plain
# integer nor the "missing" placeholder.
check_columns = ['YearsCode', 'YearsCodePro']

# Raw string so that \d reaches the regex engine unchanged, and a
# non-capturing group so that pandas does not warn about match groups.
numeric_or_missing = r'^(?:\d+|missing)$'

for column in check_columns:
    is_expected = df[column].str.contains(numeric_or_missing)
    print(df.loc[~is_expected, column].unique())

['Less than 1 year' 'More than 50 years']
['Less than 1 year' 'More than 50 years']


In [12]:
# Numeric stand-ins (kept as strings) for the two textual answers.
# '51' is an arbitrary pick for "more than 50"; any value above 50 would do.
map_values = dict([
    ('Less than 1 year', '0'),
    ('More than 50 years', '51'),
])

In [13]:
# Swap the textual answers for their stand-ins from map_values; every other
# value is passed through unchanged.
for column in check_columns:
    df[column] = df[column].apply(lambda value: map_values.get(value, value))


In [14]:
# List the distinct values left in each checked column after the replacement.
for column in check_columns:
    distinct_values = df[column].unique()
    print(distinct_values)

['missing' '7' '17' '3' '4' '6' '16' '12' '15' '10' '40' '9' '26' '14'
 '39' '20' '8' '19' '5' '0' '22' '2' '1' '34' '21' '13' '25' '24' '30'
 '31' '18' '38' '51' '27' '41' '42' '35' '23' '28' '11' '37' '44' '43'
 '36' '33' '45' '29' '50' '46' '32' '47' '49' '48']
['missing' '10' '4' '5' '6' '2' '30' '9' '18' '12' '21' '1' '16' '0' '15'
 '3' '35' '7' '8' '17' '14' '26' '25' '20' '50' '34' '11' '24' '22' '13'
 '31' '23' '39' '41' '27' '28' '19' '33' '51' '37' '29' '32' '43' '40'
 '38' '45' '42' '46' '36' '44' '47' '48' '49']


### Everything seems to be good

In [16]:
# Review all the changes made so far on one randomly chosen respondent.
df.sample(n=1).iloc[0]

ResponseId                                                                  16495
MainBranch                                         I am a developer by profession
Employment                      Independent contractor, freelancer, or self-em...
Country                                                                    Turkey
US_State                                                                  missing
UK_Country                                                                missing
EdLevel                              Bachelor’s degree (B.A., B.S., B.Eng., etc.)
Age1stCode                                                           5 - 10 years
LearnCode                       Other online resources (ex: videos, blogs, etc...
YearsCode                                                                      10
YearsCodePro                                                                    2
DevType                         [Academic researcher, Student, Developer, embe...
OrgSize         