In [1]:
# Constants
# Both paths are relative to the notebook's working directory so the notebook
# does not depend on one user's home directory.
DATA_PATH = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_df.pkl"

# Numeric stand-ins for the open-ended textual answers, keyed by column name
REPLACE_DICT = {
    'Age1stCode': {'Younger than 5 years': 4, 'Older than 85': 86},
    'YearsCodePro': {'Less than 1 year': 0, 'More than 50 years': 51},
    'YearsCode':    {'Less than 1 year': 0, 'More than 50 years': 51},
    }

# Columns whose values can hold multiple answers in a single string
COLUMNS_NEED_SPLIT = ["DatabaseDesireNextYear", "DatabaseWorkedWith",
                      "DevType",
                      "LanguageWorkedWith", "LanguageDesireNextYear",
                      "NEWCollabToolsDesireNextYear", "NEWCollabToolsWorkedWith",
                      "OpSys", "PlatformDesireNextYear", "PlatformWorkedWith",
                      "UndergradMajor",
                      "MiscTechDesireNextYear", "MiscTechWorkedWith",
                      "WebframeDesireNextYear", "WebframeWorkedWith"]

# Columns that mix numbers with the textual answers listed in REPLACE_DICT
# and therefore have to be converted to a numeric dtype
COLUMNS_HAVE_PROB = ["Age1stCode", "YearsCodePro", "YearsCode"]
COLUMNS_TO_USE = COLUMNS_HAVE_PROB + COLUMNS_NEED_SPLIT

# Job-hunt related columns
COLUMNS_FOR_JOB_HUNT = ["NEWJobHuntResearch", "NEWJobHunt"]

***
<h3 id="heading"><span style="background-color:#cefffb; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 📥 Import packages & data </span></h3>

In [2]:
# Load necessary package 
import pandas as pd
import numpy as np
import pickle
import logging
import math
import random
import os

# Let pandas render up to 50000 rows before it truncates a displayed object.
pd.options.display.max_rows = 50000

# Read the survey CSV located at DATA_PATH into the raw, untouched frame.
raw_df = pd.read_csv(DATA_PATH)

***
<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 📢 Helper Function  </span></h2>

In [3]:
# Function
def split_answers(column_serie, separator=';'):
    """
    Split multi-answer strings into lists of single answers.

    Parameters:
    * column_serie (pd.Series): string series whose values may hold several
      answers joined by ``separator``
    * separator (str): literal text that delimits the answers; it is never
      interpreted as a regular expression

    Returns:
    * pd.Series: ``column_serie`` itself, unchanged, when no value contains
      ``separator``; otherwise a new series with the same index in which every
      string is replaced by its list of answers and every missing or
      non-string value is replaced by an empty list

    Raises:
    * ValueError: if ``separator`` is empty and the series holds strings
      (raised by ``str.split``)
    """
    # Literal containment test (regex=False) so separators such as '|' or '.'
    # are not read as regex syntax; missing values count as "no separator".
    has_separator = column_serie.str.contains(separator, regex=False, na=False)
    if not has_separator.any():
        return column_serie

    # Python's str.split always treats the separator literally. Anything that
    # is not a string (NaN / None / other objects) becomes an empty list.
    return column_serie.apply(
        lambda answer: answer.split(separator) if isinstance(answer, str) else []
    )

***
<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 🧮 Preprocessing </span></h2>

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia"> 📋 Preprocessing Data </span></h4>

In [4]:
# Work on an independent (deep) copy so raw_df keeps the original answers.
df = raw_df.copy(deep=True)

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia">1. 🔨🪛 Split Multiple Value
</span></h4>

In [5]:
# Run split_answers over every object-dtype column of df, using ';' as the
# separator, and store the result back in the same column.
COLUMNS_SPLIT = list(df.select_dtypes(include='object').columns)
for column in COLUMNS_SPLIT:
    df[column] = split_answers(column_serie=df[column], separator=';')

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia">2. 🔨🪛 Replace Values </span></h4>

In [6]:
# Map the textual answers listed in REPLACE_DICT to their numeric stand-ins,
# then cast the affected columns to float32.
df[COLUMNS_HAVE_PROB] = df[COLUMNS_HAVE_PROB].replace(REPLACE_DICT).astype(np.float32)

# Fill missing values with the column mean.
# The result is assigned back to df[column]: calling fillna(inplace=True) on
# df[column] is chained assignment, which is deprecated and does not update
# df when pandas Copy-on-Write is active.
for column in COLUMNS_HAVE_PROB:
    df[column] = df[column].fillna(df[column].mean())


***
<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 💱 Check the change  </span></h2>

In [7]:
# Spot-check one random cell: same column and same row position in the
# processed frame and in the raw frame.
test_col = random.choice(df.columns)
# random.randint includes both bounds, so the upper bound must be len(df) - 1
# to stay a valid position for .iloc.
n_sample = random.randint(0, len(df) - 1)
print(test_col + '\n'+'-'*20)
print('In The Modified Data :', df[test_col].iloc[n_sample])
print('In The Actuel Data   :', raw_df[test_col].iloc[n_sample])

NEWStuck
--------------------
In The Modified Data : ['Call a coworker or friend', 'Visit Stack Overflow', 'Go for a walk or other physical activity', 'Panic', 'Do other work and come back later']
In The Actuel Data   : Call a coworker or friend;Visit Stack Overflow;Go for a walk or other physical activity;Panic;Do other work and come back later


***
<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 📤 Exporting Pickle </span></h2>

In [8]:
# Show the path the processed frame is about to be written to.
EXPORT_PATH

'../data/processed/1_preprocessed_df.pkl'

In [9]:
# Create the destination folder first: open() raises FileNotFoundError when
# the parent directory of EXPORT_PATH does not exist yet.
os.makedirs(os.path.dirname(EXPORT_PATH) or ".", exist_ok=True)

# Serialize the processed frame to EXPORT_PATH.
with open(EXPORT_PATH, 'wb') as f:
    pickle.dump(df, f)

# We can also use: $ df.to_pickle(EXPORT_PATH)




In [10]:
# Read the pickle back from EXPORT_PATH
xr = pd.read_pickle(filepath_or_buffer=EXPORT_PATH)

### **🏅The index of respondents who were older than 85 when they started coding**

In [11]:
# Row labels of the respondents whose raw Age1stCode answer is "Older than 85"
print("The index of person that  was has more than 85 years when they start coding:\n\n",
      raw_df.loc[raw_df["Age1stCode"].eq("Older than 85")].index.values)
print("-" * 60)
# Counting the True entries of the mask gives the number of matching rows
print("The number of person that  was has more than 85 years when they start coding:-->",
      raw_df["Age1stCode"].eq("Older than 85").sum())

The index of person that  was has more than 85 years when they start coding:

 [ 2625  6125  6964 25796 33363 35362 37797 39170 41380 42259 42328 45732
 64288]
------------------------------------------------------------
The number of person that  was has more than 85 years when they start coding:--> 13


 1. **Display a random respondent who was older than 85 when they started coding**

In [12]:
 # One randomly drawn row among the "Older than 85" answers, shown as a Series
 raw_df.loc[raw_df["Age1stCode"].eq("Older than 85")].sample(n=1).iloc[0]

Respondent                                                                   6150
MainBranch                                         I am a developer by profession
Hobbyist                                                                      Yes
Age                                                                          25.0
Age1stCode                                                          Older than 85
CompFreq                                                                      NaN
CompTotal                                                                     NaN
ConvertedComp                                                                 NaN
Country                                                               South Korea
CurrencyDesc                                                                  NaN
CurrencySymbol                                                                NaN
DatabaseDesireNextYear                                                        NaN
DatabaseWorkedWi

### **🏅The index of respondents who were younger than 5 when they started coding**

In [13]:
# Row labels of the respondents whose raw Age1stCode answer is "Younger than 5 years"
print("The index of person that  was has less than 5 years when they start coding:\n\n",
      raw_df.loc[raw_df["Age1stCode"].eq("Younger than 5 years")].index.values)
print("-" * 60)
# Counting the True entries of the mask gives the number of matching rows
print("The number of person that  was has less than 5 years when they start coding:-->",
      raw_df["Age1stCode"].eq("Younger than 5 years").sum())

The index of person that  was has less than 5 years when they start coding:

 [  209   372   534   566   634   732   872   948  1001  1446  1553  1567
  1906  1972  2159  2266  2287  2846  3264  3484  3710  3840  3876  3931
  4043  4080  4279  4944  5601  5630  5774  5814  5919  5930  5952  5999
  6142  6205  6382  6419  6815  7068  7244  7278  7370  7371  7763  7859
  7994  8103  8198  8750  8851  8859  9101  9139  9360  9461  9802  9936
 10050 10166 10511 10653 10702 10839 11017 11285 11337 11407 11600 11781
 11938 11946 12004 12344 12429 12465 12634 12783 12882 12987 13035 13210
 13555 13847 13973 14058 14322 14529 14714 15004 15039 15335 15509 15530
 15558 15992 16151 17061 17210 17319 17517 17545 17641 17937 18024 18259
 18723 18919 18982 19003 19305 19587 19824 19874 19927 19961 20238 20331
 20410 20576 20700 20902 20946 21084 21140 21165 21457 21518 22431 22794
 22891 23047 23060 23091 23356 23365 23543 23708 24105 24110 24124 24357
 24675 24738 24894 24908 24944 25286 25330 253

 1. **Display a random respondent who was younger than 5 when they started coding**

In [14]:
 # One randomly drawn row among the "Younger than 5 years" answers, shown as a Series
 raw_df.loc[raw_df["Age1stCode"].eq("Younger than 5 years")].sample(n=1).iloc[0]

Respondent                                                                  52325
MainBranch                                         I am a developer by profession
Hobbyist                                                                      Yes
Age                                                                          32.0
Age1stCode                                                   Younger than 5 years
CompFreq                                                                   Yearly
CompTotal                                                                 95000.0
ConvertedComp                                                             95000.0
Country                                                             United States
CurrencyDesc                                                 United States dollar
CurrencySymbol                                                                USD
DatabaseDesireNextYear                                                        NaN
DatabaseWorkedWi

In [15]:
# List every column name of the raw frame
print(list(raw_df.columns))

['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq', 'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc', 'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith', 'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors', 'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith', 'MiscTechDesireNextYear', 'MiscTechWorkedWith', 'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps', 'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch', 'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms', 'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites', 'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear', 'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount', 'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength', 'Trans', 'UndergradMajor', 'WebframeDesireNextYear', 'WebframeWorkedWith', 'WelcomeChange', 'WorkWeekHrs', 'YearsCode', 'YearsCodePro']


# 👨‍‍🔧 **Preprocessing**



## 🔬👓 Exploring Columns values

### 1️⃣ "MainBranch" 

In [16]:
# Distinct MainBranch answers, in order of first appearance
print(list(raw_df["MainBranch"].unique()))

['I am a developer by profession', 'I code primarily as a hobby', 'I used to be a developer by profession, but no longer am', 'I am not primarily a developer, but I write code sometimes as part of my work', 'I am a student who is learning to code', nan]


In [17]:
# Choose the MainBranch answer to inspect: keep exactly one VALUE_TEST
# assignment active and leave the other candidates commented out.
# VALUE_TEST = "I am a developer by profession"
# VALUE_TEST = "I code primarily as a hobby"
VALUE_TEST = "I used to be a developer by profession, but no longer am"
# VALUE_TEST = "I am not primarily a developer, but I write code sometimes as part of my work"
# VALUE_TEST = "I am a student who is learning to code"

# Show one randomly drawn respondent who gave that answer
raw_df.loc[raw_df["MainBranch"].eq(VALUE_TEST)].sample(n=1).iloc[0]

Respondent                                                                  26686
MainBranch                      I used to be a developer by profession, but no...
Hobbyist                                                                       No
Age                                                                          28.0
Age1stCode                                                                     15
CompFreq                                                                      NaN
CompTotal                                                                     NaN
ConvertedComp                                                                 NaN
Country                                                                      Iran
CurrencyDesc                                                                  NaN
CurrencySymbol                                                                NaN
DatabaseDesireNextYear                               Microsoft SQL Server;MongoDB
DatabaseWorkedWi

In [18]:
# Inspect the rows whose MainBranch equals VALUE_TEST.
# NOTE(review): .shape and .describe() are bare expressions that are not the
# last statement of the cell, so their values are computed and then dropped;
# only .info(), which prints its own report, produces visible output here.
raw_df[raw_df["MainBranch"]==VALUE_TEST].shape
raw_df[raw_df["MainBranch"]==VALUE_TEST].describe()
raw_df[raw_df["MainBranch"]==VALUE_TEST].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1183 entries, 4 to 64156
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Respondent                    1183 non-null   int64  
 1   MainBranch                    1183 non-null   object 
 2   Hobbyist                      1183 non-null   object 
 3   Age                           751 non-null    float64
 4   Age1stCode                    994 non-null    object 
 5   CompFreq                      0 non-null      object 
 6   CompTotal                     0 non-null      float64
 7   ConvertedComp                 0 non-null      float64
 8   Country                       1168 non-null   object 
 9   CurrencyDesc                  0 non-null      object 
 10  CurrencySymbol                0 non-null      object 
 11  DatabaseDesireNextYear        722 non-null    object 
 12  DatabaseWorkedWith            840 non-null    object 
 13  De

In [19]:
# Average Age over the respondents whose MainBranch equals VALUE_TEST
print(
    f"The mean age of coresponding that has '{VALUE_TEST}' is :",
    raw_df.loc[raw_df["MainBranch"] == VALUE_TEST, "Age"].mean(),
)

The mean age of coresponding that has 'I used to be a developer by profession, but no longer am' is : 39.86817576564581


In [20]:
# Display the raw frame (pandas abbreviates the rendering with "..." rows/columns).
raw_df

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,64858,,Yes,,16,,,,United States,,...,,,,"Computer science, computer engineering, or sof...",,,,,10,Less than 1 year
64457,64867,,Yes,,,,,,Morocco,,...,,,,,,,,,,
64458,64898,,Yes,,,,,,Viet Nam,,...,,,,,,,,,,
64459,64925,,Yes,,,,,,Poland,,...,,,,,Angular;Angular.js;React.js,,,,,
