In [61]:
# Configuration constants
# Locations of the raw survey export and of the preprocessed output
RAW_DATA_PATH = "../data/raw/survey_results_public.csv"
CLEANED_DATA_PATH = "../data/processed/processed_df.pkl"

# Survey columns kept for preprocessing
SELECTED_COLUMNS = [
    # Respondent profile
    'DevType',
    'EdLevel',
    'YearsCode',
    'YearsCodePro',
    # Technologies: "have worked with" / "want to work with" pairs
    'LanguageHaveWorkedWith',
    'LanguageWantToWorkWith',
    'DatabaseHaveWorkedWith',
    'DatabaseWantToWorkWith',
    'PlatformHaveWorkedWith',
    'PlatformWantToWorkWith',
    'WebframeHaveWorkedWith',
    'WebframeWantToWorkWith',
    'MiscTechHaveWorkedWith',
    'MiscTechWantToWorkWith',
    'ToolsTechHaveWorkedWith',
    'ToolsTechWantToWorkWith',
    # Working environment
    'OpSysProfessional use',
    'ProfessionalTech',
]

In [3]:
# importing needed libraries
import pandas as pd
import numpy as np

In [4]:
# Load the raw survey export, keeping only the columns of interest.
# `usecols` avoids parsing the unused columns; the trailing selection keeps
# the column order of SELECTED_COLUMNS (usecols alone follows the file order).
raw_df = pd.read_csv(RAW_DATA_PATH, usecols=SELECTED_COLUMNS)[SELECTED_COLUMNS]

In the previous notebook, we concluded which preprocessing steps are needed:

1- YearsCodePro, YearsCode:
    * Data type conversion (from object -> int).
    * Convert ['Less than 1 year', 'More than 50 years'] to suitable values.
    * Convert nan to a suitable value.

2- DevType:
    * Convert nan to a suitable value.
    * Remove rows whose value is in: ['Marketing or sales professional', 'Designer', 'Student', 'Other (please specify):']
    
3- Split multiple answers that are separated by ';' in skills- and languages-related columns.

#### Functions that we will need in the preprocessing:

In [45]:
def clean_date(df: pd.DataFrame, col: str) -> pd.Series:
    """
    Return a cleaned, numeric version of a "years" column without modifying `df`.

    The textual answers 'Less than 1 year' and 'More than 50 years' are mapped
    to 0 and 51, the column is converted to float, and missing values are
    replaced with the median of the converted column.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the column to be cleaned.
        It is only read; the cleaned values are returned, not written back.
    col (str): The name of the column to be cleaned.

    Returns:
    pd.Series: The cleaned column as a float Series (float rather than int,
        because the median used for filling can be fractional).

    Raises:
    ValueError: If the column contains a value that cannot be parsed as a number.
    """
    # Cast to object first so the integer replacements can be stored next to
    # the remaining string values regardless of the column's original dtype.
    years = df[col].astype(object).replace(
        {'Less than 1 year': 0, 'More than 50 years': 51}
    )

    # Parse the numeric strings, then normalise to float
    years = pd.to_numeric(years).astype(float)

    # Replace NaN values with the median of the parsed values
    return years.fillna(years.median())

def clean_target(df:pd.DataFrame,col:str)->pd.DataFrame:
    """
    Drop the rows of `df` whose value in `col` is missing or is one of the
    excluded categories:
    ['Marketing or sales professional', 'Designer', 'Student', 'Other (please specify):', nan]

    Parameters:
    df (pd.DataFrame): The DataFrame to filter.
    col (str): The name of the column whose values decide which rows are dropped.

    Returns:
    pd.DataFrame: The rows of `df` whose `col` value is not in the list above.
    """
    # Values that disqualify a row (NaN covers missing answers)
    unwanted = [
        'Marketing or sales professional',
        'Designer',
        'Student',
        'Other (please specify):',
        np.nan,
    ]
    keep_mask = ~df[col].isin(unwanted)
    return df[keep_mask]
    
def split_multiple_answers(df:pd.DataFrame,splitter:str=';')->pd.DataFrame:
    """
    Split delimiter-separated answers into lists, column by column.

    Every object-dtype column in which at least one value contains `splitter`
    has all of its string values replaced by the list of their parts.
    Columns without the delimiter, and missing values, are left unchanged.
    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the columns to be split.
    splitter (str): The delimiter used to separate the values in the columns.
        It is matched literally, not as a regular expression. Default is ';'

    Returns:
    pd.DataFrame: A copy of `df` with the multi-answer columns converted to lists.
    """
    # Work on a copy so the caller's frame is never mutated
    result = df.copy()

    for col in result.select_dtypes(include=['object']):
        answers = result[col]
        # regex=False: characters such as '|' or '.' are treated as plain text.
        # na=False: missing / non-string values count as "no delimiter".
        has_delimiter = answers.str.contains(splitter, regex=False, na=False)
        if has_delimiter.any():
            result[col] = answers.str.split(splitter, expand=False, regex=False)

    return result


## Preprocessing Years related columns

In [33]:
# Replace both experience columns with their cleaned, numeric versions
raw_df['YearsCode'] = clean_date(df=raw_df, col='YearsCode')
raw_df['YearsCodePro'] = clean_date(df=raw_df, col='YearsCodePro')


## Preprocessing target column

In [34]:
# Keep only the rows whose DevType is present and not in the excluded roles
df = clean_target(df=raw_df, col='DevType')

In [35]:
# (rows, columns) of the frame after filtering on DevType
df.shape

(71366, 18)

In [36]:
# Distinct DevType values that remain after filtering
df['DevType'].unique()

array(['Senior Executive (C-Suite, VP, etc.)', 'Developer, back-end',
       'Developer, front-end', 'Developer, full-stack',
       'System administrator',
       'Developer, desktop or enterprise applications',
       'Developer, QA or test',
       'Data scientist or machine learning specialist',
       'Data or business analyst', 'Security professional', 'Educator',
       'Research & Development role', 'Developer, mobile',
       'Database administrator',
       'Developer, embedded applications or devices', 'Engineer, data',
       'Hardware Engineer', 'Product manager', 'Academic researcher',
       'Developer, game or graphics', 'Cloud infrastructure engineer',
       'Engineering manager', 'Developer Experience', 'Project manager',
       'DevOps specialist', 'Engineer, site reliability', 'Blockchain',
       'Developer Advocate', 'Scientist'], dtype=object)

## Preprocessing skills and languages related columns

In [58]:
# Turn delimiter-separated answers into lists; a copy is passed so the
# frame produced by the previous step is left untouched.
df = split_multiple_answers(df=df.copy())

## Visually Verifying Preprocessed DataFrame

In [63]:
# Show 5 randomly chosen rows (no random_state is set, so the rows differ between runs)
df.sample(5)


Unnamed: 0,DevType,EdLevel,YearsCode,YearsCodePro,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,PlatformHaveWorkedWith,PlatformWantToWorkWith,WebframeHaveWorkedWith,WebframeWantToWorkWith,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,OpSysProfessional use,ProfessionalTech
47045,"Developer, full-stack","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",3.0,3.0,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[HTML/CSS, Java, Python, Ruby, SQL]",[PostgreSQL],"[Dynamodb, Firebase Realtime Database, Postgre...","[Amazon Web Services (AWS), Digital Ocean, Ren...","[Amazon Web Services (AWS), Digital Ocean, Fly...","[jQuery, Vue.js, WordPress]","[Django, Express, FastAPI, Flask, jQuery, Nest...",,,"[APT, Chocolatey, Make]","[APT, Chocolatey, Docker, Homebrew, Make]","[Android, MacOS, Ubuntu, Windows, Windows Subs...",
16202,"Developer, full-stack","Secondary school (e.g. American high school, G...",6.0,3.0,"[Bash/Shell (all shells), HTML/CSS, JavaScript...","[Elixir, Go, Python, Rust, SQL, TypeScript]","[BigQuery, MySQL, PostgreSQL, Redis, SQLite]","[Cassandra, Elasticsearch, MongoDB, Neo4J, Pos...","[Amazon Web Services (AWS), Cloudflare, Fireba...","[Amazon Web Services (AWS), Cloudflare]","[Express, Node.js, React, Svelte]","[Express, Fastify, Next.js, Node.js, Phoenix, ...",[React Native],"[Apache Kafka, Capacitor, Electron, Hugging Fa...","[Docker, Gradle, Homebrew, npm, Pip, Podman, V...","[Ansible, Bun, Cargo, Docker, Kubernetes, npm,...","[MacOS, Ubuntu]","[DevOps function, Automated testing, AI-assist..."
18803,"Developer, full-stack","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5.0,3.0,"[HTML/CSS, JavaScript]","[HTML/CSS, JavaScript, Solidity]",[MongoDB],"[Dynamodb, MongoDB]","[Amazon Web Services (AWS), Cloudflare, Fireba...","[Amazon Web Services (AWS), Cloudflare, Fireba...","[Express, jQuery, Node.js, React]","[Express, Node.js, React]",,,"[Ant, npm, Yarn]","[Ant, npm]","[Android, Windows]",[None of these]
83779,"Developer, full-stack","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",9.0,6.0,"[C#, JavaScript, PowerShell, SQL, TypeScript]","[C#, Crystal, JavaScript, Ruby, SQL, TypeScript]","[Clickhouse, Microsoft SQL Server, PostgreSQL,...","[Clickhouse, PostgreSQL]",[Amazon Web Services (AWS)],[Amazon Web Services (AWS)],"[ASP.NET CORE, Express, jQuery, React]","[ASP.NET CORE, React, Ruby on Rails]",[.NET (5+) ],[.NET (5+) ],"[Chocolatey, Docker, Homebrew, MSBuild, npm, N...","[Docker, Homebrew, npm, NuGet, Vite, Webpack]","[iPadOS, Windows, Windows Subsystem for Linux ...","[DevOps function, Automated testing, Developer..."
29789,"Developer, back-end","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",43.0,34.0,"[Java, SQL]","[Java, SQL]","[IBM DB2, Microsoft SQL Server]","[IBM DB2, Microsoft SQL Server]",,,,,,,[Maven (build tool)],[Maven (build tool)],"[Windows, Other (Please Specify):]",[None of these]


## Exporting Preprocessed DataFrame in Pickle format

In [62]:

# Write the preprocessed frame to the location declared in the constants cell,
# rather than repeating the path literal here.
df.to_pickle(CLEANED_DATA_PATH)