# Data Preprocessing

In [11]:
# Notebook-wide configuration: input/output locations and value replacements.
DATA_PATH = "../Data/Raw/survey_results_public2022.csv"          # raw survey CSV that gets loaded
EXPORT_PATH = "../Data/Processed/1_preprocessed_df.pkl"          # presumably where the processed frame is pickled — confirm

# Per-column mapping of the two free-text answers to numbers, so the
# "years" columns can be treated as numeric. Both columns share the same
# mapping; each column gets its own dict object.
REPLACE_YEARS_TEXT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCodePro', 'YearsCode')
}

In [5]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
from pathlib import Path

______

### Functions

In [9]:
# Figures go into an "Images" directory located in the parent of the current
# working directory; create it (and any missing parents) if it is not there yet.
IMAGES_PATH = Path.cwd().parent.joinpath("Images")
IMAGES_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """
    Save the current pyplot figure into the Images folder.

    The output file is ``IMAGES_PATH / "<fig_id>.<fig_extension>"``.

    NOTE(review): this function uses the module-level names ``IMAGES_PATH`` and
    ``plt``. ``plt`` is not imported in the visible import cell — presumably it
    is ``matplotlib.pyplot``; confirm it is imported before the first call.

    Args:
        fig_id: String used as the file name (without extension).
        tight_layout: Boolean; when true, ``plt.tight_layout()`` is called before saving.
        fig_extension: String used both as the file extension and the output format.
        resolution: Int passed to ``plt.savefig`` as ``dpi``.

    Returns:
        None
    """
    file_name = f"{fig_id}.{fig_extension}"
    destination = IMAGES_PATH / file_name

    if tight_layout:
        plt.tight_layout()

    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
def print_unique_values(df, columns):
    """
    Print, for each given column, its number of unique values and its five
    most frequent values with their counts.

    Args:
        df (DataFrame): DataFrame containing the columns to inspect.
        columns (list): Column names to loop through.

    Returns:
        None

    Note:
        The unique count treats a missing value as a value of its own, whereas
        the frequency table comes from ``value_counts()``, which leaves missing
        values out by default. The two numbers can therefore disagree by one
        on columns that contain missing data.
    """
    for col in columns:
        # Only the top five entries are shown to keep the output short.
        value_counts = df[col].value_counts().head(5)
        # dropna=False keeps missing values in the count, like len(unique()).
        unique_count = df[col].nunique(dropna=False)
        print(f"Unique values of {col}:\nNo. of Unique values: {unique_count}\n{value_counts}\n")

______

In [7]:
# Read the survey CSV into raw_df, then take an independent (deep) copy as df.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df = raw_df.copy(deep=True)

In [16]:
# Tally fully duplicated rows: True marks a row that repeats an earlier one.
raw_df.duplicated(subset=None, keep="first").value_counts()

False    73268
dtype: int64

- **The dataset contains no duplicate rows.**

- **Changes that need to be made:**
    - **Change the text values in `YearsCode` and `YearsCodePro` to numerical values**
    - **The features that contain multiple answers in each row need to be split**
    - **The skills inside the features that end with `HaveWorkedWith` or `WantToWorkWith` need to be combined.**
    - **Return only the needed features**

- **Go through the survey schema to identify each feature and the question behind it, then judge its importance for our business case**
    - **Features that are not useful:**
        - `ResponseId`, `SurveyEase`, `SurveyLength`
        - `TrueFalse_1` to `TrueFalse_3`, `Frequency_1` to `Frequency_3`, `Knowledge_1` to `Knowledge_7`
        - `Onboarding`, `TimeSearching`, `TimeAnswering`, `ICorPM`, `TBranch`
        - `Trans`, `Sexuality`, `Ethnicity`, `Accessibility`, `MentalHealth`, `Age`, `Gender`, `Blockchain`    
        - `SOComm`, `NEWSOSites`, `SOVisitFreq`, `SOPartFreq`,`SOAccount`, `BuyNewTool`, `PurchaseInfluence`, 
        - `OfficeStackAsyncHaveWorkedWith`, `OfficeStackAsyncWantToWorkWith`, `OfficeStackSyncHaveWorkedWith`, `OfficeStackSyncWantToWorkWith`
        - `VCHostingPersonal use` , `VCHostingProfessional use`
        - `OpSysProfessional use`,  `OpSysPersonal use`,
        
        
    - **Features that might be useful:**
        - `Employment`,`RemoteWork`,
        - `MainBranch`,`CodingActivities`, `ProfessionalTech`,
        - `LearnCode`, `LearnCodeOnline`, `LearnCodeCoursesCert`
        - `WorkExp`, `YearsCode`, `YearsCodePro`, `EdLevel`,
        - `OrgSize`, `Country`,
        - `ConvertedCompYearly`, `Currency`, `CompTotal`, `CompFreq`
        
    - **Core features:** 
        - `DevType`
        - `VersionControlSystem`,  `VCInteraction`,  
        - `LanguageHaveWorkedWith`,  `LanguageWantToWorkWith`,  
        - `DatabaseHaveWorkedWith`,  `DatabaseWantToWorkWith`,  
        - `PlatformHaveWorkedWith`,  `PlatformWantToWorkWith`,  
        - `WebframeHaveWorkedWith`,  `WebframeWantToWorkWith`,  
        - `MiscTechHaveWorkedWith`,`MiscTechWantToWorkWith`,  
        - `ToolsTechHaveWorkedWith`,`ToolsTechWantToWorkWith`,                                    
        - `NEWCollabToolsHaveWorkedWith`,  `NEWCollabToolsWantToWorkWith`,  