# Data Preprocessing

In [33]:
# Constants
DATA_PATH   = "../Data/Raw/survey_results_public2022.csv"
EXPORT_PATH = "../Data/Processed/2_preprocessed_df.pkl"

ROLE_COL = ['DevType']
CORE_COLS = [
             'VersionControlSystem',  
             'LanguageHaveWorkedWith',  'LanguageWantToWorkWith',  
             'DatabaseHaveWorkedWith',  'DatabaseWantToWorkWith',  
             'PlatformHaveWorkedWith',  'PlatformWantToWorkWith',  
             'WebframeHaveWorkedWith',  'WebframeWantToWorkWith',  
             'MiscTechHaveWorkedWith','MiscTechWantToWorkWith',  
             'ToolsTechHaveWorkedWith','ToolsTechWantToWorkWith',                                    
             'NEWCollabToolsHaveWorkedWith',  'NEWCollabToolsWantToWorkWith'
]

USEFUL_COLS= ['Employment','RemoteWork',
              'MainBranch','CodingActivities','ProfessionalTech',
              'LearnCode', 'LearnCodeOnline', 'LearnCodeCoursesCert',
              'WorkExp', 'YearsCode', 'YearsCodePro', 'EdLevel',
              'OrgSize', 'Country',
              'ConvertedCompYearly', 'Currency', 'CompTotal', 'CompFreq']

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
from pathlib import Path

pd.options.display.max_rows = 10000
pd.options.display.max_columns = 10000

______

### Functions

In [3]:
# Create a Folder named Images to save figures in.
IMAGES_PATH = Path.cwd().parent / "Images"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """
    This functions will save the current figure shown below.
    
    Args:
        fig_id: String Containing the name of the figure.
        tight_layout: Boolean to decide whether you want a tight layout or not.
        fig_extension: String to decide the type of the figure.
        resoultion: Int to decide the resolution of the figure.
        
    Returns:
        None
    """
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    
    if tight_layout:
        plt.tight_layout()
        
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [4]:
def print_unique_values(df, columns):
    """
    Print the unique values for each categorical column and there count in the DataFrame.

    Args:
        df (DataFrame): DataFrame containing categorical columns.
        columns (list): List of column names to loop through.

    Returns:
        None
    """
    for col in columns:
        value_counts = df[col].value_counts().head(5)
        unique_count = len(df[col].unique())
        print(f"Unique values of {col}:\nNo. of Unique values: {unique_count}\n{value_counts}' \n")

In [5]:
def replace_values(df, cols, key_value, type):
    """
    Replace the values of a certain column to a desired value and set the column to the desired type
    
    Args:
        df (DataFrame): DataFrame containing columns you want to replace.
        cols (list): List of column names to loop through.
        key_value (dict): Dictionary contains the original value and new value {'original_value' : 'new_value'}
        type : The desired type of column type = np.int32
        
    Returns:
        None
        
    """
    # loop through each column and replace the key_value pair
    for col in cols:
        df[col] = df[col].replace(key_value).astype(type)

In [6]:
def split_values(df, cols= [], delimiter=','):
    """ 
    Split multiple values in a single string 
    to a list of single strings each represnting a single value. 

    Args:
    df (Dataframe): Dataframe contains desired columns
    delimiter (string): the delimiter to split on ex: ':' 

    Returns:
        None
    """
    #select only object columns
    cols = df[cols].select_dtypes(include='object').columns.tolist()
    
    # loop through object columns and convert to list on the decided delimiter
    for col in cols:
        df[col] = df[col].str.split(delimiter)

In [7]:
def combine_unique_values(df, cols_1, cols_2 , combined_cols):
    """
    Combine related Columns and return unique values
    
    Args:
    df (Dataframe): Dataframe contains desired columns
    cols_1, cols_2 (List): Columns need to be combined 
    combined_cols (list): The new combined Columns
    """
    # change the NAN values to [] to avoid errors
    df[cols_1 + cols_2] = df[cols_1 + cols_2].applymap(lambda x: x if isinstance(x, list) else [])
    
    #loop through the cols
    for i in range(len(cols_1)):
        # create a lambda function to combine each row and return unique values
        combine_lists_unique = lambda row: list(set(row[cols_1[i]] + row[cols_2[i]]))

        # implement the lambda function using apply method and return the new combined column
        df[combined_cols[i]] = df.apply(combine_lists_unique, axis=1)

______

In [8]:
# Load dataset and make a copy
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.copy()

In [9]:
len(raw_df.columns)

79

### Remove Non-related Features

In [10]:
# Remove non_related Features
df = df[ROLE_COL + CORE_COLS + USEFUL_COLS]

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   DevType                       61302 non-null  object 
 1   VersionControlSystem          71379 non-null  object 
 2   LanguageHaveWorkedWith        70975 non-null  object 
 3   LanguageWantToWorkWith        67027 non-null  object 
 4   DatabaseHaveWorkedWith        60121 non-null  object 
 5   DatabaseWantToWorkWith        51014 non-null  object 
 6   PlatformHaveWorkedWith        49924 non-null  object 
 7   PlatformWantToWorkWith        40415 non-null  object 
 8   WebframeHaveWorkedWith        53544 non-null  object 
 9   WebframeWantToWorkWith        46122 non-null  object 
 10  MiscTechHaveWorkedWith        44992 non-null  object 
 11  MiscTechWantToWorkWith        36810 non-null  object 
 12  ToolsTechHaveWorkedWith       54171 non-null  object 
 13  T

### Check for Duplictes

In [12]:
# Check for duplicates
raw_df.duplicated().value_counts()

False    73268
dtype: int64

- **The Dataset contains no duplicate values.**

### Replace Values and Parse

In [13]:
# Replace Text Values in YearsCode and YearsCodePro to numerical and set type to float
cols = ['YearsCode','YearsCodePro']
REPLACE_YEARS_TEXT = {'Less than 1 year': 0, 'More than 50 years': 51}

replace_values(df, cols, REPLACE_YEARS_TEXT, np.float32)

In [14]:
# Verify your Results
for col in cols: 
    print(col)
    print(df[col].unique().tolist())
    print('--------------------------')
    print()

YearsCode
[nan, 14.0, 20.0, 8.0, 15.0, 3.0, 1.0, 6.0, 37.0, 5.0, 12.0, 22.0, 11.0, 4.0, 7.0, 13.0, 36.0, 2.0, 25.0, 10.0, 40.0, 16.0, 27.0, 24.0, 19.0, 9.0, 17.0, 18.0, 26.0, 51.0, 29.0, 30.0, 32.0, 0.0, 48.0, 45.0, 38.0, 39.0, 28.0, 23.0, 43.0, 21.0, 41.0, 35.0, 50.0, 33.0, 31.0, 34.0, 46.0, 44.0, 42.0, 47.0, 49.0]
--------------------------

YearsCodePro
[nan, 5.0, 17.0, 3.0, 6.0, 30.0, 2.0, 10.0, 15.0, 4.0, 22.0, 20.0, 40.0, 9.0, 14.0, 21.0, 7.0, 18.0, 25.0, 8.0, 12.0, 45.0, 1.0, 19.0, 28.0, 24.0, 11.0, 23.0, 0.0, 32.0, 27.0, 16.0, 44.0, 26.0, 37.0, 46.0, 13.0, 31.0, 39.0, 34.0, 38.0, 35.0, 29.0, 42.0, 36.0, 33.0, 43.0, 41.0, 48.0, 50.0, 51.0, 47.0, 49.0]
--------------------------



### Split Multiple Values Features

In [15]:
# Split multiple answers in object columns using ';' delimiter
cols = ROLE_COL + CORE_COLS
split_values(df, cols, delimiter=';')

In [16]:
# Verify your Results
i = df.sample(1).index[0]
print(raw_df['DevType'].iloc[i])
print(df['DevType'].iloc[i])

print(raw_df['LanguageHaveWorkedWith'].iloc[i])
print(df['LanguageHaveWorkedWith'].iloc[i])

Developer, desktop or enterprise applications
['Developer, desktop or enterprise applications']
Assembly;C;C#;C++;Delphi;SQL
['Assembly', 'C', 'C#', 'C++', 'Delphi', 'SQL']


### Combine Related Features

In [17]:
# Create two lists of columns that will be combined.
cols_1 = ['LanguageHaveWorkedWith',
          'DatabaseHaveWorkedWith',
          'PlatformHaveWorkedWith',
          'WebframeHaveWorkedWith',
          'MiscTechHaveWorkedWith',
          'ToolsTechHaveWorkedWith',                               
          'NEWCollabToolsHaveWorkedWith']

cols_2 = ['LanguageWantToWorkWith', 
          'DatabaseWantToWorkWith',  
          'PlatformWantToWorkWith',  
          'WebframeWantToWorkWith',  
          'MiscTechWantToWorkWith',  
          'ToolsTechWantToWorkWith',     
          'NEWCollabToolsWantToWorkWith']

# create a list containing the new names of the combined column.
combined_cols = ['Languages','Databases','Platforms','WebFrameworks','MiscTech','ToolsTech','CollabTools']

combine_unique_values(df,cols_1,cols_2, combined_cols)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   DevType                       61302 non-null  object 
 1   VersionControlSystem          71379 non-null  object 
 2   LanguageHaveWorkedWith        73268 non-null  object 
 3   LanguageWantToWorkWith        73268 non-null  object 
 4   DatabaseHaveWorkedWith        73268 non-null  object 
 5   DatabaseWantToWorkWith        73268 non-null  object 
 6   PlatformHaveWorkedWith        73268 non-null  object 
 7   PlatformWantToWorkWith        73268 non-null  object 
 8   WebframeHaveWorkedWith        73268 non-null  object 
 9   WebframeWantToWorkWith        73268 non-null  object 
 10  MiscTechHaveWorkedWith        73268 non-null  object 
 11  MiscTechWantToWorkWith        73268 non-null  object 
 12  ToolsTechHaveWorkedWith       73268 non-null  object 
 13  T

### Transform DevType column `labels`
1. Remove the rows with missing values `NAN`
2. One_hot encode the `DevType` column
3. Return every job name and its no. of appearances
3. identify non-related jobs `non_tech`
4. Remove the columns of non-related jobs
5. Remove rows where all tech_jobs = 0

In [19]:
# Drop the rows with missing values in DevType Column
df.dropna(subset=['DevType'], inplace= True)

- **11966 Rows Removed**

In [20]:
# Convert DevType column Using MultMultiLabelBinarizer instead of OneOneHotEncoder because we are dealing with iterable of iterables
# Also it is alot faster
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
#fit_Transform the DevType column
mlb_DevType = mlb.fit_transform(df['DevType'])


# temp_df1 = pd.get_dummies(temp_df.apply(pd.Series).stack(), prefix='DevType').groupby(level=0).max()

In [21]:
# Transform the dense matrix to a dataframe
df_mlb_DevType = pd.DataFrame(mlb_DevType, columns= mlb.classes_, index = df.index)

In [22]:
# merge the original dataset with the one_hot encoded DevType column
encoded_df = pd.concat([df_mlb_DevType,df],axis = 1)

In [23]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61302 entries, 2 to 73267
Data columns (total 70 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Academic researcher                            61302 non-null  int32  
 1   Blockchain                                     61302 non-null  int32  
 2   Cloud infrastructure engineer                  61302 non-null  int32  
 3   Data or business analyst                       61302 non-null  int32  
 4   Data scientist or machine learning specialist  61302 non-null  int32  
 5   Database administrator                         61302 non-null  int32  
 6   Designer                                       61302 non-null  int32  
 7   DevOps specialist                              61302 non-null  int32  
 8   Developer, QA or test                          61302 non-null  int32  
 9   Developer, back-end                            613

In [24]:
# Return every job name and its no. of appearances
dev_type_jobs = mlb.classes_.tolist()
encoded_df[dev_type_jobs].sum(axis = 0, numeric_only=True)

Academic researcher                               2709
Blockchain                                        1302
Cloud infrastructure engineer                     5283
Data or business analyst                          3201
Data scientist or machine learning specialist     3424
Database administrator                            4934
Designer                                          3764
DevOps specialist                                 6170
Developer, QA or test                             3096
Developer, back-end                              26595
Developer, desktop or enterprise applications     9546
Developer, embedded applications or devices       3923
Developer, front-end                             15915
Developer, full-stack                            28701
Developer, game or graphics                       1837
Developer, mobile                                 7634
Educator                                          2090
Engineer, data                                    3600
Engineer, 

- **Identify non-related jobs**
    - **`Designer`, `Educator`, `Marketing or sales professional`, `Other (please specify):`,
    `Engineer, site reliability`, `Engineering manager`, `Product manager`,
    `Project manager`, `Senior Executive (C-Suite, VP, etc.)`, `Student`**

In [25]:
non_related_jobs = [
    'Designer', 
    'Educator', 
    'Marketing or sales professional',
    'Other (please specify):',
    'Engineer, site reliability',
    'Engineering manager',
    'Product manager',
    'Project manager',
    'Senior Executive (C-Suite, VP, etc.)',
    'Student']

In [26]:
# Drop Columns of non_related jobs
encoded_df = encoded_df.drop(columns=non_related_jobs, axis = 1)

In [27]:
# Create a list of tech_jobs columns names
Tech_Jobs = list(set(dev_type_jobs) - set(non_related_jobs))

In [28]:
# Remove Rows where all Tech_Jobs = 0
encoded_df = encoded_df[~encoded_df[Tech_Jobs].eq(0).all(axis=1)]

### Transform the CORE_COLS columns `skills`
- Deal with missing values in `VersionControlSystem`
- One_hot encode the CORE_COLS columns
- Remove useless features in `VersionControlSystems`

In [40]:
# Change the NAN values to []
encoded_df['VersionControlSystem'] = encoded_df['VersionControlSystem'].apply(lambda x: x if isinstance(x, list) else [])

CORE_COLS = ['VersionControlSystem'] + combined_cols

In [42]:
# Convert the skills columns using multilabelbinarizer
prep_df = {}
prep_df['Techjobs'] = encoded_df[Tech_Jobs]
for col in CORE_COLS:
    binarizer = MultiLabelBinarizer()
    temp_df = pd.DataFrame(binarizer.fit_transform(encoded_df[col]),
                               columns=binarizer.classes_,
                               index=encoded_df[col].index)
    prep_df[col] = temp_df

In [43]:
prep_df = pd.concat(prep_df, axis=1)

In [46]:
# Remove 'I don't use one', `Other (please specify):` Columns
prep_df= prep_df.drop(["I don't use one","Other (please specify):"], axis=1, level=1)

In [47]:
prep_df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1
7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0
8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0
73265,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
73266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


### Export Data

In [48]:
# Export the data with pickle
prep_df.to_pickle(EXPORT_PATH)