# Data Preprocessing
This Notebook processes the Data from Danmarks Statistics and DTU's TU Data set.
We have 5 different data sets that are cleaned, categorised and eventually merged into one data frame.

In [1]:
### Basic module import
import os           # Working directory
import pandas as pd # Data processing
import numpy as np  # Scientific computing/matrix algebra
import matplotlib.pyplot as plt # Common graphing interface (check also plotly and plotnine)

In [2]:
### Support functions

def count_nan_values(dataframe):
    nan_count_df = pd.DataFrame([(column, dataframe[column].isna().sum()) for column in dataframe.columns], columns=['Column', 'NaN Count'])
    nan_count_df = nan_count_df.loc[(nan_count_df['NaN Count'] > 0)].sort_values('NaN Count')

    with pd.option_context('display.max_rows', None):
        print(nan_count_df.reset_index(inplace=False, drop=True))


def process_data_frame(file_name, categorical_cols = [], numerical_cols_float = [], numerical_cols_int = [], character_cols =[]):
    # Read CSV file
    df = pd.read_csv(file_name, sep=',', engine='python')

    # Set categorical columns
    df[categorical_cols] = df[categorical_cols].astype('category')

    # Set numerical columns of float type
    df[numerical_cols_float] = df[numerical_cols_float].replace(
        np.nan, -1).astype('float64')

    # Set numerical columns of integer type
    df[numerical_cols_int] = df[numerical_cols_int].replace(
        np.nan, -1).astype('int64')

    # Set character columns
    df[character_cols] = df[character_cols].astype('category')

    # Drop unnamed columns
    df.drop(df.columns[df.columns.str.contains(
        'unnamed', case=False)], axis=1, inplace=True)

    return df


## Reading the data

1. job.csv
2. pop.csv
3. commuter_codes.csv
4. commuter_values.csv
5. session.csv

#### Define data location

Due to the fact that we are handleing sensible data, we need to ensure that any data stays on DTU's servers. 

In [3]:
os.getcwd()

'/Users/luis/MScPoPSyn/PopSyn/Data_Processing'

In [4]:
# %cd ~/snap/snapd-desktop-integration/83/Documents/Thesis/data

os.chdir('/Users/luis/Desktop/Data_extracted/')

### Job Data

- Load 'job.csv'
- Set attribute types to numerical and categorical values
- extract *job municipalities*

In [5]:
# Defining the 'categorical' and 'numerical' lists and changing the datatypes accordingly
job_categorical = ['Municipality', 'AgeGroup', 'Gender', 'Sector', 'Socio']
# job_float = []
job_int = ['Val', 'Year']
# job_character = []

job_df = process_data_frame(
    'job.csv', categorical_cols=job_categorical, numerical_cols_int=job_int)

# Renaming categories from Maend and Kvinder (Man and Women) to 1 and 2.
job_df['Gender'] = job_df.Gender.cat.rename_categories({'Men': '1', 'Women': '2'})


# Saving this set for further investigation
job_mun = set(job_df['Municipality'])

In [6]:
job_df.head()

Unnamed: 0,Year,Gender,AgeGroup,Socio,Sector,Municipality,Val
0,2017,2,45-49 years,Self-employed,CM Manufacture of funiture and other manufactu...,Odder,1
1,2017,1,45-49 years,Self-employed,CM Manufacture of funiture and other manufactu...,Odder,3
2,2017,1,50-54 years,Self-employed,CM Manufacture of funiture and other manufactu...,Odder,2
3,2017,1,55-59 years,Self-employed,CM Manufacture of funiture and other manufactu...,Odder,1
4,2017,1,67 years and over,Self-employed,CM Manufacture of funiture and other manufactu...,Odder,1


### Commuter Data

- load 'commuter_codes.csv' and 'commuter_values.csv'
- Set attribute types to numerical and categorical values
- extract *workplace names*

In [7]:
cm_categorical = ['Gender', 'Residence', 'Work']
# cm_float = []
cm_int = ['Val', 'Year'] 
# cm_character = []

cm_df = process_data_frame('commuter_codes.csv', categorical_cols = cm_categorical, numerical_cols_int = cm_int, )
cm_df_val = process_data_frame('commuter_values.csv',  categorical_cols = cm_categorical, numerical_cols_int = cm_int, )


# Renaming categories from Maend and Kvinder (Man and Women) to 1 and 2.
cm_df['Gender'] = cm_df.Gender.cat.rename_categories({'M': '1', 'K': '2'})
cm_df_val['Gender'] = cm_df_val.Gender.cat.rename_categories({'Men': '1', 'Women': '2'})


### Merge the data frames and create commuter database

# Standard merge is based on set index.
cm_df_tot = pd.merge(cm_df, cm_df_val, left_index=True,
                     right_index=True, suffixes=('_c', '_v'))

# Convert the residence code for further work
cm_df_tot['Residence_c'] = cm_df_tot['Residence_c'].astype('int64')
cm_df_tot['Work_c'] = cm_df_tot['Work_c'].astype('int64')

# Create workplace and residence code sets
workplace_codes = cm_df_tot[['Work_c', 'Work_v']].drop_duplicates()
residence_codes = cm_df_tot[['Residence_c', 'Residence_v']].drop_duplicates()

In [8]:
cm_df_tot.head()

Unnamed: 0,Year_c,Gender_c,Residence_c,Work_c,Val_c,Year_v,Gender_v,Residence_v,Work_v,Val_v
0,2018,1,270,2,242,2018,1,Gribskov,Province Københavns omegn,242
1,2018,2,306,2,42,2018,2,Odsherred,Province Københavns omegn,42
2,2018,1,306,2,93,2018,1,Odsherred,Province Københavns omegn,93
3,2018,2,316,2,260,2018,2,Holbæk,Province Københavns omegn,260
4,2018,1,316,2,390,2018,1,Holbæk,Province Københavns omegn,390


### Population Data
- load 'pop.csv'
- Set attribute types to numerical and categorical values
- extract job municipalities

In [9]:
pop_categorical = ['Municipality', 'PopSocio', 'Sector', 'AgeGroup', 'Gender', 'edu']
# pop_float = []
pop_int = ['Year', 'Val']
# pop_character = []

pop_df = process_data_frame(
    'pop.csv', categorical_cols = pop_categorical, numerical_cols_int = pop_int)

# Renaming categories from Maend and Kvinder (Man and Women) to 1 and 2.
pop_df['Gender'] = pop_df.Gender.cat.rename_categories({'Men': '1', 'Women': '2'});

In [10]:
pop_df.head()

Unnamed: 0,Year,Gender,AgeGroup,Sector,PopSocio,edu,Municipality,Val
0,2015,2,50-54 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,3
1,2015,1,50-54 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,20
2,2015,2,55-59 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,2
3,2015,1,55-59 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,12
4,2015,1,60-64 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,3


In [11]:

pop_df = pd.merge(pop_df, workplace_codes, left_on='Municipality', right_on='Work_v')
pop_df[['edu_c', 'edu_v']] = pd.DataFrame(pop_df['edu'].str.split(" ", n=1, expand=True).astype('category'))
pop_df['PopSocio_c'] = pop_df.PopSocio.cat.rename_categories({'Enrolled in education': '0', 'Employed': '1', 'Unemployed':'2', 'Outside the labour force':'3'})


In [12]:
pop_df.head()

Unnamed: 0,Year,Gender,AgeGroup,Sector,PopSocio,edu,Municipality,Val,Work_c,Work_v,edu_c,edu_v,PopSocio_c
0,2015,2,50-54 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,3,81,Region Nordjylland,H70,Masters programs,1
1,2015,1,50-54 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,20,81,Region Nordjylland,H70,Masters programs,1
2,2015,2,55-59 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,2,81,Region Nordjylland,H70,Masters programs,1
3,2015,1,55-59 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,12,81,Region Nordjylland,H70,Masters programs,1
4,2015,1,60-64 years,Manufacture of machinery,Employed,H70 Masters programs,Region Nordjylland,3,81,Region Nordjylland,H70,Masters programs,1


### TU Data
- load 'session.csv', 'bil.csv' and 'household.csv'
- Set attribute types to numerical and categorical values


In [13]:
session_categorical = ['DayPrimTargetMuncode', 'DayPrimTargetPurp', 'DayStartCityCode', 'DayStartJourneyRole', 'DayStartMuncode', 'DayStartPurp', 
               'DiaryDaytype', 'DiaryMonth', 'DiaryWeekday', 'Handicap', 'HomeAdrCityCode', 'HomeAdrMunCode', 'HomeParkPoss', 
               'HousehAccomodation', 'HousehAccOwnorRent', 'HwDaysReason', 'InterviewType', 'JstartMuncode', 'JstartNUTS', 
               'JstartType', 'ModeChainTypeDay', 'NuclFamType', 'PosInFamily', 'PrimModeDay', 'PrimOccMuncode', 'RespEdulevel', 
               'RespHasBicycle', 'ResphasDrivlic', 'RespHasRejsekort', 'RespHasSeasonticket', 'RespIsmemCarshare', 'RespNotripReason', 
               'RespPrimOcc', 'RespSex', 'SduMuncode', 'WorkHourType', 'WorkParkPoss', 'WorkPubPriv','DayStartFareZone', 'DayStartGMMzone', 'HomeAdrFareZone', 
               'HomeAdrGMMzone', 'JstartFareZone', 'JstartGMMzone', 'PrimOccFareZone', 'PrimOccGMMzone', 'SduGMMzone', 'HwDayspW', 'DayJourneyType']

session_float = ['DayNumJourneys', 'GISdistHW', 'HomeAdrDistNearestStation', 'JstartDistNearestStation', 'SessionWeight', 'TotalBicLen', 
                   'TotalFuelConsumpMJ', 'TotalGramCO2', 'TotalGramCO2eq', 'TotalLenExclComTrans', 'WeightOver6']

session_int = ['DiaryDate', 'DiaryYear', 'FamNumAdults', 'FamNumDrivLic', 'FamNumPers', 'FamNumPers1084', 'FamNumPersO6', 
                 'HomeAdrCitySize', 'HousehCarOwnership', 'HousehNumAdults', 'HousehNumcars', 'HousehNumDrivLic', 'HousehNumPers', 
                 'HousehNumPers1084', 'HousehNumPersO6',  'IncFamily', 'IncFamily2000', 'IncHouseh', 'IncHouseh2000', 
                 'IncNuclFamily', 'IncNuclFamily2000', 'IncRespondent', 'IncRespondent2000', 'IncSpouse', 'IncSpouse2000', 'kmarbud', 
                 'NightsAway', 'NuclFamNumAdults', 'NuclFamNumDrivLic', 'NuclFamNumPers', 'NuclFamNumPers1084', 'NuclFamNumPersO6', 
                 'NumTripsCorr', 'NumTripsExclComTrans', 'RespAgeCorrect', 'RespAgeSimple', 'RespDrivlicYear', 'RespYearBorn', 
                 'SessionId', 'TotalLen', 'TotalMin', 'TotalMinExclComTrans', 'TotalMotorLen', 'TotalMotorMin', 'TotalNumTrips', 
                 'WorkatHomeDayspM', 'WorkHoursPw']

session_characters = ['DayStartNUTS', 'HomeAdrNearestStation', 'HomeAdrNUTS', 'JstartNearestStation', 'PrimOccNUTS', 'PseudoYear', 'SduNUTS']


session_df = process_data_frame('session.csv', session_categorical, session_float, session_int, session_characters)




In [14]:
session_df.head()

Unnamed: 0,SessionId,InterviewType,DiaryDate,DiaryYear,PseudoYear,DiaryMonth,DiaryWeekday,DiaryDaytype,HomeAdrNUTS,HomeAdrMunCode,...,JstartMuncode,JstartGMMzone,JstartFareZone,JstartNearestStation,JstartDistNearestStation,DayJourneyType,DayPrimTargetMuncode,DayPrimTargetPurp,SessionWeight,WeightOver6
0,50023,0,13280,2006,2006/7,5,5,33,DK042,751,...,751.0,751525.0,,Torsøvej,1.0,1,751.0,1.0,259.315137,-1.0
1,50026,0,13280,2006,2006/7,5,5,33,DK041,657,...,657.0,657133.0,,Herning,1.4,11,779.0,41.0,1106.34382,-1.0
2,50027,0,13280,2006,2006/7,5,5,33,DK041,779,...,779.0,779154.0,,Skive,3.3,11,779.0,31.0,845.286787,-1.0
3,50028,0,13280,2006,2006/7,5,5,33,DK032,540,...,540.0,540131.0,,Sønderborg,7.7,11,540.0,41.0,759.260946,-1.0
4,50029,0,13280,2006,2006/7,5,5,33,DK031,410,...,410.0,410015.0,,Nørre Åby,2.6,11,410.0,41.0,864.993135,-1.0


### Household Cars table


In [15]:
bil_categorical = ['FuelType', 'NplateColour','CarOwnershipType']
# bil_float = []
bil_int = ['SessionId', 'bilnr', 'CarModelYear']
# bil_character = []


bil_df = process_data_frame('bil.csv', bil_categorical, numerical_cols_int = bil_int)


def groupby_latest_model_year(x):
    latest_indices = x['CarModelYear'].idxmax()  # Find the index with the latest CarModelYear
    latest_rows = x.loc[latest_indices]  # Get the rows corresponding to the latest indices
    return tuple(latest_rows[col] for col in ['CarModelYear', 'FuelType'])



# Group by the non-unique IDs and apply the sampling function
bil_df_sampled = bil_df.groupby('SessionId').apply(groupby_latest_model_year).apply(pd.Series)
bil_df_sampled.columns = ['CarModelYear', 'FuelType']

session_df = pd.merge(session_df, bil_df_sampled, on='SessionId', how='left')
session_df['CarModelYear'] = session_df['CarModelYear'].replace(np.nan, -1).astype('int64')
session_df['FuelType'] = session_df['FuelType'].astype('category')



In [16]:
bil_df_sampled.head()

Unnamed: 0_level_0,CarModelYear,FuelType
SessionId,Unnamed: 1_level_1,Unnamed: 2_level_1
50023,1998.0,
50026,1995.0,
50027,2006.0,
50028,1992.0,
50029,1992.0,


### Household Members table

In [17]:
household_categorical = ['Relation', 'PosInFamily', 'Gender', 'HasDrivLic']
# household_float = []
household_int = ['SessionId', 'medlnr', 'YearBorn', 'AgeSimple']
# household_character = []

household_df = process_data_frame('household.csv', categorical_cols = household_categorical,  numerical_cols_int = household_int)

# Count people between 4 and 15
count_4_to_15 = household_df.query('0 <= AgeSimple <= 15').groupby('SessionId').size().reset_index(name='KidsBetween0and15')

# Count people between 0 and 4
count_0_to_4 = household_df.query('0 <= AgeSimple <= 4').groupby('SessionId').size().reset_index(name='KidsBetween0and4')

# Merge counts and fill NaN values with 0
household_df_children = count_4_to_15.merge(count_0_to_4, on='SessionId', how='outer').fillna(0)

session_df = pd.merge(session_df, household_df_children,on='SessionId', how='left')
session_df[['KidsBetween0and15', 'KidsBetween0and4']] = session_df[['KidsBetween0and15', 'KidsBetween0and4']].replace(np.nan, 0).astype('int64')


In [18]:
tur_categorical = ['OrigNUTS', 'DestNUTS']
tur_float = ['TurId', 'TurNr', 'TripCount', 'DepartHH', 'DepartMM', 'DepartMSM', 
              'ArrivalHH', 'ArrivalMM', 'ArrivalMSM', 'DestDwelTime', 'OrigMuncode', 'OrigCityCode', 
              'OrigGMMzone', 'OrigFareZone', 'OrigDistNearestStation',  'DestCityCode', 
              'DestFareZone', 'DestDistNearestStation', 'OrigPurp', 'DestPurp', 'DestEscortPurp', 
              'ShopAmount', 'TripPurp', 'TripPurpGroup', 'SimplWorktour', 'SimplWorkNumstop', 'GISdist', 
              'NumModes', 'SumLen', 'SumMin', 'SumMotorLen', 'SumMotorMin', 'SumMJ', 'SumCO2', 'SumCO2eq', 
              'ModeChainType', 'PrimMode', 'PrimModeDrivPass', 'SecMode', 'PrimModeSumlen', 'SecModeSumlen', 
              'FirstMode', 'LastMode', 'PartyorAlone', 'PartyNumu10', 'PartyNum1017', 'PartyNumAdults', 
              'BicType', 'CarPassDriver', 'CarPassContext', 'CarCostShare', 'CarUsageCarNo', 'PtTicketType', 
              'PtPrice', 'PtBicType', 'PtPrimMode', 'PtNumBoardings', 'PtAccTime', 'PtFirstWaitTime', 
              'PtInvTime', 'PtChangeAndWaitTime', 'PtEgrTime', 'PtAccMode', 'PtEgrMode', 'PtAccLen', 'PtEgrLen', 
              'TrainMode', 'TrainAccMode', 'TrainEgrMode', 'TrainAccMin', 'TrainEgrMin', 'TrainAccLen', 
              'TrainEgrLen', 'TrainAccDist', 'TrainEgrDist', 'JourneyId', 'JourneyRole', 'GISdistJourneyStartP']
tur_int = ['SessionId', 'DestGMMzone', 'DestMuncode',]
tur_character = ['OrigNearestStation','DestNearestStation', 'FirstStation', 'LastStation']

tur_df = process_data_frame('tur.csv', tur_categorical, tur_float, tur_int, tur_character)
                  

In [19]:
### Merge Car and Household data to TU Data
session_df[['SessionId', 'HousehNumcars', 'HousehCarOwnership', 'CarModelYear', 'FuelType', 'IncFamily2000', 'IncRespondent2000', 'FamNumPers', 'FamNumAdults', 'FamNumPers1084','FamNumPersO6','KidsBetween0and15','KidsBetween0and4']].head()

Unnamed: 0,SessionId,HousehNumcars,HousehCarOwnership,CarModelYear,FuelType,IncFamily2000,IncRespondent2000,FamNumPers,FamNumAdults,FamNumPers1084,FamNumPersO6,KidsBetween0and15,KidsBetween0and4
0,50023,1,1,1998,,622,0,5,2,5,5,2,0
1,50026,1,1,1995,,350,136,2,2,2,2,0,0
2,50027,2,2,2006,,558,297,2,2,2,2,0,0
3,50028,2,2,1992,,475,200,4,3,4,4,0,0
4,50029,1,1,1992,,400,0,4,2,4,4,1,0


## Denmark Satatistics Data cleansing

- cleaning Municipality codes due to aggregation and total values.
- [Regions values can be dropped - Codes 082-085]
- [Provincesvalues can be dropped - Codes 1-11]
- [Outside Denmark can be dropped - Code 950]
- [All Denmark can be dropped - Code ]

In [20]:
print('Job Municipalities: ',job_df.Municipality.nunique())
print('PoP Municipalities: ',pop_df.Municipality.nunique())
print('Commuter Municipalities: ',cm_df_tot.Work_v.nunique())
print('TU Workplaces: ', session_df.PrimOccMuncode.nunique())
print('TU HomeAddress: ', session_df.HomeAdrMunCode.nunique())

Job Municipalities:  116
PoP Municipalities:  104
Commuter Municipalities:  115
TU Workplaces:  101
TU HomeAddress:  99


In [21]:
# List of values to be dropped for each category
regions_codes = ['82', '83', '84', '85']
provinces_codes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']
outside_denmark_code = '950'
all_denmark_code = 'X'

In [22]:
occ_cat = set(list(session_df.PrimOccMuncode.cat.categories))
hom_cat = set(list(session_df.HomeAdrMunCode.cat.categories))

print(f"Number of unique values in 'PrimOccMuncode': {occ_cat}")
print(f"Number of unique values in 'HomeAdrMunCode': {hom_cat}")

# Calculate and print differences between 'Work_c' and 'PrimOccMuncode' columns

diff_work = list(set(workplace_codes.Work_c) ^ occ_cat)
diff_work = [int(i) for i in diff_work]
print("Differences between 'Work_c' and 'PrimOccMunCode' columns:")
print([int(i) for i in diff_work])

work = workplace_codes.loc[~workplace_codes['Work_c'].isin(diff_work)]


# Calculate and print differences between 'Residence_c' and 'HomeAdrMunCode' columns

diff_residence = list(set(residence_codes.Residence_c) ^ hom_cat)
diff_residence = [int(i) for i in diff_residence]
print("Differences between 'Residence_c' and 'HomeAdrMunCode' columns:")
print([int(i) for i in diff_residence])

residence = residence_codes.loc[~residence_codes['Residence_c'].isin(diff_residence)]

Number of unique values in 'PrimOccMuncode': {530.0, 540.0, 550.0, 561.0, 563.0, 573.0, 575.0, 580.0, 607.0, 101.0, 615.0, 621.0, 630.0, 657.0, 147.0, 661.0, 151.0, 153.0, 665.0, 155.0, 157.0, 159.0, 671.0, 161.0, 163.0, 165.0, 167.0, 169.0, 173.0, 175.0, 183.0, 185.0, 187.0, 190.0, 706.0, 707.0, 710.0, 201.0, 210.0, 727.0, 217.0, 730.0, 219.0, 223.0, 740.0, 741.0, 230.0, 746.0, 751.0, 240.0, 756.0, 760.0, 250.0, 253.0, 766.0, 259.0, 260.0, 773.0, 265.0, 779.0, 269.0, 270.0, 787.0, 791.0, 810.0, 813.0, 306.0, 820.0, 825.0, 316.0, 320.0, 326.0, 840.0, 329.0, 330.0, 846.0, 336.0, 849.0, 851.0, 340.0, 860.0, 350.0, 360.0, 370.0, 376.0, 390.0, 400.0, 410.0, 411.0, 420.0, 430.0, 440.0, 450.0, 461.0, 479.0, 480.0, 482.0, 997.0, 999.0, 492.0, 510.0}
Number of unique values in 'HomeAdrMunCode': {530, 540, 550, 561, 563, 573, 575, 580, 607, 101, 615, 621, 630, 657, 147, 661, 151, 153, 665, 155, 157, 159, 671, 161, 163, 165, 167, 169, 173, 175, 183, 185, 187, 190, 706, 707, 710, 201, 210, 727, 2

In [23]:
session_df = session_df.drop(session_df[session_df.PrimOccMuncode.isin(diff_work) | session_df.HomeAdrMunCode.isin(diff_residence)].index)

## Check column values to prepare merge with pop, job and commute data



### Define functions

In [24]:
def count_and_drop_rows(df, column_name, condition_value):
    # Store the count of rows before the drop operation
    rows_before = len(df)

    # Drop rows based on the condition
    df.drop(df[df[column_name] == condition_value].index, inplace = True)

    # Calculate the count of rows after the drop operation
    rows_after = len(df)

    # Calculate how many rows have been dropped
    rows_dropped = rows_before - rows_after

    # Print the number of rows dropped
    print(f"{rows_dropped} rows have been dropped.")

    return df

In [25]:
def df_column_info(df, exclude_columns=None):
    for column in df.columns:
        if exclude_columns and column in exclude_columns:
            continue

        unique_values = sorted(df[column].unique().tolist())
        num_unique = len(unique_values)
        
        print(f"\nColumn: {column}")
        print(f"Number of Unique Values: {num_unique}")
        print(f"Unique Values: {unique_values}")

In [26]:
def remove_unused_categories(df, column_name):
    # Get the current categories
    current_categories = set(df[column_name].cat.categories)

    # Remove unused categories from the specified categorical column
    df[column_name].cat.remove_unused_categories()

    # Get the removed categories
    removed_categories = current_categories - set(df[column_name].cat.categories)

    # Print the removed categories and their count
    for category in removed_categories:
        print(f"Removed category '{category}'")

In [27]:
def add_new_categories(df, column_name, new_categories):
    for new_category in new_categories:
        if new_category not in df[column_name].cat.categories:
            df[column_name] = df[column_name].cat.add_categories(new_category)


### Pop Data

In [28]:
df_column_info(pop_df, 'Val')



Column: Year
Number of Unique Values: 14
Unique Values: [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]



Column: Gender
Number of Unique Values: 2
Unique Values: ['1', '2']

Column: AgeGroup
Number of Unique Values: 11
Unique Values: ['15-19 years', '20-24 years', '25-29 years', '30-34 years', '35-39 years', '40-44 years', '45-49 years', '50-54 years', '55-59 years', '60-64 years', '65-69 years']

Column: Sector
Number of Unique Values: 37
Unique Values: ['Accommodation and food service activities', 'Activity not stated', 'Advertising and other business services', 'Agriculture, forestry and fishing', 'Arts, entertainment and recreation activities', 'Basic metals and fabricated metal products', 'Construction', 'Consultancy etc.', 'Education', 'Electrical equipment', 'Electricity, gas, steam and air conditioning supply', 'Financial and insurance', 'Human health activities', 'IT and information service activities', 'Manufacture of chemicals', 'Manufacture of electronic components', 'Manufacture of food products, beverages and tobacco', 'Manufacture of funiture and other manufacturing', 'Man

In [29]:
# Get the unique values in the 'edu' column, convert them to a list
unique_edu_values = set(pop_df['edu'].unique().tolist())


pop_df['edu_c'] = pop_df['edu_c'].cat.add_categories('H99')
pop_df['edu_v']=pop_df['edu_v'].cat.add_categories('Long-term further education')
pop_df['edu']=pop_df['edu'].cat.add_categories('H99 Long-term further education')

# Update values in the 'edu_c' column where 'H70' or 'H80' is replaced with 'H99'
pop_df.loc[(pop_df['edu_c'] == 'H70') | (pop_df['edu_c'] == 'H80'), 'edu_c'] == 'H99'
pop_df.loc[(pop_df['edu'] == 'H70 Masters programs') | (pop_df['edu'] == 'H80 PhD programs'), 'edu'] = 'H99 Long-term further education'
pop_df.loc[(pop_df['edu_v'] == 'Masters programs') | (pop_df['edu_v'] == 'PhD programs'), 'edu_v'] = 'Long-term further education'

# Count and drop rows where 'edu' is 'H90 Not stated'
pop_df = count_and_drop_rows(pop_df, 'edu', 'H90 Not stated')

# Set variables as categories
pop_df[['PopSocio_c', 'edu', 'edu_c', 'edu_v']] = pop_df[['PopSocio_c', 'edu', 'edu_c', 'edu_v']].astype('category')

# Renaming categories from Maend and Kvinder (Man and Women) to 1 and 2.
pop_df['Gender'] = pop_df['Gender'].cat.rename_categories({'Men': '1', 'Women': '2'});

274701 rows have been dropped.


In [30]:
pop_df.head()

Unnamed: 0,Year,Gender,AgeGroup,Sector,PopSocio,edu,Municipality,Val,Work_c,Work_v,edu_c,edu_v,PopSocio_c
0,2015,2,50-54 years,Manufacture of machinery,Employed,H99 Long-term further education,Region Nordjylland,3,81,Region Nordjylland,H70,Long-term further education,1
1,2015,1,50-54 years,Manufacture of machinery,Employed,H99 Long-term further education,Region Nordjylland,20,81,Region Nordjylland,H70,Long-term further education,1
2,2015,2,55-59 years,Manufacture of machinery,Employed,H99 Long-term further education,Region Nordjylland,2,81,Region Nordjylland,H70,Long-term further education,1
3,2015,1,55-59 years,Manufacture of machinery,Employed,H99 Long-term further education,Region Nordjylland,12,81,Region Nordjylland,H70,Long-term further education,1
4,2015,1,60-64 years,Manufacture of machinery,Employed,H99 Long-term further education,Region Nordjylland,3,81,Region Nordjylland,H70,Long-term further education,1


### Job Data

In [31]:
# df_column_info(job_df, 'Val')

### Commuter Data

In [32]:
# df_column_info(cm_df_tot, ['value_c', 'value_v'])

### TU Data


In [33]:
### We are droppping the NaN values in the following variables ['RespSex', 'RespEdulevel', 'RespPrimOcc', 'HomeAdrMunCode'].
### The purpose is to have a clean dataset base line for the analysis. The set of values represents the common ground for all the datasets.

# List of variables to check for NaN values
check_var = ['RespSex', 'RespEdulevel', 'RespPrimOcc', 'HomeAdrMunCode']

# Iterate through each variable
for var in check_var:
    print(f"\nProcessing variable: {var}")

    # Unique values before removing missing values
    unique_before = session_df[var].unique()
    count_before = len(session_df)

    # Drop rows with missing values in the current variable
    session_df.drop(session_df[session_df[var].isnull()].index, inplace=True)
    print(f"Number of dropped rows: {count_before - len(session_df)}")

    # Unique values after removing missing values
    unique_after = session_df[var].unique()

    # If unique values have decreased, print the unique values
    if len(unique_before) >= len(unique_after):
        print(f"Unique values of '{var}' after removing missing values:")
        print(f"Number of Values ",len(session_df[var].unique()), session_df[var].unique().sort_values().tolist(), )
        print(f"Values dropped: {set(unique_before) ^ set(unique_after)}")
    else:
        print("no values dropped")



Processing variable: RespSex
Number of dropped rows: 0
Unique values of 'RespSex' after removing missing values:
Number of Values  2 [1, 2]
Values dropped: set()

Processing variable: RespEdulevel
Number of dropped rows: 414
Unique values of 'RespEdulevel' after removing missing values:
Number of Values  11 [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0, 11.0, 12.0, 13.0, 14.0]
Values dropped: {nan}

Processing variable: RespPrimOcc
Number of dropped rows: 1392
Unique values of 'RespPrimOcc' after removing missing values:
Number of Values  19 [10.0, 103.0, 107.0, 116.0, 120.0, 130.0, 210.0, 211.0, 221.0, 222.0, 231.0, 232.0, 233.0, 310.0, 320.0, 350.0, 360.0, 370.0, 390.0]
Values dropped: {nan}

Processing variable: HomeAdrMunCode
Number of dropped rows: 0
Unique values of 'HomeAdrMunCode' after removing missing values:
Number of Values  99 [101, 147, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 173, 175, 183, 185, 187, 190, 201, 210, 217, 219, 223, 230, 240, 250, 253, 259, 260, 265, 269, 27

In [34]:
###
### Defining Age Groups
###

# Define age bins and corresponding categories
age_bins = [-1, 15, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 200]
age_cats = ['under 15 years', '15-19 years', '20-24 years', '25-29 years', '30-34 years', '35-39 years',
            '40-44 years', '45-49 years', '50-54 years', '55-59 years', '60-64 years', '65-69 years', 'over 69 years']

# Create a categorical variable 'AgeGroup' based on 'RespAgeCorrect' using the specified bins and labels
session_df['AgeGroup'] = pd.cut(session_df.RespAgeCorrect, age_bins, labels=age_cats)

###
### Modifying the Education variable
###

session_df['Education'] = ''
session_df.loc[session_df['RespEdulevel'].isin([1., 2., 3., 4.]), 'Education'] = 'H10' ### 'H10 Primary education','H90 Not stated'
session_df.loc[session_df['RespEdulevel'] == 5., 'Education'] = 'H20' ### 'H20 Upper secondary education',
session_df.loc[session_df['RespEdulevel'] == 6., 'Education'] = 'H50' ### 'H50 Vocational bachelors educations',
session_df.loc[session_df['RespEdulevel'] == 9., 'Education'] = 'H35' ### 'H35 Qualifying educational programs',
session_df.loc[session_df['RespEdulevel'] == 11., 'Education'] = 'H30' ### 'H30 Vocational Education and Training (VET)',
session_df.loc[session_df['RespEdulevel'] == 12., 'Education'] = 'H40' ### 'H40 Short cycle higher education',
session_df.loc[session_df['RespEdulevel'] == 13., 'Education'] = 'H60' ### 'H60 Bachelors programs',
session_df.loc[session_df['RespEdulevel'] == 14., 'Education'] = 'H99' ### 'H70 Masters programs', 'H80 PhD programs',

session_df['Education'] = session_df['Education'].astype('category')

###
### Modifying the Population Socio variable
###
session_df['PopSocio'] = ''

# Define mappings for occupation codes to PopSocio values
occupation_mappings = {
    '0': [103., 107., 116., 120., 130.], ### 'Enrolled in education': '0',
    '1': [211., 210., 221., 231., 232., 233., 222.], ### 'Employed': '1',
    '2': [310.], ### 'Unemployed':'2',
    '3': [320., 390., 360., 370., 350.], ### 'Outside the labour force':'3'
}

# Use a loop to update 'PopSocio' based on occupation codes
for pop_socio, occupation_codes in occupation_mappings.items():
    session_df.loc[session_df['RespPrimOcc'].isin(occupation_codes), 'PopSocio'] = pop_socio

session_df['PopSocio'] = session_df['PopSocio'].astype('category')

## Merge

In [35]:
### Prepare the dataframes to be merged

### Pop dataframe

pop_df_merge = pop_df[['Year', 'Gender', 'AgeGroup', 'Sector', 'Val', 'Work_c', 'edu_c', 'PopSocio_c']].copy()
pop_rename = {
    'Work_c':'MunicipalityOrigin',
    'edu_c':'Education',
    'PopSocio_c':'PopSocio',
}

pop_df_merge.rename(index=str, columns=pop_rename, inplace=True)


### Session dataframe

session_df_merge = session_df.copy()

session_rename = {
    'PrimOccMuncode': 'MunicipalityDest',
    'HomeAdrMunCode': 'MunicipalityOrigin',
    'RespSex': 'Gender',
    'DiaryYear': 'Year',
}
session_df_merge.rename(index=str, columns=session_rename,  inplace=True)

In [36]:
### Merge the dataframes

# Set common columns used for indexing and merging
idx_list = ['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year']

pop_df_merge = pop_df_merge.groupby(idx_list + ['Sector'], as_index=False, observed=True).sum()

### Calculate the sums and percentages of each group

# Calculate the sums for each group and create a 'sums' column
pop_df_merge['sums'] = pop_df_merge.groupby(idx_list, as_index=False, observed=True)['Val'].transform('sum')

# Calculate the percentage and create a 'percent' column
pop_df_merge['percent'] = pop_df_merge.Val/pop_df_merge.sums


# Convert selected columns to strings and categories
for var in ['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender']:
    pop_df_merge[var] = pop_df_merge[var].astype(str)
    pop_df_merge[var] = pop_df_merge[var].astype('category')
    session_df_merge[var] = session_df_merge[var].astype(str)
    session_df_merge[var] = session_df_merge[var].astype('category')

### Filter and set indices for both DataFrames
# Drop rows with years outside the range 2009-2021 - due to missing data in the population dataset (Denmark Statistics)
session_df_merge_cond = session_df_merge[(session_df_merge['Year'] > 2008) & (session_df_merge['Year'] < 2022)].set_index(idx_list)
pop_df_merge_cond = pop_df_merge.copy().set_index(idx_list)

# Merge the DataFrames using an inner join
merged = session_df_merge_cond.join(pop_df_merge_cond, how='inner')

In [37]:
session_df_merge_cond

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,SessionId,InterviewType,DiaryDate,PseudoYear,DiaryMonth,DiaryWeekday,DiaryDaytype,HomeAdrNUTS,HomeAdrCityCode,HomeAdrCitySize,...,JstartDistNearestStation,DayJourneyType,DayPrimTargetMuncode,DayPrimTargetPurp,SessionWeight,WeightOver6,CarModelYear,FuelType,KidsBetween0and15,KidsBetween0and4
MunicipalityOrigin,Education,PopSocio,AgeGroup,Gender,Year,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
540,H30,3,65-69 years,1,2009,141381,0,14245,2008/9,1,4,32,DK032,11015.0,3310,...,-1.0,2,540.0,41.0,345.950094,-1.000000,2005,2.0,0,0
219,H30,1,45-49 years,2,2009,141385,2,14245,2008/9,1,4,32,DK013,10076.0,2334,...,1.0,12,270.0,11.0,225.445616,-1.000000,2004,1.0,1,0
259,H50,1,40-44 years,2,2009,141387,2,14245,2008/9,1,4,32,DK021,10048.0,703,...,4.3,21,259.0,41.0,268.293655,-1.000000,2001,1.0,1,0
849,H50,1,30-34 years,1,2009,141388,2,14245,2008/9,1,4,32,DK050,11269.0,2781,...,23.9,21,849.0,41.0,169.162727,-1.000000,1988,2.0,1,1
159,H60,1,45-49 years,2,2009,141389,2,14245,2008/9,1,4,32,DK012,1100.0,1167569,...,0.2,11,173.0,41.0,390.421343,-1.000000,-1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,H10,3,over 69 years,2,2017,492153,1,17513,2017/18,12,3,11,DK013,10303.0,47294,...,1.6,11,223.0,33.0,691.723248,690.727223,-1,,0,0
461,H99,1,55-59 years,1,2016,492158,1,16814,2015/16,1,4,11,DK031,10677.0,175245,...,3.0,11,330.0,11.0,570.570769,76.887354,2008,2.0,0,0
167,H99,1,40-44 years,2,2016,492160,1,16809,2015/16,1,6,23,DK012,1100.0,1280371,...,1.0,11,101.0,31.0,482.031175,513.539634,2007,1.0,2,0
787,H30,1,55-59 years,1,2012,492359,1,15372,2011/12,2,4,11,DK050,11008.0,251,...,0.4,11,787.0,61.0,697.438411,-1.000000,2008,1.0,0,0


In [38]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1800612 entries, ('101', 'H10', '0', '15-19 years', '1', 2009) to ('860', 'H60', '3', '65-69 years', '2', 2021)
Columns: 119 entries, SessionId to percent
dtypes: category(56), float64(12), int64(51)
memory usage: 995.3 MB


In [39]:
### Sample the Sector variable based on the percentages of each group

# Keep only the columns needed for sampling
merged = merged[['SessionId', 'Sector', 'percent']].reset_index(drop=True)

# Define a sampling function for aggregation
def groupby_sample(x):
    if all(x.percent == 0.):  # Check if all percent values are 0
        return np.random.choice(x.Sector)
    else:
        return np.random.choice(x.Sector, p=x.percent)


# Apply the sampling function to each group
samp_df = merged.groupby('SessionId', as_index=False).apply(groupby_sample).reset_index()
samp_df = samp_df.drop(['index'], axis=1)
samp_df.columns = ['SessionId', 'Sector']

# Merge the sampled DataFrame with the original DataFrame
session_samp_df = samp_df.merge(session_df_merge_cond.reset_index(),on='SessionId', how='inner')

under16 = session_df_merge_cond.loc[session_df_merge_cond['RespAgeCorrect'] < 16]
over69 = session_df_merge_cond.loc[session_df_merge_cond['RespAgeCorrect'] > 69]

excluded_ageGroups = pd.concat([under16, over69]).reset_index()
session_allAges_df = pd.concat([excluded_ageGroups, session_samp_df])

In [40]:
session_allAges_df.head()

Unnamed: 0,MunicipalityOrigin,Education,PopSocio,AgeGroup,Gender,Year,SessionId,InterviewType,DiaryDate,PseudoYear,...,DayJourneyType,DayPrimTargetMuncode,DayPrimTargetPurp,SessionWeight,WeightOver6,CarModelYear,FuelType,KidsBetween0and15,KidsBetween0and4,Sector
0,101,H10,0,under 15 years,2,2009,141393,2,14245,2008/9,...,11,147.0,11.0,637.632539,-1.0,2001,1.0,5,1,
1,101,H10,0,under 15 years,1,2009,141400,2,14245,2008/9,...,21,101.0,41.0,324.925541,-1.0,-1,,1,0,
2,846,H10,3,under 15 years,1,2009,141405,2,14245,2008/9,...,1,846.0,1.0,371.440444,-1.0,2005,1.0,1,0,
3,630,H10,0,under 15 years,2,2009,141425,0,14245,2008/9,...,21,630.0,41.0,449.963478,-1.0,2003,1.0,1,0,
4,791,H10,0,under 15 years,2,2009,141463,0,14246,2008/9,...,11,791.0,43.0,348.350677,-1.0,2007,2.0,0,0,


In [41]:
zones = pd.read_csv('OTM_Zones_SesID_new.csv', sep=',')
zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189365 entries, 0 to 189364
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   SessionId     189365 non-null  int64  
 1   Homeadr_OTM   63052 non-null   float64
 2   PrimOcc_OTM   47229 non-null   float64
 3   SDU_OTM       46066 non-null   float64
 4   DayStart_OTM  60396 non-null   float64
 5   JStart_OTM    56898 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 8.7 MB


In [42]:
session_Zones_df = session_allAges_df.merge(zones, on='SessionId', how='left')

gmmZone_list = ['DayStartGMMzone', 'HomeAdrGMMzone',
                'JstartGMMzone', 'PrimOccGMMzone', 'SduGMMzone']
session_Zones_df[gmmZone_list] = session_Zones_df[gmmZone_list].astype('float64')

mapping_columns = {'DayStart_OTM': 'DayStartGMMzone', 'Homeadr_OTM': 'HomeAdrGMMzone',
                   'JStart_OTM': 'JstartGMMzone', 'PrimOcc_OTM': 'PrimOccGMMzone', 'SDU_OTM': 'SduGMMzone'}

for index, row in session_Zones_df.iterrows():
    for otm_col, gmm_col in mapping_columns.items():
        if not pd.isnull(row[otm_col]):
            session_Zones_df.at[index, gmm_col] = row[otm_col]

session_Zones_df[gmmZone_list] = session_Zones_df[gmmZone_list].astype('category')
session_Zones_df.rename(columns={'DayStartGMMzone': 'DayStartZone', 'HomeAdrGMMzone': 'HomeAdrZone',
                         'JstartGMMzone': 'JstartZone', 'PrimOccGMMzone': 'PrimOccZone', 'SduGMMzone': 'SduZone'}, inplace=True)
session_Zones_df.drop(['DayStart_OTM', 'Homeadr_OTM', 'JStart_OTM',
              'PrimOcc_OTM', 'SDU_OTM'], axis=1, inplace=True)

In [43]:
total_df = session_Zones_df.replace(-1, np.nan)
total_df.describe()._append(samp_df.isnull().sum().rename('isnull'))

Unnamed: 0,Year,SessionId,DiaryDate,HomeAdrCitySize,HomeAdrDistNearestStation,RespYearBorn,RespAgeSimple,RespAgeCorrect,WorkHoursPw,WorkatHomeDayspM,...,TotalGramCO2eq,TotalFuelConsumpMJ,DayNumJourneys,JstartDistNearestStation,SessionWeight,WeightOver6,CarModelYear,KidsBetween0and15,KidsBetween0and4,Sector
count,139632.0,139632.0,139632.0,117896.0,138234.0,139632.0,139632.0,139632.0,63277.0,57953.0,...,139632.0,139632.0,139632.0,123604.0,135811.0,54089.0,112012.0,139632.0,139632.0,
mean,2014.125458,306595.478164,16298.627363,321060.0,4.377234,1969.573278,44.55218,44.067134,36.813534,0.811209,...,4250.973172,59.492327,1.187217,4.407426,414.273581,531.482729,2007.106792,0.515297,0.105019,
std,3.990929,111595.320298,1460.510413,516411.7,5.823482,22.026797,21.563422,21.566361,8.959536,3.242307,...,17724.888952,247.697343,0.865417,5.854146,284.94559,317.680621,6.902751,0.86374,0.361606,
min,2009.0,141381.0,14245.0,200.0,0.0,1912.0,6.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.000984,1.000778,1903.0,0.0,0.0,
25%,2010.0,189392.75,14934.0,3798.0,0.8,1951.0,25.0,24.0,36.0,0.0,...,0.0,0.0,1.0,0.8,201.050622,331.602883,2003.0,0.0,0.0,
50%,2014.0,325390.5,16082.0,27194.0,1.9,1968.0,46.0,45.0,37.0,0.0,...,0.0,0.0,1.0,1.9,366.487689,482.310741,2007.0,0.0,0.0,
75%,2018.0,413973.25,17648.0,261570.0,5.5,1990.0,62.0,62.0,37.0,0.0,...,3428.1758,47.63937,2.0,5.6,555.511004,670.061898,2012.0,1.0,0.0,
max,2021.0,492360.0,18992.0,1336982.0,78.0,2015.0,108.0,107.0,168.0,99.0,...,760803.38,10743.8,11.0,78.0,4328.352322,5500.727074,2021.0,8.0,3.0,
isnull,,0.0,,,,,,,,,...,,,,,,,,,,0.0


In [44]:
simulation_values = ['Year', 'SessionId', 'Gender', 'RespAgeCorrect', 'RespEdulevel', 'AgeGroup', 'Education', 'Handicap', 'PopSocio', 'RespPrimOcc', 'Sector',
                     'FamNumAdults', 'FamNumPers', 'KidsBetween0and15', 'KidsBetween0and4',
                     'HousehNumAdults', 'HousehNumPers', 'HousehNumcars', 'HousehCarOwnership', 'CarModelYear', 'FuelType',
                     'IncRespondent2000', 'IncFamily2000', 'IncHouseh2000', 'IncSpouse2000',
                     'MunicipalityOrigin', 'MunicipalityDest', 'HomeAdrZone', 'PrimOccZone',
                     'HwDayspW', 'WorkHoursPw', 'WorkHourType',
                     'RespHasBicycle', 'ResphasDrivlic', 'RespHasRejsekort', 'RespIsmemCarshare', 

                     'HomeParkPoss', 'RespHasSeasonticket', 'HousehAccomodation', 'HousehAccOwnorRent', 
                     'PosInFamily', 'PrimModeDay','ModeChainTypeDay']
                    

sim_df = total_df[simulation_values].copy()

# These do not make sense, since they have attributes that are only specific for employees
# 'WorkPubPriv', 'WorkParkPoss',

# These variable have to specific attribvutes that do not fit fot the simulation model
# 'NuclFamType',

# These day variables are not relevant
# 'DayJourneyType', 'DayPrimTargetPurp', 'DayStartJourneyRole', 'DayStartPurp',

In [45]:
sim_df

Unnamed: 0,Year,SessionId,Gender,RespAgeCorrect,RespEdulevel,AgeGroup,Education,Handicap,PopSocio,RespPrimOcc,...,ResphasDrivlic,RespHasRejsekort,RespIsmemCarshare,HomeParkPoss,RespHasSeasonticket,HousehAccomodation,HousehAccOwnorRent,PosInFamily,PrimModeDay,ModeChainTypeDay
0,2009,141393,2,15,3.0,under 15 years,H10,2.0,0,107.0,...,-18.0,,,,2.0,1.0,1.0,20.0,32.0,130.0
1,2009,141400,1,14,2.0,under 15 years,H10,2.0,0,116.0,...,-18.0,,,,2.0,3.0,3.0,20.0,32.0,110.0
2,2009,141405,1,14,3.0,under 15 years,H10,2.0,3,390.0,...,-18.0,,,,2.0,1.0,1.0,20.0,,
3,2009,141425,2,10,1.0,under 15 years,H10,2.0,0,107.0,...,-18.0,,,,1.0,4.0,1.0,20.0,1.0,1.0
4,2009,141463,2,12,1.0,under 15 years,H10,2.0,0,107.0,...,-18.0,,,,2.0,4.0,2.0,20.0,11.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139627,2015,489506,1,49,13.0,45-49 years,H60,2.0,1,210.0,...,1.0,,2.0,4.0,2.0,1.0,1.0,11.0,11.0,11.0
139628,2021,491403,2,22,5.0,20-24 years,H20,2.0,0,120.0,...,1.0,30.0,2.0,212.0,2.0,,,10.0,33.0,110.0
139629,2020,492043,1,67,13.0,65-69 years,H60,2.0,1,221.0,...,1.0,2.0,2.0,4.0,2.0,1.0,,11.0,11.0,11.0
139630,2018,492046,2,17,3.0,15-19 years,H10,2.0,0,116.0,...,1.0,99.0,2.0,5.0,1.0,1.0,1.0,20.0,11.0,21.0


In [46]:
count_nan_values(sim_df)

                 Column  NaN Count
0           HomeAdrZone          1
1   RespHasSeasonticket          1
2        RespHasBicycle          1
3         HousehNumcars          2
4              Handicap          5
5        ResphasDrivlic          6
6    HousehCarOwnership          9
7         HousehNumPers         29
8            FamNumPers         29
9          FamNumAdults         29
10          PosInFamily         31
11      HousehNumAdults        106
12             FuelType      21671
13   HousehAccomodation      22782
14   HousehAccOwnorRent      22811
15     ModeChainTypeDay      24213
16          PrimModeDay      24213
17         CarModelYear      27620
18    RespIsmemCarshare      37901
19               Sector      38382
20    IncRespondent2000      41756
21     MunicipalityDest      42754
22          PrimOccZone      43259
23             HwDayspW      48044
24        IncFamily2000      54314
25        IncHouseh2000      54327
26         HomeParkPoss      65384
27          WorkHour

In [47]:
def replace_nans_categories(df, column_name):
    # Copy the DataFrame
    filtered_df = df[['SessionId', 'MunicipalityOrigin', 'Education',
                      'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name]].copy()

    # Set indices for both DataFrames
    idx = ['MunicipalityOrigin', 'Education','PopSocio', 'AgeGroup', 'Gender', 'Year']

    # Calculate group sizes
    group_df = filtered_df.copy().drop(['SessionId'], axis=1)
    group_df['counter'] = 1
    group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],
                                      as_index=False, dropna=True).sum()
    group_df['sums'] = group_df.groupby(['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup',
                                        'Gender', 'Year'], as_index=False, observed=True)['counter'].transform('sum')

    group_df['sums'] = group_df['sums'].replace(0, 1)
    group_df['Percentage'] = (group_df.counter / group_df.sums)
    group_df.drop(['counter', 'sums'], axis=1, inplace=True)

    # Set indices for both DataFrames

    nan_df = filtered_df.loc[filtered_df[column_name].isna()==True]

    nan_df = nan_df.drop(columns=[column_name])
    group_df_c = group_df.set_index(idx)
    nan_df_c = nan_df.set_index(idx)

    # Join DataFrames
    matched = nan_df_c.join(group_df_c, how='inner')

    # Define a sampling function for aggregation
    def groupby_sample(x):
        
        if all(x.Percentage == 0.):  # Check if all percent values are 0
            return np.random.choice(x[column_name])
        else:
            return np.random.choice(x[column_name], p=x.Percentage)

    # Apply the sampling function
    matched = matched.groupby('SessionId', as_index=False).apply(groupby_sample).reset_index()
    matched = matched.drop(['index'], axis=1)
    matched.columns = ['SessionId', column_name]

    # Merge the results back to the original DataFrame
    df = df.merge(matched, on='SessionId', how='left')
    df[column_name] = df[column_name + '_x'].fillna(df[column_name + '_y'])
    df = df.drop([column_name + '_x', column_name + '_y'], axis=1)

    return df

In [48]:
def replace_nans_numerical(df, grouping_vars = ['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup','Gender']):

    categories_list = df.select_dtypes(include='category').columns.tolist()
    numericals_list = df.select_dtypes(exclude='category').columns[df.select_dtypes(exclude='category').isna().any()].tolist()


    # Divide the dataset
    df_auxiliar = df.drop(columns = numericals_list) # Dataframe which we don't modify
    df_missing  = df[grouping_vars + numericals_list + ['SessionId']] # Dataframe with missings

    # NaN replacement
    f = lambda x: x.fillna(np.random.choice(x)) # Function to fill the na values using a random element 
    df_missing = df_missing.groupby(grouping_vars).transform(f) # Applying the function to the grouping
    
    return df_missing.merge(df_auxiliar, on='SessionId')

In [49]:
def fill_missing_values(df, column, pop_socio_categories=['0', '1', '2', '3']):
    df[column] = df[column].cat.add_categories(['MISSING'])
    condition = (df['PopSocio'].isin(pop_socio_categories)) & (df[column].isna())
    df.loc[condition, column] = 'MISSING'
    return df

### Replace NaNs in categories


In [50]:
### Replace nans in RespHasRejsekort
#sim_df[['SessionId','PopSocio', 'RespHasRejsekort']].loc[sim_df['RespHasRejsekort'].isna() == True]
sim_df = replace_nans_categories(sim_df, 'RespHasRejsekort')

### Replace Nans in RespIsmemCarshar
# sim_df[['SessionId','PopSocio', 'RespIsmemCarshare']].loc[sim_df['RespIsmemCarshare'].isna() == True]
sim_df = replace_nans_categories(sim_df, 'RespIsmemCarshare')

### Replace Nans in Sector
# sim_df[['SessionId','PopSocio', 'Sector']].loc[sim_df['Sector'].isna() == True]
sim_df.loc[(sim_df['PopSocio'].isin(['2', '3'])) & (sim_df['Sector'].isna()), 'Sector'] = 'Activity not stated'
sim_df.loc[(sim_df['PopSocio'].isin(['0'])) & (sim_df['Sector'].isna()), 'Sector'] = 'Education'
sim_df = replace_nans_categories(sim_df, 'Sector')

### Replace Nans in HwDayspW
# sim_df[['SessionId','PopSocio', 'HwDayspW']].loc[sim_df['HwDayspW'].isna() == False]
sim_df.loc[(sim_df['PopSocio'].isin(['0', '2', '3'])) &(sim_df['HwDayspW'].isna()), 'HwDayspW'] = 0
sim_df = replace_nans_categories(sim_df, 'HwDayspW')

### Replace Nans in WorkHourType
# sim_df[['SessionId','PopSocio', 'WorkHourType']].loc[sim_df['WorkHourType'].isna() == False]
sim_df['WorkHourType'] = sim_df['WorkHourType'].cat.add_categories(['MISSING'])
sim_df['WorkHourType'] = sim_df['WorkHourType'].cat.add_categories(['Not working'])
sim_df.loc[(sim_df['PopSocio'].isin(['0', '2', '3'])) & (sim_df['WorkHourType'].isna()), 'WorkHourType'] = 'Not working'
sim_df.loc[(sim_df['PopSocio'].isin(['1'])) & (sim_df['WorkHourType'].isna()), 'WorkHourType'] = 'MISSING'

### Replace Nans in FuelType
# sim_df[['SessionId','PopSocio', 'FuelType']].loc[(sim_df['FuelType'].isna() == True) & (sim_df['HousehNumcars'] >0)]
# fueltype_to_sample = sim_df.loc[(sim_df['FuelType'].isna()) & (sim_df['HousehNumcars'] > 0)]
# sim_df.loc[fueltype_to_sample.index, 'FuelType'] = np.random.choice(sim_df['FuelType'].dropna().unique(), size=len(fueltype_to_sample))
sim_df['FuelType'] = sim_df['FuelType'].cat.add_categories(['NoCar'])
sim_df.loc[(sim_df['FuelType'].isna()) & (sim_df['HousehNumcars'] == 0), 'FuelType'] = 'NoCar'
sim_df = replace_nans_categories(sim_df, 'FuelType')

### Replace Nans in CarModelYear
carmodelyear_to_sample = sim_df.loc[(sim_df['CarModelYear'].isna()) & (sim_df['HousehNumcars'] > 0)]
sim_df.loc[carmodelyear_to_sample.index, 'CarModelYear'] = np.random.choice(sim_df['CarModelYear'].dropna().unique(), size=len(carmodelyear_to_sample))
sim_df.loc[(sim_df['CarModelYear'].isna()) & (sim_df['HousehNumcars'] == 0), 'CarModelYear'] = -1
sim_df['CarModelYear'] = sim_df['CarModelYear'].astype('category')

### Replace Nans in MunicipalityDest
# Replacing Nans with 'Missing' based on the PopSocio category
columns_to_process = ['MunicipalityDest','PrimOccZone', 'HomeAdrZone']  # Add all columns to process

for col in columns_to_process:
    sim_df = fill_missing_values(sim_df, col)

  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],
  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],
  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],
  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],
  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


In [51]:
count_nan_values(sim_df)

                 Column  NaN Count
0   RespHasSeasonticket          1
1        RespHasBicycle          1
2         HousehNumcars          2
3          CarModelYear          2
4              Handicap          5
5        ResphasDrivlic          6
6    HousehCarOwnership          9
7          FamNumAdults         29
8            FamNumPers         29
9         HousehNumPers         29
10          PosInFamily         31
11      HousehNumAdults        106
12   HousehAccomodation      22782
13   HousehAccOwnorRent      22811
14     ModeChainTypeDay      24213
15          PrimModeDay      24213
16    IncRespondent2000      41756
17        IncFamily2000      54314
18        IncHouseh2000      54327
19         HomeParkPoss      65384
20          WorkHoursPw      76355
21        IncSpouse2000      87322


In [52]:
new_cat = ['HomeParkPoss', 'RespHasSeasonticket', 'HousehAccomodation', 'HousehAccOwnorRent', 'PosInFamily', 'PrimModeDay','ModeChainTypeDay']

In [53]:
for i in new_cat:
    print(i)
    sim_df = replace_nans_categories(sim_df, i)


HomeParkPoss


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


RespHasSeasonticket


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


HousehAccomodation


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


HousehAccOwnorRent


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


PosInFamily


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


PrimModeDay


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


ModeChainTypeDay


  group_df = group_df.groupby(by=['MunicipalityOrigin', 'Education', 'PopSocio', 'AgeGroup', 'Gender', 'Year', column_name],


### Replace NaNs in Numerical columns

In [54]:
### Replace Nans in WorkHoursPw
# sim_df[['PopSocio','WorkHoursPw','RespAgeCorrect']].loc[(sim_df['WorkHoursPw'].isna()==True) & (sim_df['PopSocio']!='1')]
sim_df.loc[(sim_df['PopSocio'].isin(['0', '2', '3'])) & (sim_df['WorkHoursPw'].isna()), 'WorkHoursPw'] = 0

In [55]:
count_nan_values(sim_df)

                Column  NaN Count
0       RespHasBicycle          1
1        HousehNumcars          2
2         CarModelYear          2
3             Handicap          5
4       ResphasDrivlic          6
5   HousehCarOwnership          9
6         FamNumAdults         29
7           FamNumPers         29
8        HousehNumPers         29
9      HousehNumAdults        106
10         WorkHoursPw        443
11   IncRespondent2000      41756
12       IncFamily2000      54314
13       IncHouseh2000      54327
14       IncSpouse2000      87322


In [56]:
# Add additional binary variable to identify missing income
sim_df["MissIncome"] = np.where(sim_df["IncRespondent2000"].isna(), 1, 0)

# Limit outliers
sim_df["IncRespondent2000"] = sim_df["IncRespondent2000"].loc[(sim_df["IncRespondent2000"] >= 0) & (sim_df["IncRespondent2000"] <= 7000)]
sim_df["IncFamily2000"] = sim_df["IncFamily2000"].loc[(sim_df["IncFamily2000"] >= 0) & (sim_df["IncFamily2000"] <= 9000)]

In [57]:
def prepare_data(df):
    df_sel = df[['SessionId', 'MunicipalityOrigin', 'AgeGroup', 'Gender', 'PopSocio', 'IncRespondent2000', 'IncFamily2000']].copy()

    df_filtered = df_sel.loc[(df_sel['IncRespondent2000'].notna()) & (df_sel['IncFamily2000'].notna())]
    df_nan = df_sel[df_sel['IncRespondent2000'].isna() & df_sel['IncFamily2000'].isna()]

    return df_filtered, df_nan


def replace_nan_values_4index(df_filtered, df_nan):
    inc_idx = ['MunicipalityOrigin', 'PopSocio', 'AgeGroup', 'Gender']
    inc_group = df_nan.groupby(inc_idx, as_index=False).count()

    return_df = pd.DataFrame()

    for row in inc_group.iterrows():
        group_notnan = df_filtered.loc[(df_filtered['MunicipalityOrigin'] == row[1]['MunicipalityOrigin']) & (
            df_filtered['PopSocio'] == row[1]['PopSocio']) & (df_filtered['AgeGroup'] == row[1]['AgeGroup']) & (df_filtered['Gender'] == row[1]['Gender'])]

        group_nan = df_nan.loc[(df_nan['MunicipalityOrigin'] == row[1]['MunicipalityOrigin']) & (
            df_nan['PopSocio'] == row[1]['PopSocio']) & (df_nan['AgeGroup'] == row[1]['AgeGroup']) & (df_nan['Gender'] == row[1]['Gender'])]

        if len(group_notnan) != 0:
            replacement_values = group_notnan[['IncRespondent2000', 'IncFamily2000']].sample(
                n=len(group_nan), replace=True, ignore_index=True).values

            group_nan['IncRespondent2000'] = replacement_values[:, 0]
            group_nan['IncFamily2000'] = replacement_values[:, 1]

        return_df = pd.concat([return_df, group_nan])

    return return_df

def replace_nan_values_3index(df_filtered, df_nan):
    inc_idx = ['PopSocio', 'AgeGroup', 'Gender']
    inc_group = df_nan.groupby(inc_idx, as_index=False).count()

    return_df = pd.DataFrame()

    for row in inc_group.iterrows():
        group_notnan = df_filtered.loc[(df_filtered['PopSocio'] == row[1]['PopSocio']) & (df_filtered['AgeGroup'] == row[1]['AgeGroup']) & (df_filtered['Gender'] == row[1]['Gender'])]

        group_nan = df_nan.loc[(df_nan['PopSocio'] == row[1]['PopSocio']) & (df_nan['AgeGroup'] == row[1]['AgeGroup']) & (df_nan['Gender'] == row[1]['Gender'])]

        if len(group_notnan) != 0:
            replacement_values = group_notnan[['IncRespondent2000', 'IncFamily2000']].sample(
                n=len(group_nan), replace=True, ignore_index=True).values

            group_nan['IncRespondent2000'] = replacement_values[:, 0]
            group_nan['IncFamily2000'] = replacement_values[:, 1]

        return_df = pd.concat([return_df, group_nan])

    return return_df


def rename_merge_columns(df, replaced_nan_df):
    replaced_nan_df.rename(columns={'IncRespondent2000': 'IncomePerson', 'IncFamily2000': 'IncomeFam'}, inplace=True)
    replaced_nan_df = replaced_nan_df[['SessionId', 'IncomePerson', 'IncomeFam']].reset_index()

    renamed_df = pd.merge(df, replaced_nan_df, on='SessionId', how='left')
    return renamed_df


def fill_drop_columns(df):
    df['IncRespondent2000'].fillna(df['IncomePerson'], inplace=True)
    df['IncFamily2000'].fillna(df['IncomeFam'], inplace=True)
    df.drop(columns=['IncomeFam', 'IncomePerson','index'], inplace=True)
    return df


In [58]:
### Replace Nans in IncRespondent2000 and IncFamily2000 based on dependent logic

df_notnan, df_nan = prepare_data(sim_df)
replaced_nan_df = replace_nan_values_4index(df_notnan, df_nan)
sim_df_sample_incFam = rename_merge_columns(sim_df, replaced_nan_df)
sim_df_sample_incFam= fill_drop_columns(sim_df_sample_incFam)

sim_df_sample_incFam.loc[(sim_df_sample_incFam['IncSpouse2000'].isna() == False) & 
               (sim_df_sample_incFam['IncFamily2000'].isna()) & 
               (sim_df_sample_incFam['IncRespondent2000'].isna() == False), 'IncFamily2000'] = sim_df_sample_incFam.IncRespondent2000 + sim_df_sample_incFam.IncSpouse2000

sim_df_sample_incFam.loc[(sim_df_sample_incFam['IncFamily2000'].isna()) & 
               (sim_df_sample_incFam['IncRespondent2000'].isna() == False) &
               (sim_df_sample_incFam['IncSpouse2000'].isna()), 'IncFamily2000'] = sim_df_sample_incFam.IncRespondent2000

df_notnan, df_nan = prepare_data(sim_df_sample_incFam)
replaced_nan_df = replace_nan_values_3index(df_notnan, df_nan)
sim_df_sample_incResp = rename_merge_columns(sim_df_sample_incFam, replaced_nan_df)
sim_df_sample_incResp= fill_drop_columns(sim_df_sample_incResp)

sim_df_sample_incResp.loc[(sim_df_sample_incResp['IncSpouse2000'].isna() == False) & 
               (sim_df_sample_incResp['IncRespondent2000'].isna()) & 
               (sim_df_sample_incResp['IncFamily2000'].isna() == False), 'IncRespondent2000'] = sim_df_sample_incResp.IncFamily2000 - sim_df_sample_incResp.IncSpouse2000

sim_df_sample_incResp.loc[(sim_df_sample_incResp['IncRespondent2000'].isna()) & 
               (sim_df_sample_incResp['IncFamily2000'].isna() == False) &
               (sim_df_sample_incResp['IncSpouse2000'].isna()), 'IncRespondent2000'] = sim_df_sample_incResp.IncFamily2000
               
sim_df_processed =  sim_df_sample_incResp.drop(columns=['IncSpouse2000','IncHouseh2000'])

  inc_group = df_nan.groupby(inc_idx, as_index=False).count()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_nan['IncRespondent2000'] = replacement_values[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_nan['IncFamily2000'] = replacement_values[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_nan['IncRespondent2000'] 

In [59]:
sim_df_processed = replace_nans_numerical(sim_df_processed)
sim_df_processed = replace_nans_numerical(sim_df_processed, grouping_vars = ['MunicipalityOrigin', 'Education', 'PopSocio'])
sim_df_processed = replace_nans_numerical(sim_df_processed, grouping_vars = ['MunicipalityOrigin', 'PopSocio'])
sim_df_processed = replace_nans_numerical(sim_df_processed, grouping_vars = ['PopSocio'])

  df_missing = df_missing.groupby(grouping_vars).transform(f) # Applying the function to the grouping
  df_missing = df_missing.groupby(grouping_vars).transform(f) # Applying the function to the grouping
  df_missing = df_missing.groupby(grouping_vars).transform(f) # Applying the function to the grouping
  df_missing = df_missing.groupby(grouping_vars).transform(f) # Applying the function to the grouping


In [60]:
count_nan_values(sim_df_processed)

           Column  NaN Count
0  RespHasBicycle          1
1    CarModelYear          2
2        Handicap          5
3  ResphasDrivlic          6


### Finally drop all rows that still include an NaN

In [61]:
final_df = sim_df_processed.dropna()

#### Check for Nan Values

In [62]:
count_nan_values(final_df)

Empty DataFrame
Columns: [Column, NaN Count]
Index: []


### Safe Dataframe


In [63]:
### Drop columns with high correlation

final_df.drop(columns=['RespEdulevel', 'AgeGroup','SessionId'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(columns=['RespEdulevel', 'AgeGroup'], inplace=True)


In [None]:
# %cd /home/s212574/snap/snapd-desktop-integration/83/Documents/Thesis/MSc_PopSyn/Sigga_Luis/Data

In [69]:
os.getcwd()


'/Users/luis/Desktop/Data_extracted'

In [64]:
final_df

Unnamed: 0,SessionId,FamNumPers,HousehNumAdults,IncFamily2000,WorkHoursPw,FamNumAdults,HousehNumPers,HousehCarOwnership,IncRespondent2000,HousehNumcars,...,HwDayspW,FuelType,HomeParkPoss,RespHasSeasonticket,HousehAccomodation,HousehAccOwnorRent,PosInFamily,PrimModeDay,ModeChainTypeDay,MissIncome
0,141393,8.0,2.0,405.0,0.0,2.0,8.0,1.0,0.0,1.0,...,5.0,1.0,133.0,2.0,1.0,1.0,20.0,32.0,130.0,1
1,141400,3.0,1.0,236.0,0.0,1.0,3.0,0.0,0.0,0.0,...,5.0,NoCar,112.0,2.0,3.0,3.0,20.0,32.0,110.0,1
2,141405,4.0,2.0,424.0,0.0,2.0,4.0,0.0,424.0,1.0,...,0.0,1.0,233.0,2.0,1.0,1.0,20.0,12.0,50.0,1
3,141425,4.0,2.0,0.0,0.0,2.0,4.0,2.0,0.0,2.0,...,5.0,1.0,212.0,1.0,4.0,1.0,20.0,1.0,1.0,0
4,141463,3.0,2.0,509.0,0.0,2.0,3.0,1.0,0.0,1.0,...,5.0,2.0,233.0,2.0,4.0,2.0,20.0,11.0,21.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139627,489506,4.0,3.0,1064.0,37.0,3.0,4.0,1.0,608.0,1.0,...,5.0,2.0,4.0,2.0,1.0,1.0,11.0,11.0,11.0,0
139628,491403,1.0,1.0,72.0,0.0,1.0,1.0,0.0,72.0,0.0,...,4.0,NoCar,212.0,2.0,3.0,3.0,10.0,33.0,110.0,0
139629,492043,2.0,2.0,552.0,50.0,2.0,2.0,2.0,368.0,2.0,...,5.0,2.0,4.0,2.0,1.0,3.0,11.0,11.0,11.0,0
139630,492046,5.0,5.0,173.0,0.0,5.0,5.0,2.0,0.0,2.0,...,5.0,1.0,5.0,1.0,1.0,1.0,20.0,11.0,21.0,1


In [65]:
final_df.to_csv('simulationData.csv', sep=',', index=False)

In [66]:
f = pd.read_csv('simulationData.csv', sep=',')

  f = pd.read_csv('simulationData.csv', sep=',')


In [67]:
f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139619 entries, 0 to 139618
Data columns (total 40 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   SessionId            139619 non-null  int64  
 1   FamNumPers           139619 non-null  float64
 2   HousehNumAdults      139619 non-null  float64
 3   IncFamily2000        139619 non-null  float64
 4   WorkHoursPw          139619 non-null  float64
 5   FamNumAdults         139619 non-null  float64
 6   HousehNumPers        139619 non-null  float64
 7   HousehCarOwnership   139619 non-null  float64
 8   IncRespondent2000    139619 non-null  float64
 9   HousehNumcars        139619 non-null  float64
 10  Year                 139619 non-null  int64  
 11  Gender               139619 non-null  int64  
 12  RespAgeCorrect       139619 non-null  int64  
 13  Education            139619 non-null  object 
 14  Handicap             139619 non-null  float64
 15  PopSocio         

In [68]:
f.columns.tolist()

['SessionId',
 'FamNumPers',
 'HousehNumAdults',
 'IncFamily2000',
 'WorkHoursPw',
 'FamNumAdults',
 'HousehNumPers',
 'HousehCarOwnership',
 'IncRespondent2000',
 'HousehNumcars',
 'Year',
 'Gender',
 'RespAgeCorrect',
 'Education',
 'Handicap',
 'PopSocio',
 'RespPrimOcc',
 'KidsBetween0and15',
 'KidsBetween0and4',
 'CarModelYear',
 'MunicipalityOrigin',
 'MunicipalityDest',
 'HomeAdrZone',
 'PrimOccZone',
 'WorkHourType',
 'RespHasBicycle',
 'ResphasDrivlic',
 'RespHasRejsekort',
 'RespIsmemCarshare',
 'Sector',
 'HwDayspW',
 'FuelType',
 'HomeParkPoss',
 'RespHasSeasonticket',
 'HousehAccomodation',
 'HousehAccOwnorRent',
 'PosInFamily',
 'PrimModeDay',
 'ModeChainTypeDay',
 'MissIncome']

In [72]:
final_df.HousehCarOwnership.unique()

array([ 1.,  0.,  2.,  3.,  4.,  5., 20.,  6.,  9.,  7., 10.,  8., 12.,
       11.])

In [74]:
session_df.HousehCarOwnership.unique()

array([ 1,  2,  0,  3,  4,  5, -1,  7,  6,  8, 10,  9, 12, 20, 11, 16])