In [None]:
## This version flags children dependency when people aged < 16 are in the household.

## In previous versions this age threshold was < 18.

In [None]:
import pandas as pd
import numpy as np
import os
import random

## Read PSU file

In [None]:
# Location of the tab-separated PSU file.
psu_dir = r''  # use your path

# Only the survey year, the PSU identifier and the PSUGOR_B02ID region code are loaded.
psu_columns = ['SurveyYear', 'PSUID', 'PSUGOR_B02ID']
df_psu = pd.read_csv(psu_dir, sep='\t', usecols=psu_columns)


In [None]:
len(df_psu)

## Select those PSU areas relevant for the project (all England except London)

In [None]:
# PSUGOR_B02ID value labels:
#    1 = North East
#    2 = North West
#    3 = Yorkshire and the Humber
#    4 = East Midlands
#    5 = West Midlands
#    6 = East of England
#    7 = London
#    8 = South East
#    9 = South West
#   10 = Wales
#   11 = Scotland
#  -10 = DEAD
#   -9 = DNA
#   -8 = NA
#
# The project area is England except London, i.e. every code from 1 to 9 other than 7.
area_list = [1, 2, 3, 4, 5, 6, 8, 9]

in_project_area = df_psu['PSUGOR_B02ID'].isin(area_list)
df_psu_area = df_psu[in_project_area]

In [None]:
len(df_psu_area)

## Read day file

In [None]:
# Import the file with the days related to each individual
days_dir = r''  # use your path

# Columns read from the day file.
days_columns = [
    'DayID',                # ID given to all trips made by an individual on a given travel day - Created in SQL
    'IndividualID',         # Individual unique ID - Created in SQL
    'HouseholdID',          # Household unique ID - Created in SQL
    'PSUID',                # PSU unique ID - Created in SQL
    'TravelWeekDay_B01ID',  # Day of week trip took place
    'TravelDayType_B01ID',  # Type of day trip took place on (2008 onwards)
    'TravelMonth_B01ID',    # Month of year trip took place - coded month
    'TravelYear',           # Year of trip
    'TravelDate',           # Trip date
]

df_days = pd.read_csv(days_dir, sep='\t', usecols=days_columns)

In [None]:
len(df_days)

In [None]:
# Create a list with the unique PSU IDs of the selected areas (df_psu_area)
psu_area_list = df_psu_area['PSUID'].unique().tolist()

## Keep only those days that are within the PSU and were done during "school term-time" (TravelDayType_B01ID = 3)

In [None]:
# Keep only the travel days that (a) were recorded in one of the selected PSUs,
# (b) have a TravelYear between 2011 and 2019 inclusive, and
# (c) have TravelDayType_B01ID == 3 ("school term-time" according to the section heading).
# NOTE(review): the previous comment said 2012-2019, but the filter starts at 2011 - confirm the intended range.
df_days_NTS = df_days.loc[(df_days['PSUID'].isin(psu_area_list)) &
                         (df_days['TravelYear'] >= 2011) &
                         (df_days['TravelYear'] <= 2019) & 
                         (df_days['TravelDayType_B01ID'] == 3)]


In [None]:
len(df_days_NTS)

In [None]:
df_days_NTS['TravelDate'].min()

In [None]:
df_days_NTS['TravelDate'].max()

In [None]:
# Create a list with all persons unique ID values
days_persons_NTS_list = df_days_NTS['IndividualID'].unique().tolist()

In [None]:
len(days_persons_NTS_list)

In [None]:
# Preview only the first IDs: echoing the complete list would flood the notebook output.
days_persons_NTS_list[:10]

In [None]:
# Create a list with all PSU unique ID values
days_PSU_NTS_list = df_days_NTS['PSUID'].unique().tolist()

In [None]:
# Keep only those PSU that belong to days_PSU_NTS_list
df_psu_NTS = df_psu_area.loc[(df_psu_area['PSUID'].isin(days_PSU_NTS_list))]


In [None]:
len(df_psu_NTS)

## Read individuals file

In [None]:
# Location of the tab-separated individuals file.
persons_dir = r''  # use your path

# Columns read from the individuals file.
persons_columns = [
    'SurveyYear',           # survey year
    'IndividualID',         # person unique ID
    'HouseholdID',          # household unique ID
    'PSUID',                # PSU unique ID
    'Age',                  # Age (numeric)
    'Sex_B01ID',            # Sex
    'MarStat_B01ID',        # Marital status
    'DrivLic_B01ID',        # Driving licence
    'CarAccess_B01ID',      # Car access
    'OwnCycle_B01ID',       # Own or use a bicycle until 2017
    'OwnCycleN_B01ID',      # Own or use a bicycle from 2018
    'IndIncome2002_B01ID',  # Individual Income - 2002 bandings - 23 categories
    'EcoStat_B01ID',        # Working status of individual - 11 categories
    'XSOC2010_B02ID',       # Standard Occupational Classification (SOC) - 2010 classification - summary - 9 categories
]

df_persons = pd.read_csv(persons_dir, sep='\t', usecols=persons_columns)

In [None]:
len(df_persons)

In [None]:
# Keep only the persons who have at least one of the selected travel days
# (their IDs were collected from df_days_NTS into days_persons_NTS_list).
# .copy() makes df_persons_NTS an independent frame: later cells rename and overwrite
# its columns in place, which on a bare .loc selection of df_persons raises
# SettingWithCopyWarning and is not guaranteed to modify the intended object.
df_persons_NTS = df_persons.loc[df_persons['IndividualID'].isin(days_persons_NTS_list)].copy()


In [None]:
len(df_persons_NTS)

In [None]:
# Percentage of NTS people (assuming each trip belongs to a different person) and SPENSER people: 
# NOTE(review): both counts are hard-coded literals whose origin is not shown in this notebook,
# so the result does not follow changes to the filters above - confirm the figures before relying on it.
450517/2645517*100

## Check if those individuals selected are within the time and geographical frame:

In [None]:
len(df_persons_NTS.loc[~df_persons_NTS['PSUID'].isin(psu_area_list)])

In [None]:
len(df_persons_NTS.loc[~df_persons_NTS['IndividualID'].isin(days_persons_NTS_list)])

## Start updating column names and attribute values in order to be used later with SPENSER data

In [None]:
# Rename the NTS column codes to the attribute names used in the rest of the notebook.
# The renamed frame is re-assigned instead of using inplace=True: df_persons_NTS was
# obtained as a selection of df_persons, and mutating such a selection in place raises
# SettingWithCopyWarning. Re-assigning also keeps the cell safe to re-run.
df_persons_NTS = df_persons_NTS.rename(
    columns={
        'Sex_B01ID': 'Sex',
        'MarStat_B01ID': 'Marital_status',
        'OwnCycle_B01ID': 'Bike_access2018',   # loaded as "Own or use a bicycle until 2017"
        'DrivLic_B01ID': 'Driving_license',
        'CarAccess_B01ID': 'Car_access',
        'IndIncome2002_B01ID': 'Income',
        'EcoStat_B01ID': 'Economic_activity',
        'XSOC2010_B02ID': 'Occupation',
        'OwnCycleN_B01ID': 'Bike_access2019',  # loaded as "Own or use a bicycle from 2018"
    }
)

In [None]:
df_persons_NTS.dtypes

In [None]:
df_persons_NTS

### Check attributes of each column (values and type):

#### Sex:

In [None]:
#check the values that column "Sex" has:
c = df_persons_NTS['Sex'].unique()
print(sorted(c))

In [None]:
df_persons_NTS['Sex'].dtypes

#### Age:

In [None]:
#check the values that column "Age" has:
d = df_persons_NTS['Age'].unique()
print(sorted(d))

In [None]:
df_persons_NTS['Age'].dtypes

#### Marital status:

In [None]:
#check the values that column "Marital status" has:
e = df_persons_NTS['Marital_status'].unique()
print(sorted(e))

In [None]:
df_persons_NTS.groupby('Marital_status').count()

##### Marital status values need to be updated in order to match them with SPENSER data

In [None]:
# Recode the marital status into the two categories 'Married or couple' and 'Single'.

## 'DNA' was observed to be related to people aged 0 - 15, so they are considered 'Single'.
## 4 people were found with 'NA'. They are considered 'Single' too.

marital_status_update = {
    1: 'Married or couple', # Married and living with spouse
    2: 'Single',            # Separated
    3: 'Single',            # Single
    4: 'Single',            # Divorced
    5: 'Single',            # Widowed
    -9: 'Single',           # 'DNA'
    -8: 'Single'            # 'NA'
}

In [None]:
df_persons_NTS['Marital_status'] = df_persons_NTS['Marital_status'].map(marital_status_update)

In [None]:
# check the unique updated values
e = df_persons_NTS['Marital_status'].unique()
print(sorted(e))

In [None]:
# Check the number of married or couple and single individuals in the dataset
df_persons_NTS.groupby('Marital_status').count()

In [None]:
df_persons_NTS['Marital_status'].dtypes

### Economic activity:

In [None]:
#check the values that column "Economic activity" has:
f = df_persons_NTS['Economic_activity'].unique()
print(sorted(f))

In [None]:
df_persons_NTS.groupby('Economic_activity').count()

#### Economic activity values need to be updated in order to match them with SPENSER data:

In [None]:
# Recode the economic activity into the categories used in the rest of the notebook.

## 'DNA' (-9) was observed to be related to people aged 0 - 15, so they are considered "Inactive Child student".
## NOTE(review): the previous comment spoke of 4 'NA' people being treated as 'Single' (it looks copied from the
## marital-status cell). This dictionary has no entry for 'NA' (-8); codes missing from the dictionary become NaN
## when it is applied with Series.map - confirm whether -8 occurs in this column and how it should be recoded.

economic_activity_update = {
    1: 'Employed',                               # Employees: full-time
    2: 'Employed',                               # Employees: part-time
    3: 'Employed',                               # Self-employed: full-time
    4: 'Employed',                               # Self-employed: part-time
    5: 'Unemployed',                             # ILO unemployed
    6: 'Inactive Retired',                       # Economically inactive: Retired
    7: 'Inactive Student',                       # Economically inactive: Student
    8: 'Inactive Looking after home family',     # Economically inactive: Looking after family / home
    9: 'Inactive Sick',                          # Economically inactive: Permanently sick / disabled
    10: 'Inactive Sick',                         # Economically inactive: Temporarily sick / injured
    11: 'Inactive Other',                        # Economically inactive: Other
    -9: 'Inactive Child student'                 # 'DNA' (individuals aged 0-15)    
}

In [None]:
df_persons_NTS['Economic_activity'] = df_persons_NTS['Economic_activity'].map(economic_activity_update)

In [None]:
# check the unique updated values
f = df_persons_NTS['Economic_activity'].unique()
print(sorted(f))

In [None]:
df_persons_NTS['Economic_activity'].dtypes

In [None]:
# Check the number of individuals in each economic activity category
df_persons_NTS.groupby('Economic_activity').count()

### Income:

In [None]:
#check the values that column "Income" has:
g = df_persons_NTS['Income'].unique()
print(sorted(g))

In [None]:
type(g)

In [None]:
df_persons_NTS.groupby('Income').count()

##### Income values need to be updated in order to match them with SPENSER data:

#### First, generate a random income value for each agent based on the income band each one is located

In [None]:
# Income band code -> one income value inside that band.
# NOTE(review): every random.randint(...) call below is evaluated once, when the dictionary is
# built, so each band gets a single fixed number rather than a fresh draw per person.
# NOTE(review): this dictionary is not referenced again in the visible notebook; the per-person
# 'Income_value' column is filled separately - confirm whether this cell is still needed.
Income_value = {
    1: random.randint(1,999),              # Less than 1000 
    2: random.randint(1000,1999),          # 1000- 1999
    3: random.randint(2000,2999),          # 2000- 2999
    4: random.randint(3000,3999),          # 3000- 3999
    5: random.randint(4000,4999),          # 4000- 4999
    6: random.randint(5000,5999),          # 5000- 5999
    7: random.randint(6000,6999),          # 6000- 6999
    8: random.randint(7000,7999),          # 7000- 7999
    9: random.randint(8000,8999),          # 8000- 8999
    10: random.randint(9000,9999),         # 9000- 9999
    11: random.randint(10000,12499),       # 10000- 12499
    12: random.randint(12500,14999),       # 12500- 14999
    13: random.randint(15000,17499),       # 15000- 17499
    14: random.randint(17500,19999),       # 17500- 19999
    15: random.randint(20000,24999),       # 20000- 24999
    16: random.randint(25000,29999),       # 25000- 29999
    17: random.randint(30000,34999),       # 30000- 34999
    18: random.randint(35000,39999),       # 35000- 39999
    19: random.randint(40000,49999),       # 40000- 49999
    20: random.randint(50000,59999),       # 50000- 59999
    21: random.randint(60000,69999),       # 60000- 69999
    22: random.randint(70000,74999),       # 70000- 74999
    23: random.randint(75000,99999),       # 75000 to 99999
    24: random.randint(100000,124999),     # 100000 to 124999
    25: random.randint(125000,149999),     # 125000 to 149999
    26: random.randint(150000,300000),     # 150000 or more
    -9: 0                                  # 'DNA' (individuals aged 0-15) 

}

##### Generate the random income value in the band

In [None]:
# Inclusive (lowest, highest) income of every income band code.
income_band_bounds = {
    1: (1, 999),             # Less than 1000
    2: (1000, 1999),
    3: (2000, 2999),
    4: (3000, 3999),
    5: (4000, 4999),
    6: (5000, 5999),
    7: (6000, 6999),
    8: (7000, 7999),
    9: (8000, 8999),
    10: (9000, 9999),
    11: (10000, 12499),
    12: (12500, 14999),
    13: (15000, 17499),
    14: (17500, 19999),
    15: (20000, 24999),
    16: (25000, 29999),
    17: (30000, 34999),
    18: (35000, 39999),
    19: (40000, 49999),
    20: (50000, 59999),
    21: (60000, 69999),
    22: (70000, 74999),
    23: (75000, 99999),
    24: (100000, 124999),
    25: (125000, 149999),
    26: (150000, 300000),    # "150000 or more", capped at 300000
}

# Fixed seed so that re-running the notebook reproduces the same synthetic incomes
# (and therefore the same percentiles and income groups derived from them).
INCOME_RANDOM_SEED = 42
random.seed(INCOME_RANDOM_SEED)


def _draw_income(band):
    """Return a random integer income inside `band`, or 0 when the band has no bounds.

    Codes without an entry in income_band_bounds (e.g. -9, 'DNA', individuals aged 0-15)
    get an income of 0, exactly as rows that matched no branch did before.
    """
    bounds = income_band_bounds.get(band)
    if bounds is None:
        return 0
    lowest, highest = bounds
    return random.randint(lowest, highest)


# One draw per person, in row order, written to the column in a single assignment
# instead of updating the frame cell by cell inside an iterrows() loop.
df_persons_NTS['Income_value'] = [_draw_income(band) for band in df_persons_NTS['Income']]

In [None]:
# Select those people that has an income (income > 0)

df_persons_NTS_income = df_persons_NTS.loc[(df_persons_NTS['Income_value'] > 0)]

In [None]:
len(df_persons_NTS_income)

In [None]:
# Quintile boundaries of the income of those people earning more than 0,
# truncated to whole numbers.
positive_incomes = df_persons_NTS_income.Income_value
(percentile_20,
 percentile_40,
 percentile_60,
 percentile_80,
 percentile_100) = (int(positive_incomes.quantile(share))
                    for share in (0.2, 0.4, 0.6, 0.8, 1.0))

In [None]:
print(percentile_20)
print(percentile_40)
print(percentile_60)
print(percentile_80)
print(percentile_100)

In [None]:
# Income per groups

## group_0 contains the people whose income value is 0.
## group_1 contains the 20% of people with the lowest income (excluding those whose income = 0).
## group_5 contains the 20% of people with the highest income.
## The boundaries are the 20th/40th/60th/80th/100th percentiles computed over positive incomes.

income_values = df_persons_NTS['Income_value']

# Conditions are evaluated in order and the first match wins, like the former if/elif chain.
# Rows matching no condition keep the empty string.
group_conditions = [
    income_values == 0.0,
    (income_values > 0) & (income_values <= percentile_20),
    (income_values > percentile_20) & (income_values <= percentile_40),
    (income_values > percentile_40) & (income_values <= percentile_60),
    (income_values > percentile_60) & (income_values <= percentile_80),
    (income_values > percentile_80) & (income_values <= percentile_100),
]
group_labels = ['group_0', 'group_1', 'group_2', 'group_3', 'group_4', 'group_5']

# Vectorised assignment of the whole column instead of writing cell by cell in an iterrows() loop.
df_persons_NTS['Income_group'] = np.select(group_conditions, group_labels, default='')
    


In [None]:
#check the values that column "Income_group" has:
d = df_persons_NTS['Income_group'].unique()
print(sorted(d))

In [None]:
# Check the number of individuals in each income group
df_persons_NTS.groupby('Income_group').count()

In [None]:
df_persons_NTS['Income_group'].dtypes

In [None]:
df_persons_NTS['Income_group'].unique()

### Driving license:

In [None]:
# check the unique updated values
h = df_persons_NTS['Driving_license'].unique()
print(sorted(h))

In [None]:
# Check the number of individuals per driving licence code (before recoding)
df_persons_NTS.groupby('Driving_license').count()

#### Driving license values need to be updated in order to match them with SPENSER data:

In [None]:
# Change values of the driving licence

## 'DNA' was observed that is related to people aged 0 - 15. So they are going to be considered as 'False'.
## There were found 106 'NA' (-8) people. These are going to be considered as 'False' too.

Driving_license_update = {
    1: True,          # Full - car / motorcycle
    2: True,          # Full - car only
    3: True,          # Full - car only (automatic)
    4: True,          # Full - car only (adapted)
    5: False,         # Full - motorcycle only
    6: False,         # Full - moped
    7: True,          # Full - invalid vehicle
    8: True,          # Full - no details
    9: True,          # Provisional - car / motorcycle
    10: True,         # Provisional - car
    11: True,         # Provisional - invalid car
    12: True,         # Provisional - other
    13: True,         # Provisional - no details
    14: False,        # None
    -9: False,        # 'DNA' (individuals aged 0-15) 
    -8: False         # 'NA' 

}

In [None]:
df_persons_NTS['Driving_license'] = df_persons_NTS['Driving_license'].map(Driving_license_update)

In [None]:
# check the unique updated values
h = df_persons_NTS['Driving_license'].unique()
print(sorted(h))

In [None]:
# Check the number of individuals with and without a driving licence
df_persons_NTS.groupby('Driving_license').count()

In [None]:
df_persons_NTS['Driving_license'].dtypes

### Car access:

In [None]:
#check the values that column "Car_access" has:
i = df_persons_NTS['Car_access'].unique()
print(sorted(i))

In [None]:
# Check the number of individuals per car access code (before recoding)
df_persons_NTS.groupby('Car_access').count()

#### Car access values need to be updated in order to match them with SPENSER data:

In [None]:
# Change values of the car access

## 'DNA' (-9) was observed that is related to people aged 0 - 15. So they are going to be considered as 'False'.
## There were found 146 'NA' (-8) people. These are going to be considered as 'False' too.

Car_access_update = {
    1: True,          # Main driver of company car
    2: True,          # Other main driver
    3: True,          # Not main driver of household car
    4: False,          # Household car but non driver
    5: False,         # Driver but no car
    6: False,         # Non driver and no car
    -9: False,        # 'DNA' (individuals aged 0-15) 
    -8: False         # 'NA' 

}

In [None]:
df_persons_NTS['Car_access'] = df_persons_NTS['Car_access'].map(Car_access_update)

In [None]:
# check the unique updated values
i = df_persons_NTS['Car_access'].unique()
print(sorted(i))

In [None]:
# Check the number of individuals with and without car access
df_persons_NTS.groupby('Car_access').count()

In [None]:
df_persons_NTS['Car_access'].dtypes

### Bike access:

In [None]:
#check the values that column "Bike_access2018" has:
j = df_persons_NTS['Bike_access2018'].unique()
print(sorted(j))

In [None]:
# Check the number of individuals per "Bike_access2018" code (before recoding)
df_persons_NTS.groupby('Bike_access2018').count()

In [None]:
#check the values that column "Bike_access2019" has:
k = df_persons_NTS['Bike_access2019'].unique()
print(sorted(k))

In [None]:
# Check the number of individuals per "Bike_access2019" code (before recoding)
df_persons_NTS.groupby('Bike_access2019').count()

#### Bike access (up to 2018) values need to be updated in order to match them with SPENSER data:

In [None]:
# Recode the values of the first bicycle column ('Bike_access2018')

## 'DNA' (-9) was observed to be related to people aged 0 - 15, so they are considered 'False'.
## 'NA' (-8) people are considered 'False' too.
## NOTE(review): the count of 146 'NA' people quoted here before is the same figure given for car access - confirm.
## 'DEAD' (-10) is recoded to the string 'no', which the merge cell below uses as the marker for
## "take the value from the other bicycle column". This makes the recoded column mix booleans and strings.

Bike_access_2018_update = {
    1: True,          # Own a bicycle yourself
    2: True,          # Have use of household bicycle
    3: True,          # Have use of non-household bicycle
    4: False,         # Have no use of a bicycle
    -10: 'no',        # DEAD
    -9: False,        # 'DNA' (individuals aged 0-15) 
    -8: False         # 'NA' 

}

In [None]:
df_persons_NTS['Bike_access2018'] = df_persons_NTS['Bike_access2018'].map(Bike_access_2018_update)

In [None]:
#check the values that column "Bike_access2018" has after recoding:
j = df_persons_NTS['Bike_access2018'].unique()
print((j))

In [None]:
# Check the number of individuals per recoded "Bike_access2018" value
df_persons_NTS.groupby('Bike_access2018').count()

#### Bike access (from 2019) values need to be updated in order to match them with SPENSER data:

In [None]:
# Recode the values of the second bicycle column ('Bike_access2019')

## 'DNA' (-9) was observed to be related to people aged 0 - 15, so they are considered 'False'.
## 'NA' (-8) people are considered 'False' too.
## NOTE(review): the count of 146 'NA' people quoted here before is the same figure given for car access - confirm.
## 'DEAD' (-10) is recoded to the string 'no', which the merge cell below uses as the marker for
## "take the value from the other bicycle column". This makes the recoded column mix booleans and strings.

Bike_access_2019_update = {
    1: True,          # Own a bicycle
    2: True,          # Have regular use of a bicycle owned by someone else
    3: False,         # Have no regular use of a bicycle
    -10: 'no',        # DEAD
    -9: False,        # 'DNA' (individuals aged 0-15) 
    -8: False         # 'NA' 

}

In [None]:
df_persons_NTS['Bike_access2019'] = df_persons_NTS['Bike_access2019'].map(Bike_access_2019_update)

In [None]:
#check the values that column "Bike_access2019" has after recoding:
k = df_persons_NTS['Bike_access2019'].unique()
print((k))

In [None]:
# Check the number of individuals per recoded "Bike_access2019" value
df_persons_NTS.groupby('Bike_access2019').count()

##### Merge both columns into one.

In [None]:
# Merge the two bicycle columns into a single 'Bike_access' column.
# 'no' is the marker given to code -10 (DEAD) when the two columns were recoded.
no_value_2018 = df_persons_NTS['Bike_access2018'].eq('no').to_numpy()
no_value_2019 = df_persons_NTS['Bike_access2019'].eq('no').to_numpy()
values_2018 = df_persons_NTS['Bike_access2018'].to_numpy(dtype=object)
values_2019 = df_persons_NTS['Bike_access2019'].to_numpy(dtype=object)

# Start from an all-missing column so that 'Bike_access' always exists, even when no row
# (or an empty frame) matches either rule below.
bike_access = np.full(len(df_persons_NTS), np.nan, dtype=object)

# Same precedence as the former row-by-row loop:
#   1. 2018 column is 'no'  -> take the 2019 value
#   2. 2019 column is 'no'  -> take the 2018 value (this rule wins when both are 'no')
# Rows where neither column is 'no' stay missing.
# NOTE(review): confirm whether such rows can occur and which column should win if they do.
bike_access = np.where(no_value_2018, values_2019, bike_access)
bike_access = np.where(no_value_2019, values_2018, bike_access)

df_persons_NTS['Bike_access'] = bike_access
        

In [None]:
# Check the number of individuals per "Bike_access" value
df_persons_NTS.groupby('Bike_access').count()

In [None]:
#check the values that column "Bike_access" has:
l = df_persons_NTS['Bike_access'].unique()
print(sorted(l))

In [None]:
df_persons_NTS = df_persons_NTS.replace({'True': True, 'False': False})

In [None]:
df_persons_NTS['Bike_access'].dtypes

### Calculate the total people in household and total children in household in order to identify individuals with children dependency

In [None]:
#Create new columns in the dataframe:

#Column with the total amount of people in the household
df_persons_NTS["Total_People_in_household"] = 0

#Column with the total amoun of children in the household
df_persons_NTS["Total_Children_in_household"] = 0

In [None]:
# Create a list with all Households unique ID values
HouseholdID_list = df_persons_NTS['HouseholdID'].unique().tolist()

# Household composition for every person, computed for all households at once.
# This replaces the nested iterrows() loops that compared every pair of household members,
# printed one line per household and wrote the counts into a selection of the frame.
# Assumes IndividualID is unique per row and HouseholdID has no missing values - TODO confirm.
household_ids = df_persons_NTS['HouseholdID']

# Number of person rows sharing each HouseholdID, repeated on every member's row.
people_in_household = household_ids.groupby(household_ids).transform('count')

# Number of household members younger than 16, repeated on every member's row.
# The flag is cast to int64 so that the group sum is an integer count.
is_child = (df_persons_NTS['Age'] < 16).astype('int64')
children_in_household = is_child.groupby(household_ids).transform('sum')

# Work on a copy so that df_persons_NTS itself is not modified by this cell.
household_composition = df_persons_NTS.copy()
household_composition['Total_People_in_household'] = people_in_household.to_numpy()
household_composition['Total_Children_in_household'] = children_in_household.to_numpy()

# Keep the layout the household-by-household concatenation produced: rows grouped by
# household in order of first appearance, original row order inside each household,
# and a fresh 0..n-1 index.
household_rank = pd.factorize(household_ids)[0]
row_order = np.argsort(household_rank, kind='stable')
df_persons_NE_Household_composition = household_composition.iloc[row_order].reset_index(drop=True)


In [None]:
df_persons_NTS = df_persons_NE_Household_composition

In [None]:
# Column showing if an adult has children dependency.
# It is created as a boolean column holding False on every row.
df_persons_NTS["Children_dependency"] = False

In [None]:
def children_dependency(Age, Total_Children_in_household):
    """Tell whether a person counts as an adult with children dependency.

    Parameters
    ----------
    Age : number
        Age of the person.
    Total_Children_in_household : number
        Number of children counted in the person's household.

    Returns
    -------
    bool
        True when the person is aged 18 or over and the household has at least
        one child; False in every other case (including people younger than 18).
    """
    # Anyone not aged 18 or over can never be flagged.
    if not Age >= 18:
        return False
    return bool(Total_Children_in_household > 0)

In [None]:
# Call children_dependency() for every person to identify which adults have children dependencies.
# Only the two columns the function needs are iterated, which avoids building a full row
# object per person as DataFrame.apply(..., axis=1) does.
df_persons_NTS['Children_dependency'] = [
    children_dependency(age, total_children)
    for age, total_children in zip(df_persons_NTS['Age'],
                                   df_persons_NTS['Total_Children_in_household'])
]

In [None]:
#check the values that column "Children_dependency" has:
m = df_persons_NTS['Children_dependency'].unique()
print(sorted(m))


In [None]:
df_persons_NTS['Children_dependency'].dtypes

### Create a new dataframe containing only those columns that are relevant

In [None]:
# Columns kept in the exported person table, in output order.
relevant_columns = [
    'IndividualID', 'HouseholdID', 'PSUID', 'SurveyYear',
    'Age', 'Sex', 'Marital_status', 'Children_dependency',
    'Total_People_in_household', 'Total_Children_in_household',
    'Economic_activity', 'Occupation', 'Income', 'Income_group',
    'Driving_license', 'Car_access', 'Bike_access',
]

# Note the singular name: df_person_NTS is the reduced table, df_persons_NTS the full one.
df_person_NTS = df_persons_NTS[relevant_columns]

### Check the type of data in each column:

In [None]:
df_person_NTS.dtypes

### Save the dataframe as csv file

In [None]:
# Save the person table as csv file.
# NOTE(review): the destination is an absolute path inside one user's Windows profile, so this
# cell only works on that machine - point it at your own output folder before running.
df_person_NTS_export_20220326_latest = df_person_NTS

df_person_NTS_export_20220326_latest.to_csv(r'C:\Users\b9055315\PhD_project\UK_Data_Service\NTS\Generated_data_from_code\df_person_NTS_export_20220326_latest.csv')

In [None]:
d = df_person_NTS['Income_group'].unique()
print(sorted(d))

In [None]:
# Save the selected travel days as csv file.
# NOTE(review): the destination is an absolute path inside one user's Windows profile, so this
# cell only works on that machine - point it at your own output folder before running.
df_days_NTS_export_20220311 = df_days_NTS

df_days_NTS_export_20220311.to_csv(r'C:\Users\b9055315\PhD_project\UK_Data_Service\NTS\Generated_data_from_code\df_days_NTS_export_20220311.csv')

In [None]:
# Save the selected PSUs as csv file.
# NOTE(review): the destination is an absolute path inside one user's Windows profile, so this
# cell only works on that machine - point it at your own output folder before running.
df_psu_NTS_export_20220311 = df_psu_NTS

df_psu_NTS_export_20220311.to_csv(r'C:\Users\b9055315\PhD_project\UK_Data_Service\NTS\Generated_data_from_code\df_psu_NTS_export_20220311.csv')

In [None]:
df_person_NTS

## Read trip file

In [None]:
# Location of the tab-separated trip file.
trip_dir = r''  # use your path

# Columns read from the trip file: identifiers, number of stages,
# and the main mode / trip purpose code columns.
trip_columns = [
    'SurveyYear',
    'TripID',
    'DayID',
    'IndividualID',
    'HouseholdID',
    'PSUID',
    'NumStages',
    'MainMode_B03ID',
    'MainMode_B11ID',
    'TripPurpose_B02ID',
    'TripPurpose_B04ID',
]

df_trip = pd.read_csv(trip_dir, sep='\t', usecols=trip_columns)


In [None]:
len(df_trip)

In [None]:
# Keep the trips made by the selected individuals, inside the selected PSUs and with a
# SurveyYear between 2011 and 2019 inclusive.
# NOTE(review): trips are matched on IndividualID, not on DayID, so trips made by a selected
# person on a day that the day filter removed are still kept - confirm this is intended.
# NOTE(review): the year filter here uses SurveyYear, whereas the day filter uses TravelYear.
df_trip_NTS = df_trip.loc[(df_trip['IndividualID'].isin(days_persons_NTS_list)) &
                          (df_trip['PSUID'].isin(days_PSU_NTS_list)) &
                         (df_trip['SurveyYear'] >= 2011) &
                         (df_trip['SurveyYear'] <= 2019)]


In [None]:
len(df_trip_NTS)

In [None]:
df_trip_NTS.head(10)

In [None]:
df_trip_NTS['SurveyYear'].min()

In [None]:
df_trip_NTS['SurveyYear'].max()

In [None]:
# Save the data as csv file:
df_trip_NTS_export_20220326 = df_trip_NTS

df_trip_NTS_export_20220326.to_csv(r'') # use your file path

In [None]:
df_trip_NTS.loc[(df_trip_NTS['TripPurpose_B04ID'] == 3)].sort_values('IndividualID')

In [None]:
df_trip_NTS.loc[
               (df_trip_NTS['HouseholdID'] == 2019006333)].sort_values(['IndividualID', 'DayID'], ascending=[True, True])
