In [1]:
# Import packages 
import pandas as pd
import numpy as np
# Set Working Directory
import os
# os.chdir(r'C:\Personal\IMM') # absolute path, using \ and r prefix
# wd = os.getcwd()

import json
from pprint import pprint
import timeit
from decimal import *
import numba
from numba import jit
import traceback

import itertools

import functools
from balsa.matrices import read_mdf, to_mdf, to_fortran, read_fortran_square, read_fortran_rectangle

seed= 12345

# Directory Listing

In [2]:
dirListing = 'c:\\personal\\IMM'
dirListing1 = 'c:\\personal\\IMM\\Offpeak'

dirListing_other_pk = 'c:\\personal\\IMM\\Other Trips\\peak'
fpath_pk = os.listdir(dirListing_other_pk)

dirListing_other_offpk = 'c:\\personal\\IMM\\Other Trips\\offpeak'
fpath_offpk = os.listdir(dirListing_other_offpk)

#### Bring in the ggh zone definitions

In [3]:
# batch in ggh zone numbers and add in two columns for i and j zones
ggh = pd.read_csv(os.path.join(dirListing, "GGH_zones.csv"))
if 'ggh_zone'  not in ggh:
    print("The 'ggh_zone' column does not exist.")

# Batch in the Other Trips i.e. non-hbw/hbs/hbu

In [4]:
# Now append the peak csv file names
BinaryMat_other_pk = []
for item in fpath_pk:
    if item.endswith(".bin"):
        fname = item
        fname = fname.split('.')[0]  # get the name only
        BinaryMat_other_pk.append(fname)

# Now append the off peak csv file names
BinaryMat_other_offpk = []
for item in fpath_offpk:
    if item.endswith(".bin"):
        fname = item
        fname = fname.split('.')[0]  # get the name only
        BinaryMat_other_offpk.append(fname)

In [5]:
BinaryMat_other_pk

['trips_peak_all_modes_hbm_insuff_high',
 'trips_peak_all_modes_hbm_insuff_low',
 'trips_peak_all_modes_hbm_nocar_high',
 'trips_peak_all_modes_hbm_nocar_low',
 'trips_peak_all_modes_hbm_suff_high',
 'trips_peak_all_modes_hbm_suff_low',
 'trips_peak_all_modes_hbo_insuff_high',
 'trips_peak_all_modes_hbo_insuff_low',
 'trips_peak_all_modes_hbo_nocar_high',
 'trips_peak_all_modes_hbo_nocar_low',
 'trips_peak_all_modes_hbo_suff_high',
 'trips_peak_all_modes_hbo_suff_low',
 'trips_peak_all_modes_nhb_all_segments',
 'trips_peak_all_modes_wbo_insuff_high',
 'trips_peak_all_modes_wbo_insuff_low',
 'trips_peak_all_modes_wbo_nocar_high',
 'trips_peak_all_modes_wbo_nocar_low',
 'trips_peak_all_modes_wbo_suff_high',
 'trips_peak_all_modes_wbo_suff_low']

# Bring in the Trips and household file

In [6]:
# set dictionary of Dtypes
dtype_trips = {}
dtype_hh = {}

# dictionary for storing hhold column dtypes
dtype_hh = {'hhid':'int32',
              'taz':'int16',
              'hhinc':'int32',
              'dtype':'int8',
              'hhsize':'int8',
              'nveh':'int8',
              'auto_suff': 'int8',
              'segment': 'int8',
              'segment1': 'int8'}

# dictionary for storing trips column dtypes
dtype_trips = {'hhid':'int32',
              'pid':'int8',
              'tour_id':'int8',
              'subtour_id':'int8',
              'trip_id':'int8',
              'activity_i':'category',
              'activity_j':'category',
              'taz_i': 'int16',
              'taz_j': 'int16',
              'tour_direction':'category',
              'purpose': 'category',
              'trip_direction': 'category',
              'peak_factor': 'float64'}


In [7]:
%%time
trips = pd.read_csv(r"c:\personal\IMM\trips_out.csv")
hh = pd.read_csv(r"c:\personal\IMM\households_out.csv")

Wall time: 25.9 s


In [8]:
%%time
# Add in market segment definition. We need two segments - one that defines the 6
# and second that is a dummy for use in the univ, school, NHB
hh.loc[(hh['hhinc'] <= 60000) & (hh['auto_suff'] == 0), 'segment'] = 1
hh.loc[(hh['hhinc'] > 60000) & (hh['auto_suff'] == 0), 'segment'] = 2
hh.loc[(hh['hhinc'] <= 60000) & (hh['auto_suff'] == 1), 'segment'] = 3
hh.loc[(hh['hhinc'] > 60000) & (hh['auto_suff'] == 1), 'segment'] = 4
hh.loc[(hh['hhinc'] <= 60000) & (hh['auto_suff'] == 2), 'segment'] = 5
hh.loc[(hh['hhinc'] > 60000) & (hh['auto_suff'] == 2), 'segment'] = 6

hh['segment1'] = 1

Wall time: 347 ms


In [9]:
%%time
# set dtypes for trips df
for key, value in dtype_trips.items():
    trips[key] = trips[key].astype(value)

# set dtypes for hhold df
for key, value in dtype_hh.items():
    hh[key] = hh[key].astype(value)

Wall time: 8.53 s


In [10]:
def concat_df(df1, df2, num):

    """
    A function to concatenate two dataframes by columns
    :param dataframe 1
    :param dataframe 2
    :param axis i.e 0 for row and 1 for column
    :return concatenated df
    """
    # once sampled, now concatenate the information back to the household dataframe
    df1.reset_index(drop=True, inplace=True)
    df2.reset_index(drop=True, inplace=True)
    df1 = df1.loc[:,~df1.columns.duplicated()]
    df2 = df2.loc[:,~df2.columns.duplicated()]
    df1 = pd.concat([df1, df2], axis=num)

    return df1

In [11]:
%%time
seed1 = 10
np.random.seed(seed1)
random = pd.DataFrame(np.random.uniform(size = len(trips)))
random.columns = ['rnum']

# attach the random number generator and calculate peak_flag. A value of 1 in this flag
# means that this is a peak period trip record.
trips =concat_df(trips, random, 1)
trips['peak_flag'] = np.where(trips['rnum'] <= trips['peak_factor'], 1, 0)
trips[['peak_flag']] = trips[['peak_flag']].astype('int8')   # save some memory

Wall time: 1.76 s


In [12]:
trips.head()

Unnamed: 0,hhid,pid,tour_id,subtour_id,trip_id,activity_i,activity_j,taz_i,taz_j,tour_direction,purpose,trip_direction,peak_factor,rnum,peak_flag
0,1,0,0,-1,0,home,work,1001,1015,outbound,HBW,outbound,0.777,0.771321,1
1,1,0,0,-1,1,work,home,1015,1001,inbound,HBW,inbound,0.777,0.020752,1
2,2,0,0,-1,0,home,work,1001,1035,outbound,HBW,outbound,0.777,0.633648,1
3,2,0,0,-1,1,work,home,1035,1001,inbound,HBW,inbound,0.777,0.748804,1
4,2,0,0,0,0,work,business,1035,0,outbound,WBO,outbound,0.603,0.498507,1


In [13]:
trips1 = trips.loc[(trips['purpose'] == 'HBM') & (trips['peak_flag'] == 1)]
trips1.head()

Unnamed: 0,hhid,pid,tour_id,subtour_id,trip_id,activity_i,activity_j,taz_i,taz_j,tour_direction,purpose,trip_direction,peak_factor,rnum,peak_flag
86,18,0,0,-1,2,shop,home,0,1001,inbound,HBM,inbound,0.392,0.303063,1
117,25,0,0,-1,4,shop,home,0,1001,inbound,HBM,inbound,0.281,0.178903,1
162,34,0,1,-1,0,home,shop,1001,0,outbound,HBM,outbound,0.281,0.030685,1
205,43,0,1,-1,0,home,shop,1001,0,outbound,HBM,outbound,0.281,0.257303,1
266,57,0,0,-1,3,shop,home,0,1001,inbound,HBM,inbound,0.281,0.046739,1


# This block of code was created to see if time can be saved in processing the peak factor.

In [206]:
mand = ['work', 'school', 'university']

In [221]:
test = trips.loc[(trips['activity_i'] == 'home') & (trips['activity_j'].isin (mand)) | (trips['activity_j'] == 'home') ]
test.shape

(12915016, 13)

In [223]:
test1 = test.loc[test['hhid'] < 10000]
test1.shape

(34072, 13)

In [224]:
seed1 = 10
np.random.seed(seed1)
random = pd.DataFrame(np.random.uniform(size = len(test1)))
random.columns = ['rnum']

# attach the random number generator and calculate peak_flag. A value of 1 in this flag
# means that this is a peak period trip record.
test1 =concat_df(test1, random, 1)
test1['peak_flag'] = np.where((1 - test1['peak_factor']) > test1['rnum'], 0, 1)
test1[['peak_flag']] = test1[['peak_flag']].astype('int8')   # save some memory

home_end = ['home']
trips['peak_flag'] = np.where((~trips['activity_i'].isin(home_end)) & (~trips['activity_j'].isin(home_end)), 10, trips['peak_flag'])

## Code for fixing Peak Consistency

In [14]:
%%time

test1 = trips.loc[trips['hhid'] < 19]
loop = "Close"
start_peak_flag = None

           
for index, current_row in test1.iterrows():        
    if ((current_row['activity_i'] == 'home') & (current_row['activity_j'] in ['work','school', 'univerity'])):
        loop = "Start"
        start_peak_flag = current_row['peak_flag']
        

    if ((current_row['activity_j'] == 'home')):
        if (loop == "Start"):
            test1.set_value(index, 'peak_flag', start_peak_flag)
            loop = "Close"
            start_peak_flag = None




Wall time: 41 ms


## Code to transfer zonal information down the chain

In [292]:
%%time
## row level taz_i and j cleanup

for index, current_row in test.iterrows():        
    if (current_row['taz_i'] != 0 & current_row['taz_j'] ==0 ) :                
        new_taz_j = np.random.randint(0,10000)
        test.iat[index,next_row.columns.get_loc('taz_j')] = new_taz_j
        
        
    if (current_row['taz_i'] == 0) & (current_row['taz_j'] == 0):
        test.iat[index,next_row.columns.get_loc('taz_i')] = new_taz_j        
        new_taz_j = np.random.randint(0,10000)
        test.iat[index,next_row.columns.get_loc('taz_j')] = new_taz_j           
            
    if (current_row['taz_i'] == 0) & (current_row['taz_j'] != 0):
        test.iat[index,next_row.columns.get_loc('taz_i')] = new_taz_j        
    

Wall time: 7.14 s


In [173]:
test1.set_value(index, 'peak_flag', start_peak_flag)

g


In [148]:
row[1]['activity_i']

'business'

# Build the peak and non-peak trips dataframe dictionary

### Functions

In [143]:
### Function to convert from bin to dataframe and set indices. Also unstack the dataframe and rename columns
def convert_df(location, name, nzones):
    '''
    
    '''
    # read in the fortran dataframe and then subset it for the internal zones
    # in the GGH.
    df = read_fortran_rectangle(os.path.join(location, name + ".bin"), n_columns = 4000, tall = False, reindex_rows = False, fill_value = None)
    df1 = pd.DataFrame(df).iloc[:nzones, :nzones]
    
    # set column and row indices
    df1.rename(columns = ggh['ggh_zone'], inplace = True )
    df1.set_index(ggh['ggh_zone'], inplace = True)
    
    # Now unstack and rename columns
    df1 = df1.unstack().reset_index()
    df1.columns = ['origin', 'destination', 'trips']
    
    # dictionary of market segment key and values
    market_seg_def = {
        'nocar_low': 1,
        "nocar_high": 2,
        "insuff_low": 3,
        "insuff_high": 4,
        "suff_low": 5,
        "suff_high": 6,
        "all_segments": 10
    }

    # Remove zero trips and add in market segmentation and peak-offpeak flag
    df1 = df1.loc[df1['trips'] != 0]
    segment = name.split('_')
    s1 = segment[5] + '_' + segment[6]
    df1['market_seg'] = s1
    df1['mseg'] = df1['market_seg'].map(market_seg_def)
    df1['period'] = segment[1]
    df1.drop('market_seg', axis=1, inplace=True)
    
    # Also add in the rounder up trips values
    df1['wholetrips'] = round(df1['trips']).astype(int)
    df1 = df1.loc[df1['wholetrips'] > 0]
           
    
    return(df1)

In [142]:
# set dictionary of Dtypes
df_trips_structure = {}

# dictionary for storing column dtypes
df_trips_structure = {'origin':'int16',
              'destination':'int16',
              'trips':'float32',
              'mseg': 'int16',
              'wholetrips': 'int16',
              'period': 'category'}

In [163]:
%%time
## peak period dataframe for Other Trips
other_pk = {}
other_offpk = {}
all_other = {}

for name in BinaryMat_other_pk:
    all_other[name] = convert_df(dirListing_other_pk, name, 3262)
    print(name)
    
    # reset column types
    for key, value in df_trips_structure.items():
        all_other[name][key] = all_other[name][key].astype(value)

for name in BinaryMat_other_offpk:
    all_other[name] = convert_df(dirListing_other_offpk, name, 3262)
    print(name)
    
    # reset column types
    for key, value in df_trips_structure.items():
        all_other[name][key] = all_other[name][key].astype(value)



trips_peak_all_modes_hbm_insuff_high
trips_peak_all_modes_hbm_insuff_low
trips_peak_all_modes_hbm_nocar_high
trips_peak_all_modes_hbm_nocar_low
trips_peak_all_modes_hbm_suff_high
trips_peak_all_modes_hbm_suff_low
trips_peak_all_modes_hbo_insuff_high
trips_peak_all_modes_hbo_insuff_low
trips_peak_all_modes_hbo_nocar_high
trips_peak_all_modes_hbo_nocar_low
trips_peak_all_modes_hbo_suff_high
trips_peak_all_modes_hbo_suff_low
trips_peak_all_modes_nhb_all_segments
trips_peak_all_modes_wbo_insuff_high
trips_peak_all_modes_wbo_insuff_low
trips_peak_all_modes_wbo_nocar_high
trips_peak_all_modes_wbo_nocar_low
trips_peak_all_modes_wbo_suff_high
trips_peak_all_modes_wbo_suff_low
trips_offpeak_all_modes_hbm_insuff_high
trips_offpeak_all_modes_hbm_insuff_low
trips_offpeak_all_modes_hbm_nocar_high
trips_offpeak_all_modes_hbm_nocar_low
trips_offpeak_all_modes_hbm_suff_high
trips_offpeak_all_modes_hbm_suff_low
trips_offpeak_all_modes_hbo_insuff_high
trips_offpeak_all_modes_hbo_insuff_low
trips_offpeak

# Code that slices dictionaries

In [29]:
# Home-based market peak
n=6
hbm_peak_allsegs = {k: other_pk[k] for k in list(other_pk.keys())[:n]}
hbm_peak_allsegs = pd.concat(hbm_peak_allsegs.values(), ignore_index=True)
hbm_peak_allsegs = interchange_reduction(hbm_peak_allsegs)

# Home-based other peak
n=12
hbo_peak_allsegs = {k: other_pk[k] for k in list(other_pk.keys())[6:n]}
hbo_peak_allsegs = pd.concat(hbo_peak_allsegs.values(), ignore_index=True)
hbo_peak_allsegs = interchange_reduction(hbo_peak_allsegs)

# NHB
n=13
nhb_peak_allsegs = {k: other_pk[k] for k in list(other_pk.keys())[12:n]}
nhb_peak_allsegs = pd.concat(nhb_peak_allsegs.values(), ignore_index=True)
nhb_peak_allsegs = interchange_reduction(nhb_peak_allsegs)

# Work-based other
n=19
wbo_peak_allsegs = {k: other_pk[k] for k in list(other_pk.keys())[13:n]}
wbo_peak_allsegs = pd.concat(wbo_peak_allsegs.values(), ignore_index=True)
wbo_peak_allsegs = interchange_reduction(wbo_peak_allsegs)

In [68]:
trips_hhold = pd.merge(trips, hh, on = 'hhid', how = 'left')

In [792]:
test = trips_hhold.loc[(trips['hhid'] < 10000) & (trips['purpose'] != 'HBE') ]
# test = trips_hhold.loc[trips['hhid'] == 164]
print(test.shape)
test

(54561, 23)

In [681]:
prng = np.random.RandomState(3)
prng

<mtrand.RandomState at 0xd1dc0510>

In [787]:
def first_dest(trips, current_row, all_othertrips):
    """
    
    """
    # check if input is not a series and make it into one.
    if isinstance(current_row, pd.DataFrame):
        current_row = current_row.T.squeeze()
    
    # create the key using the purpose and market segment. Then get the dataframe that belongs to the key
    # Now subset the dataframe by origin being evaluated. This is required for accurately sampling
    df_choose_flag = current_row['purpose'] + '_' + str(current_row['segment'])
    t1 = all_othertrips[dict.get(df_choose_flag)]
    zone_loc = current_row['taz_i']    # save current_row origin taz
    t1_loc = t1.loc[t1['origin'] == zone_loc]
    
    
    # if the t1_loc df is empty it means that the Dest Choice model did not produce a trip from that origin
    # for the origin, time period, and market segment in question. One needs to pick another zone
    counter = 1
    while len(t1_loc) == 0:
        
        newzone_loc = zone_loc+counter
        t1_loc = t1.loc[t1['origin'] == newzone_loc]
        counter += 1

    # sample for a destination 
    sampled_dest = t1_loc.sample(n=1, weights=t1['wholetrips'],  replace=True, random_state = prng)
    return sampled_dest.iat[0,1]

### dictionary of TripPurpo

In [417]:
dict = {
    'HBM_1': 'trips_peak_all_modes_hbm_nocar_low',
    'HBM_2': 'trips_peak_all_modes_hbm_nocar_high',
    'HBM_3': 'trips_peak_all_modes_hbm_insuff_low',
    'HBM_4': 'trips_peak_all_modes_hbm_insuff_high',
    'HBM_5': 'trips_peak_all_modes_hbm_suff_low',
    'HBM_6': 'trips_peak_all_modes_hbm_suff_high',
    'HBO_1': 'trips_peak_all_modes_hbo_nocar_low',
    'HBO_2': 'trips_peak_all_modes_hbo_nocar_high',
    'HBO_3': 'trips_peak_all_modes_hbo_insuff_low',
    'HBO_4': 'trips_peak_all_modes_hbo_insuff_high',
    'HBO_5': 'trips_peak_all_modes_hbo_suff_low',
    'HBO_6': 'trips_peak_all_modes_hbo_suff_high',
    'NHB_1': 'trips_peak_all_modes_nhb_all_segments',
    'NHB_2': 'trips_peak_all_modes_nhb_all_segments',
    'NHB_3': 'trips_peak_all_modes_nhb_all_segments',
    'NHB_4': 'trips_peak_all_modes_nhb_all_segments',
    'NHB_5': 'trips_peak_all_modes_nhb_all_segments',
    'NHB_6': 'trips_peak_all_modes_nhb_all_segments',
    'WBO_1': 'trips_peak_all_modes_wbo_nocar_low',
    'WBO_2': 'trips_peak_all_modes_wbo_nocar_high',
    'WBO_3': 'trips_peak_all_modes_wbo_insuff_low',
    'WBO_4': 'trips_peak_all_modes_wbo_insuff_high',
    'WBO_5': 'trips_peak_all_modes_wbo_suff_low',
    'WBO_6': 'trips_peak_all_modes_wbo_suff_high'    
}

In [793]:
%%time
## row level taz_i and jassignment

for index, current_row in test.iterrows():  
    
    # this first condition is the start of the loop
    if (current_row['taz_i'] > 0) & (current_row['taz_j'] == 0) : 
        
        # get the zone by sampling from the requisite trip purpose, time period and market segment 
        # and assign as the destination
        new_taz_j = first_dest(test, current_row, all_other)
        test.set_value(index,'taz_j', new_taz_j)
    
    # this is the next condition the destination is known but not the origin
    # the destination from the previous row becomes the origin
    if (current_row['taz_i'] == 0) & (current_row['taz_j'] != 0):
        test.set_value(index,'taz_i', new_taz_j) 
        
    # final condition where neither origin or destination is defined
    # in this case the destination from the previous row becomes the origin
    # and a destination is sampled from the requisite trip purpose, time period and market segment
    if (current_row['taz_i'] == 0) & (current_row['taz_j'] == 0):

        test.set_value(index,'taz_i', new_taz_j) 
        test_current = test.loc[[index]]

        new_taz_j = first_dest(test, test_current, all_other)
        test.set_value(index,'taz_j', new_taz_j)           
            
 

Wall time: 1min 7s


In [794]:
test

Unnamed: 0,hhid,pid,tour_id,subtour_id,trip_id,activity_i,activity_j,taz_i,taz_j,tour_direction,...,rnum,peak_flag,taz,hhinc,dtype,hhsize,nveh,auto_suff,segment,segment1
0,1,0,0,-1,0,home,work,1001,1015,outbound,...,0.771321,1,1001,110000,5,1,1,2,6,1
1,1,0,0,-1,1,work,home,1015,1001,inbound,...,0.020752,1,1001,110000,5,1,1,2,6,1
2,2,0,0,-1,0,home,work,1001,1035,outbound,...,0.633648,1,1001,36000,6,1,1,2,5,1
3,2,0,0,-1,1,work,home,1035,1001,inbound,...,0.748804,1,1001,36000,6,1,1,2,5,1
4,2,0,0,0,0,work,business,1035,4169,outbound,...,0.498507,1,1001,36000,6,1,1,2,5,1
5,2,0,0,0,1,business,other,4169,4149,inbound,...,0.224797,1,1001,36000,6,1,1,2,5,1
6,2,0,0,0,2,other,other,4149,4129,inbound,...,0.198063,1,1001,36000,6,1,1,2,5,1
7,2,0,0,0,3,other,other,4129,4169,inbound,...,0.760531,0,1001,36000,6,1,1,2,5,1
8,2,0,0,0,4,other,work,4169,1035,inbound,...,0.169111,1,1001,36000,6,1,1,2,5,1
9,4,0,0,-1,0,home,work,1001,1129,outbound,...,0.088340,1,1001,116000,5,3,1,1,4,1


In [755]:
current_row = test.iloc[0]
df_choose_flag = current_row['purpose'] + '_' + current_row['segment'].astype(str) 
df_get_dict = dict.get(df_choose_flag)

t1 = all_other[dict.get(df_choose_flag)]
zone_loc = current_row['taz_i']    # save current_row origin taz
t1_loc = t1.loc[t1['origin'] == zone_loc]

counter = 1
while len(t1_loc) == 0:
        
        newzone_loc = zone_loc+counter
        t1_loc = t1.loc[t1['origin'] == newzone_loc]
        counter += 1
# if len(t1_loc)== 0:
#     t1_loc = t1.loc[t1['origin']==1003]

sample_rand = t1_loc.sample(n=1, weights=t1['wholetrips'], random_state=seed, replace=True)
sample_rand.iat[0,1]




1081

In [754]:
newzone_loc

1003

In [751]:
t1_loc

Unnamed: 0,origin,destination,trips,mseg,period,wholetrips
6534,1003,1011,0.635713,2,peak,1
6535,1003,1012,0.550162,2,peak,1
6604,1003,1081,0.637753,2,peak,1


In [94]:
number_samples = int(df['trips'].sum()-df1['RevTrips'].sum())
sample_df = df1.sample(n=number_samples, replace=True, weights=df1['RevTrips'], random_state=12345)
sample_df = sample_df[['origin', 'destination', 'period', 'RevTrips']]
sample_df['RevTrips'] = 1
sample_df.head()

Unnamed: 0,origin,destination,period,RevTrips
9260394,8600,8600,peak,1
2228623,2053,2047,peak,1
1347598,1414,1393,peak,1
1553198,1477,1487,peak,1
4972783,4086,4057,peak,1


In [None]:
for k,v in other_pk.items():
    print(len(v.groupby(['origin', 'destination']).size()))

# Bucket round

In [14]:
def bucket_rounding(df, newround_f, residual_f):
    """
    This function bucket rounds a dataframe given a set of values in a column.
    
    Arguments: dataframe, first rounded value, and first residual
    """
    newround_f.append((df['trips'].values[i] + residual_f[i-1]).round())
    residual_f.append(df['trips'].values[i] + residual_f[i-1] - newround_f[i]) 
    
    return (newround_f)

In [35]:
#other_pk1 = dict(list(other_pk.items())[:1])

### Constrained over the entire dataframe

#### Peak Period

In [50]:
%%time
# collect the dataframes
finaldf_other_pk = {}

# run FOR loop for each of the dataframes in the dictionary
for name in other_pk.keys():
        
    # get df from dictionary
    finaldf_other_pk[name] = other_pk[name]
      
    #create empty lists
    newround_pk_f = []
    newround_pk_s = []
    residual_pk_f = []
    residual_pk_s = []
        
    # get the first row values
    newround_pk_f = [finaldf_other_pk[name].iat[0,2].round()]
    residual_pk_f = [finaldf_other_pk[name].iat[0,2] - newround_pk_f[0]]
    
    for i in range(1, len(finaldf_other_pk[name].index)):
                            
        # carry out bucket rounding
        bucket_rounding(finaldf_other_pk[name], newround_pk_f, residual_pk_f)           
    
    # cbind the final trips and only keep rows greater than zero.
    finaldf_other_pk[name]['finaltrips'] = newround_pk_f


Wall time: 19min 8s


In [51]:
for k, v in finaldf_other_pk.items():
    print(repr(finaldf_other_pk[k]['trips'].sum()), finaldf_other_pk[k]['finaltrips'].sum())

188397.98 188398.0
79616.758 79617.0
18276.752 18277.0
50443.402 50443.0
172711.42 172712.0
110728.1 110728.0
406142.56 406143.0
161784.52 161785.0
40770.273 40770.0
107369.13 107369.0
379535.31 379535.0
224161.11 224161.0
933184.5 933185.0
728631.88 728632.0
186148.64 186149.0
61102.043 61102.0
62906.723 62907.0
670073.19 670073.0
252038.36 252038.0


#### Off-peak period

In [54]:
%%time
# collect the dataframes
finaldf_other_offpk = {}

# run FOR loop for each of the dataframes in the dictionary
for name in other_offpk.keys():
        
    # get df from dictionary
    finaldf_other_offpk[name] = other_offpk[name]
    
    #create empty lists
    newround_pk_f = []
    newround_pk_s = []
    residual_pk_f = []
    residual_pk_s = []
        
    # get the first row values
    newround_pk_f = [finaldf_other_offpk[name].iat[0,2].round()]
    residual_pk_f = [finaldf_other_offpk[name].iat[0,2] - newround_pk_f[0]]
    
    for i in range(1, len(finaldf_other_offpk[name].index)):
                            
        # carry out bucket rounding
        bucket_rounding(finaldf_other_offpk[name], newround_pk_f, residual_pk_f)           
    
    # cbind the final trips and only keep rows greater than zero.
    finaldf_other_offpk[name]['finaltrips'] = newround_pk_f

Wall time: 17min 19s


In [55]:
for k, v in finaldf_other_offpk.items():
    print(repr(finaldf_other_offpk[k]['trips'].sum()), finaldf_other_offpk[k]['finaltrips'].sum())

354075.0 354075.0
167898.19 167898.0
36202.254 36202.0
116856.59 116857.0
318137.47 318137.0
232781.91 232782.0
688581.63 688581.0
274292.41 274292.0
59954.723 59955.0
164714.03 164714.0
643470.63 643471.0
380047.13 380047.0
1310048.8 1310048.0
479712.84 479713.0
122555.43 122555.0
40227.992 40228.0
41416.25 41416.0
441158.72 441159.0
165935.7 165936.0


In [12]:
### Function that acts upon each of the df in the folder
def expand_df(dfrepeat, colsrepeat):
    '''
    This function prepares every dataframe in the folder repeating the dataframe \
    rows using a user defined column. 
    
    Arguments: Dataframe and column that contains value to repeat rows
    
    Return: expanded dataframe 
    
    '''
    if (dfrepeat[colsrepeat].dtype == np.float64):
        dfrepeat[colsrepeat] = dfrepeat[colsrepeat].astype(int)
        df1 = dfrepeat.loc[np.repeat(dfrepeat.index.values, dfrepeat[colsrepeat])]
            
    return(df1)

In [16]:
%%time
# Merge the hholds info to the trips. By doing so, we can bring in a bunch of household attributes
# including income, dwelling type, size, number of vehicles, and auto_sufficiency. Add in an integer 
# definition for one of six market segments.
trips_hhold = pd.merge(trips, hh, how = 'left', left_on = 'hhid', right_on = 'hhid')

Wall time: 7.96 s


In [None]:
dirListing1 = 'c:\\personal\\IMM\\Other Trips\\offpeak'
hbm = read_fortran_rectangle(os.path.join(dirListing1, "trips_offpeak_all_modes_hbm_insuff_high.bin"), n_columns = 4000, tall = False, reindex_rows = False, fill_value = None)


In [None]:
hbm1 = pd.DataFrame(hbm).iloc[:3262, :3262]
#hbm1 = hbm1.iloc[:3262, :3262]
hbm1.shape

In [None]:
hbm1.rename(columns = ggh['ggh_zone'], inplace = True )
hbm1.set_index(ggh['ggh_zone'], inplace = True)

In [None]:
hbm1

In [None]:
hbm1.head()

# Now do for HBW, HBS, and HBU

In [25]:
# batch in ggh zone numbers and add in two columns for i and j zones
ggh['key'] = 0
# make a copy of the df and create square matrix
ggh1 = ggh
ggh2= pd.merge(ggh1, ggh, how='left', on = 'key')

In [26]:
#ggh = ggh.assign(taz_i = ggh['ggh_zone'], taz_j = ggh['ggh_zone'])

In [27]:
%%time
# Merge the hholds info to the trips. By doing so, we can bring in a bunch of household attributes
# including income, dwelling type, size, number of vehicles, and auto_sufficiency. Add in an integer 
# definition for one of six market segments.
trips_hhold = pd.merge(trips, hh, how = 'left', left_on = 'hhid', right_on = 'hhid')

Wall time: 7.39 s


In [28]:
# The trips_out file contains a peak hour factor column that decides whether a trip is sampled
# in the peak or off-peak period. In order to discretely select the peak records and vice-versa
# an uniform random number generator is run and the values are attached to the trips_out file.
# If the (1-peak_factor) value in the record is greater than that of the random value than 
# the record is in the off-peak and vice-versa.

np.random.seed(seed)
random = pd.DataFrame(np.random.uniform(size = len(trips_hhold)))
random.columns = ['rnum']

# attach the random number generator and calculate peak_flag. A value of 1 in this flag
# means that this is a peak period trip record.
trips_hhold = pd.concat([trips_hhold, random], axis = 1)
trips_hhold['peak_flag'] = np.where((1 - trips_hhold['peak_factor']) > trips_hhold['rnum'], 0, 1)
trips_hhold[['peak_flag']] = trips_hhold[['peak_flag']].astype('int8')   # save some memory

In [86]:
%%time


for purpose in mand_purposes:
    
    # because the school and university purposes don't have any market segmentation, set it to 0.
    if purpose in education:
        mand_only = trips_hhold.loc[(trips_hhold['purpose'] == purpose)]
        mand_only['segment'] = 0   # set this to a defauly market segment of 0
    else:
        mand_only = trips_hhold.loc[(trips_hhold['purpose'] == purpose)]    
    
    # now loop over the peak periods
    for peak in range(0,2):  
        
        timeperiod_df = mand_only.loc[mand_only['peak_flag'] == peak]
        timeperiod_df = timeperiod_df.groupby(['taz_i', 'taz_j','purpose', 'segment']).size().reset_index(name = 'freq')
        
        # now loop over the segments
        for segment in timeperiod_df['segment'].unique():
            
            # create filename and then groupby
            # only keep relevant cols and set a flag
            # Merge the ggh zones and the trip list and convert to wide format
            
            fname = purpose + "_" + str(segment)           
            df_hbw = timeperiod_df.loc[timeperiod_df['segment'] == segment]
            df_hbw = df_hbw[['taz_i', 'taz_j']]
            df_hbw['probflag'] = 1
           
            # this merge is not necessary, but I am being on the safe side and bringing in the equiv file we have in TRESO-code
            df_hbw1 = pd.merge(ggh2, df_hbw, how = "left", left_on = ['ggh_zone_x', 'ggh_zone_y'], right_on = ['taz_i', 'taz_j'])
            df_hbw2 = df_hbw1.pivot_table(index = 'ggh_zone_x', columns = 'ggh_zone_y', values = 'probflag', fill_value = 0)

            to_fortran(df_hbw2, os.path.join(dirListing, fname + ' peak_flag ' + str(peak) + '.bin'), n_columns = 4000)

# # Now do it for HBU and HBS
# ed_only = trips_hhold.loc[(trips_hhold['purpose'] == 'HBS') | (trips_hhold['purpose'] == 'HBU')]
# gp_ed = ed_only.groupby(['purpose', 'segment1'])


# for name, data_SchUniv in gp_ed:
#     # create filename and then groupby
#     # only keep relevant cols and set a flag
#     # Merge the ggh zones and the trip list and convert to wide format
#     fname = name[0] + "_" + str(name[1])
#     df = data_SchUniv.groupby(['taz_i', 'taz_j']).size().reset_index()
#     df = df[['taz_i', 'taz_j']]
#     df['flag'] = 1
#     df1 = pd.merge(ggh2, df, how = "left", left_on = ['ggh_zone_x', 'ggh_zone_y'], right_on = ['taz_i', 'taz_j'])
#     df2 = df1.pivot_table(index = 'ggh_zone_x', columns = 'ggh_zone_y', values = 'flag', fill_value = 0)
    
#     to_fortran(df2, os.path.join(dirListing, fname + '.bin'), n_columns = 4000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Wall time: 2min 59s
