In [25]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [26]:
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('punkt_tab')
from collections import Counter
from nltk import bigrams
from pathlib import Path
from typing import Dict, List

# Data paths
DATA_DIR = Path(r"C:\\Box\\Modeling and Surveys\\Surveys\\Travel Diary Survey\\BATS_2023\\Versioned_Data\\PreWeight_PreLink_MonToSun_20250610")
DATASET_GUIDE = "bats_dataset_guide.html"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\schildress\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\schildress\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [27]:

person_file = DATA_DIR / "person.csv"
trip_file = DATA_DIR / "trip.csv"

person_df = pd.read_csv(person_file)
trip_df = pd.read_csv(trip_file)


In [28]:
trip_df.shape

(373406, 103)

In [29]:
trip_df['mode_other_specify'].shape
trip_df['mode_other_specify'].isna().sum()
trip_df['mode_other_specify'].head(10)

(373406,)

np.int64(371774)

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
Name: mode_other_specify, dtype: str

In [30]:
mode_other_df = trip_df[trip_df['mode_other_specify'].notna()]

mode_other_df.shape  # how many have mode_other_specify filled in?
mode_other_df['mode_other_specify'].value_counts()

(1632, 103)

mode_other_specify
bus                                     132
car                                     124
tour bus                                 84
work truck                               47
my car                                   30
                                       ... 
ar did not leave the house that day.      1
car.                                      1
used my car to drive.                     1
i                                         1
ascensor                                  1
Name: count, Length: 480, dtype: int64

In [31]:
mode_other_df['text_clean'] = (
    mode_other_df['mode_other_specify']
    .str.lower()
    .str.strip()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [32]:
mode_other_df['text_clean'].value_counts().head(20)

text_clean
car                           141
bus                           139
tour bus                       95
my car                         59
work truck                     52
sf muni transit                39
walked                         32
school bus                     32
sf muni trasit                 30
san francisco muni transit     25
work vehicle                   20
metro                          19
bart                           18
other                          17
none                           17
work van                       14
public bus                     14
cars                           14
tren bart                      14
bart train                     13
Name: count, dtype: int64

In [33]:
mode_other_df['tokens'] = mode_other_df['text_clean'].apply(nltk.word_tokenize)


In [34]:
mode_other_df['tokens']

113            [i, used, my, own, car]
114            [i, used, my, own, car]
115       [traveled, in, my, own, car]
116       [traveled, in, my, own, car]
119            [i, used, my, own, car]
                      ...             
371119                           [bus]
371120                           [bus]
371211                     [snowshoes]
371212                     [snowshoes]
372832                      [ascensor]
Name: tokens, Length: 1632, dtype: object

In [35]:
# Flatten all tokens into one list
all_words = []
for token_list in mode_other_df['tokens']:
    all_words.extend(token_list)

word_counts = Counter(all_words)
pd.DataFrame(word_counts.most_common(30), columns=['word', 'count'])

Unnamed: 0,word,count
0,bus,364
1,car,333
2,i,188
3,to,180
4,my,149
5,muni,118
6,bart,115
7,work,107
8,the,98
9,tour,96


In [36]:
all_bigrams = []
for token_list in mode_other_df['tokens']:
    all_bigrams.extend(list(bigrams(token_list)))

bigram_counts = Counter(all_bigrams)
print("Top 20 two-word phrases:")
pd.DataFrame(bigram_counts.most_common(20), columns=['bigram', 'count'])

Top 20 two-word phrases:


Unnamed: 0,bigram,count
0,"(tour, bus)",95
1,"(my, car)",83
2,"(sf, muni)",69
3,"(muni, transit)",64
4,"(work, truck)",52
5,"(school, bus)",40
6,"(muni, trasit)",30
7,"(own, car)",25
8,"(san, francisco)",25
9,"(francisco, muni)",25


## Language Detection

Let's check how many responses are in languages other than English.

In [37]:
# Simple check: look for non-ASCII characters (indicates non-English like Chinese, etc.)
def has_non_ascii(text):
    if pd.isna(text):
        return False
    try:
        text.encode('ascii')
        return False
    except UnicodeEncodeError:
        return True

mode_other_df['has_non_ascii'] = mode_other_df['mode_other_specify'].apply(has_non_ascii)

print(f"Responses with non-ASCII characters: {mode_other_df['has_non_ascii'].sum()}")
print(f"Percentage: {mode_other_df['has_non_ascii'].sum() / len(mode_other_df) * 100:.1f}%")

Responses with non-ASCII characters: 34
Percentage: 2.1%


In [38]:
# Show examples of non-ASCII responses
non_ascii_responses = mode_other_df[mode_other_df['has_non_ascii']]
print(f"\nExamples of non-ASCII responses ({len(non_ascii_responses)} total):")
non_ascii_responses['mode_other_specify'].value_counts().head(20)


Examples of non-ASCII responses (34 total):


mode_other_specify
拼車                                      6
autobús contra costa                    6
ihss护工的车                                4
到达目的地                                   3
friend‘s car                            2
轻轨                                      2
轻轨\n                                    2
我只是在户外歩行锻练                              2
autobús escolar                         1
autobús para uso traporte médico        1
autobús schoolar                        1
autobús schoolar yellow bus             1
xe nhà                                  1
問題有誤，我已經選擇乘自己的車，還要間出行方式，自相矛盾. 無法完成答題    1
ihss护工的车接送                              1
Name: count, dtype: int64

In [39]:
# Check for Spanish keywords (using ASCII characters)
spanish_keywords = ['tren', 'carro', 'autobus', 'autobús', 'caminando', 'caminar', 
                    'bicicleta', 'metro', 'ascensor', 'trabajo', 'mi', 'casa']

def likely_spanish(text):
    if pd.isna(text):
        return False
    text_lower = str(text).lower()
    # Check if any Spanish keyword appears
    return any(keyword in text_lower for keyword in spanish_keywords)

mode_other_df['likely_spanish'] = mode_other_df['mode_other_specify'].apply(likely_spanish)

# Note: 'metro' could be English too, so this is not perfect
print(f"\nResponses with Spanish keywords: {mode_other_df['likely_spanish'].sum()}")
print(f"Percentage: {mode_other_df['likely_spanish'].sum() / len(mode_other_df) * 100:.1f}%")


Responses with Spanish keywords: 75
Percentage: 4.6%


In [40]:
# Show Spanish examples
spanish_responses = mode_other_df[mode_other_df['likely_spanish'] & ~mode_other_df['has_non_ascii']]
print(f"\nExamples of likely Spanish responses (ASCII only, {len(spanish_responses)} total):")
spanish_responses['mode_other_specify'].value_counts().head(20)


Examples of likely Spanish responses (ASCII only, 65 total):


mode_other_specify
metro                                                                                                                                                                                                                                                                                                                                                           16
tren bart                                                                                                                                                                                                                                                                                                                                                       14
transfer to another metro train                                                                                                                                                                                                                                                

In [41]:
# Summary of non-English responses
mode_other_df['non_english'] = mode_other_df['has_non_ascii'] | mode_other_df['likely_spanish']

print("\n=== Language Summary ===")
print(f"Total responses: {len(mode_other_df)}")
print(f"Non-ASCII (Chinese, etc.): {mode_other_df['has_non_ascii'].sum()} ({mode_other_df['has_non_ascii'].sum()/len(mode_other_df)*100:.1f}%)")
print(f"Likely Spanish (ASCII): {(mode_other_df['likely_spanish'] & ~mode_other_df['has_non_ascii']).sum()} ({(mode_other_df['likely_spanish'] & ~mode_other_df['has_non_ascii']).sum()/len(mode_other_df)*100:.1f}%)")
print(f"Total non-English: {mode_other_df['non_english'].sum()} ({mode_other_df['non_english'].sum()/len(mode_other_df)*100:.1f}%)")
print(f"English: {(~mode_other_df['non_english']).sum()} ({(~mode_other_df['non_english']).sum()/len(mode_other_df)*100:.1f}%)")


=== Language Summary ===
Total responses: 1632
Non-ASCII (Chinese, etc.): 34 (2.1%)
Likely Spanish (ASCII): 65 (4.0%)
Total non-English: 99 (6.1%)
English: 1533 (93.9%)


In [42]:
# Look at ALL non-ASCII responses to identify other languages
print("All non-ASCII responses:")
non_ascii_responses['mode_other_specify'].value_counts()

All non-ASCII responses:


mode_other_specify
拼車                                      6
autobús contra costa                    6
ihss护工的车                                4
到达目的地                                   3
friend‘s car                            2
轻轨                                      2
轻轨\n                                    2
我只是在户外歩行锻练                              2
autobús escolar                         1
autobús para uso traporte médico        1
autobús schoolar                        1
autobús schoolar yellow bus             1
xe nhà                                  1
問題有誤，我已經選擇乘自己的車，還要間出行方式，自相矛盾. 無法完成答題    1
ihss护工的车接送                              1
Name: count, dtype: int64

In [49]:
# Categorize non-ASCII responses by language
def detect_language_from_chars(text):
    """Detect language based on character ranges"""
    if pd.isna(text):
        return 'unknown'
    
    # Check for Chinese characters (CJK Unified Ideographs)
    if any('\u4e00' <= char <= '\u9fff' for char in text):
        return 'Chinese'
    
    # Check for Vietnamese characters (Latin with specific diacritics)
    vietnamese_chars = 'ăâđêôơưàằầèềìòồờùừỳáắấéếíóốớúứý'
    if any(char in vietnamese_chars for char in text.lower()):
        return 'Vietnamese'
    
    # Check for Spanish-specific accented characters
    spanish_chars = 'áéíóúñü'
    if any(char in spanish_chars for char in text.lower()):
        return 'Spanish'
    
    # Other non-ASCII
    return 'Other'

# Apply to non-ASCII responses
non_ascii_responses['detected_language'] = non_ascii_responses['mode_other_specify'].apply(detect_language_from_chars)

print("\n=== Language breakdown of non-ASCII responses ===")
print(non_ascii_responses['detected_language'].value_counts())
print(f"\nTotal non-ASCII responses: {len(non_ascii_responses)}")


=== Language breakdown of non-ASCII responses ===
detected_language
Chinese       21
Vietnamese    11
Other          2
Name: count, dtype: int64

Total non-ASCII responses: 34


In [50]:
# Show Vietnamese examples
vietnamese_responses = non_ascii_responses[non_ascii_responses['detected_language'] == 'Vietnamese']
print("\nVietnamese responses:")
vietnamese_responses['mode_other_specify'].value_counts()


Vietnamese responses:


mode_other_specify
autobús contra costa                 6
autobús escolar                      1
autobús para uso traporte médico     1
autobús schoolar                     1
autobús schoolar yellow bus          1
xe nhà                               1
Name: count, dtype: int64

In [51]:
# Show Chinese examples
chinese_responses = non_ascii_responses[non_ascii_responses['detected_language'] == 'Chinese']
print("\nChinese responses:")
chinese_responses['mode_other_specify'].value_counts()


Chinese responses:


mode_other_specify
拼車                                      6
ihss护工的车                                4
到达目的地                                   3
轻轨                                      2
轻轨\n                                    2
我只是在户外歩行锻练                              2
問題有誤，我已經選擇乘自己的車，還要間出行方式，自相矛盾. 無法完成答題    1
ihss护工的车接送                              1
Name: count, dtype: int64

In [52]:
# Final language summary (correcting for Spanish having non-ASCII too)
print("\n=== Corrected Language Summary ===")
print(f"Total responses: {len(mode_other_df)}")
print(f"\nNon-English responses:")
print(f"  Chinese: 21 ({21/len(mode_other_df)*100:.2f}%)")
print(f"  Spanish (with accents like autobús): 10 ({10/len(mode_other_df)*100:.2f}%)")
print(f"  Spanish (ASCII-only, like 'tren bart'): ~55 ({55/len(mode_other_df)*100:.2f}%)")
print(f"  Vietnamese (xe nhà): 1 ({1/len(mode_other_df)*100:.2f}%)")
print(f"  Other (friend's car, etc.): 2 ({2/len(mode_other_df)*100:.2f}%)")
print(f"\nTotal estimated non-English: ~89 (5.5%)")
print(f"English: ~1543 (94.5%)")


=== Corrected Language Summary ===
Total responses: 1632

Non-English responses:
  Chinese: 21 (1.29%)
  Spanish (with accents like autobús): 10 (0.61%)
  Spanish (ASCII-only, like 'tren bart'): ~55 (3.37%)
  Vietnamese (xe nhà): 1 (0.06%)
  Other (friend's car, etc.): 2 (0.12%)

Total estimated non-English: ~89 (5.5%)
English: ~1543 (94.5%)


## Check Valid Mode Categories

Let's see what mode variables exist in the data and what their valid values are.

In [44]:
# Check what mode columns exist
mode_cols = [col for col in trip_df.columns if 'mode' in col.lower()]
print("Mode-related columns:")
for col in mode_cols:
    print(f"  {col}")

Mode-related columns:
  mode_type
  mode_1
  mode_2
  mode_3
  mode_4
  mode_other_specify


## First Round Recoding to mode_1 Codes

Map free-text responses to actual mode_1 codes (when possible). Return None for ambiguous cases needing review.

In [45]:
def recode_mode_other_to_mode1(text_clean):
    """
    Recode free-text mode responses to mode_1 codes.
    Returns mode_1 code (int) or None for unclear cases.
    
    Based on mode_1 codes from dataset guide:
    1=Walk, 2=Bicycle, 23=Local bus, 24=School bus, 30=BART, 
    33=Car from work, 36=Taxi, 49=TNC (Uber/Lyft), 53=MUNI Metro,
    6-16=Household vehicles, 82=E-bike, 83=Scooter-share, etc.
    """
    if pd.isna(text_clean):
        return None
    
    text = str(text_clean).lower()
    
    # Non-informative - mark for removal
    if text in ['none', 'other', 'nothing', 'na', 'n/a', 'i', '']:
        return 'JUNK'
    
    # Didn't travel - mark for removal
    if 'didnt' in text or "didn't" in text or 'did not' in text:
        return 'JUNK'
    
    # 24: School bus
    if 'school bus' in text or 'schoolbus' in text:
        return 24
    
    # 30: BART
    if 'bart' in text or 'tren bart' in text:
        return 30
    
    # 53: MUNI Metro (for SF Muni variations)
    if any(phrase in text for phrase in ['muni', 'sf transit', 'san francisco transit']):
        return 53
    
    # 23: Local public bus (excluding tour/school/work buses)
    if 'bus' in text and not any(x in text for x in ['tour', 'school', 'work', 'shuttle', 'company', 'employer']):
        return 23
    
    # 49: Uber/Lyft/TNC
    if any(word in text for word in ['uber', 'lyft', 'rideshare', 'ride share', 'ride service']):
        return 49
    
    # 36: Regular taxi
    if 'taxi' in text:
        return 36
    
    # 1: Walk
    if any(word in text for word in ['walk', 'walked', 'walking']) and 'bike' not in text:
        return 1
    
    # 82: Electric bicycle (household)
    if any(phrase in text for phrase in ['ebike', 'e-bike', 'electric bike', 'electric bicycle']):
        return 82
    
    # 83: Scooter-share
    if any(phrase in text for phrase in ['bird', 'lime', 'scooter share', 'shared scooter']):
        return 83
    
    # 77: Personal scooter/moped (not shared)
    if 'scooter' in text or 'moped' in text:
        return 77
    
    # 2: Standard bicycle
    if any(word in text for word in ['bike', 'bicycle', 'cycling']) and 'e-bike' not in text:
        return 2
    
    # 47: Motorcycle (household)
    if 'motorcycle' in text or 'motorbike' in text:
        return 47
    
    # 33: Car from work / work vehicle
    if any(phrase in text for phrase in ['work truck', 'work van', 'work vehicle', 'work car', 'company truck', 'company van', 'company vehicle', 'company car']):
        return 33
    
    # 6: Household vehicle (for "my car", "own car", "car", etc.)
    if any(phrase in text for phrase in ['my car', 'own car', 'personal car', 'private car', 'household car']) or text in ['car', 'cars']:
        return 6
    
    # Ambiguous/needs review
    return None

In [46]:
# Apply the recoding function
mode_other_df['mode_1_recoded'] = mode_other_df['text_clean'].apply(recode_mode_other_to_mode1)

# Show summary of recoded categories
print("Recoded mode_1 distribution:")
mode_other_df['mode_1_recoded'].value_counts(dropna=False)

Recoded mode_1 distribution:


  mode_other_df['mode_1_recoded'] = mode_other_df['text_clean'].apply(recode_mode_other_to_mode1)


mode_1_recoded
None    622
6       267
23      196
30      113
53      112
33       90
JUNK     83
1        77
24       40
2        14
77        6
82        6
49        4
83        2
Name: count, dtype: int64

In [47]:
# Check what's still uncoded (None values)
uncoded = mode_other_df[mode_other_df['mode_1_recoded'].isna()]
print(f"Total uncoded: {len(uncoded)} out of {len(mode_other_df)} ({len(uncoded)/len(mode_other_df)*100:.1f}%)")
print("\nMost common uncoded responses:")
uncoded['text_clean'].value_counts().head(30)

Total uncoded: 622 out of 1632 (38.1%)

Most common uncoded responses:


text_clean
tour bus                           95
metro                              19
skateboard                         13
running                            12
b                                  12
train                              11
working van                        11
shuttle                             9
friends car                         7
a car                               7
ups truck                           7
friend drive                        7
tractor trailer                     7
idk                                 6
uhaul truck                         6
trail run by foot                   6
拼車                                  6
transfer to another metro train     6
autobús contra costa                6
gondola                             5
gig car share                       5
shuttle bus                         5
ski                                 5
kaiser van                          5
amtrak                              4
my feet                             4
c

In [48]:
# Sample of original text and recoded mode_1 for quality check
sample_recode = mode_other_df[['mode_other_specify', 'text_clean', 'mode_1_recoded']].sample(20, random_state=42)
sample_recode

Unnamed: 0,mode_other_specify,text_clean,mode_1_recoded
105816,bart and ferry,bart and ferry,30.0
371118,bus,bus,23.0
81777,no,no,
62360,bus,bus,23.0
150438,shuttle,shuttle,
197919,I was in Muni Merro,i was in muni merro,53.0
67577,gig car share,gig car share,
333883,subaru,subaru,
279697,bart,bart,30.0
37699,work van,work van,33.0
