In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker


In [2]:
file_path = './Travel details dataset.csv'
df = pd.read_csv(file_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Trip ID               139 non-null    int64  
 1   Destination           137 non-null    object 
 2   Start date            137 non-null    object 
 3   End date              137 non-null    object 
 4   Duration (days)       137 non-null    float64
 5   Traveler name         137 non-null    object 
 6   Traveler age          137 non-null    float64
 7   Traveler gender       137 non-null    object 
 8   Traveler nationality  137 non-null    object 
 9   Accommodation type    137 non-null    object 
 10  Accommodation cost    137 non-null    object 
 11  Transportation type   136 non-null    object 
 12  Transportation cost   136 non-null    object 
dtypes: float64(2), int64(1), object(10)
memory usage: 14.2+ KB


In [4]:
df.isnull().sum()

Trip ID                 0
Destination             2
Start date              2
End date                2
Duration (days)         2
Traveler name           2
Traveler age            2
Traveler gender         2
Traveler nationality    2
Accommodation type      2
Accommodation cost      2
Transportation type     3
Transportation cost     3
dtype: int64

### Data Cleaning

In [5]:
# Standardize Column Names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns


Index(['trip_id', 'destination', 'start_date', 'end_date', 'duration_(days)',
       'traveler_name', 'traveler_age', 'traveler_gender',
       'traveler_nationality', 'accommodation_type', 'accommodation_cost',
       'transportation_type', 'transportation_cost'],
      dtype='object')

In [6]:
# Convert Cost Columns to Numeric
df['accommodation_cost'] = df['accommodation_cost'].replace(r'[^\d.]', '', regex=True).astype(float)
df['transportation_cost'] = df['transportation_cost'].replace(r'[^\d.]', '', regex=True).astype(float)

In [7]:
# Chack missing values
missing_summary = df.isnull().sum()
print(missing_summary)

trip_id                 0
destination             2
start_date              2
end_date                2
duration_(days)         2
traveler_name           2
traveler_age            2
traveler_gender         2
traveler_nationality    2
accommodation_type      2
accommodation_cost      2
transportation_type     3
transportation_cost     3
dtype: int64


In [8]:
# Reviwe rows with missing data
missing_rows = df[df.isnull().any(axis=1)]
print(missing_rows.head())

     trip_id  destination start_date   end_date  duration_(days)  \
71        72          NaN        NaN        NaN              NaN   
82        83  Rome, Italy  4/15/2025  4/22/2025              7.0   
127      128          NaN        NaN        NaN              NaN   

    traveler_name  traveler_age traveler_gender traveler_nationality  \
71            NaN           NaN             NaN                  NaN   
82      James Kim          41.0            Male             American   
127           NaN           NaN             NaN                  NaN   

    accommodation_type  accommodation_cost transportation_type  \
71                 NaN                 NaN                 NaN   
82               Hotel               100.0                 NaN   
127                NaN                 NaN                 NaN   

     transportation_cost  
71                   NaN  
82                   NaN  
127                  NaN  


In [9]:
# Handle missing values
df = df.dropna(subset= [
               'destination', 'start_date', 'end_date', 'duration_(days)',
    'traveler_name', 'traveler_age', 'traveler_gender',
    'traveler_nationality', 'accommodation_type', 'accommodation_cost',
    'transportation_type', 'transportation_cost'
])

### Data Enrichment

In [10]:
# Impute Missing Costs with Average

## Impute accommodation cost
df['accommodation_cost'] = df.groupby(['destination', 'accommodation_type'])['accommodation_cost'] \
                            .transform( lambda x: x.fillna(x.mean))

df['transportation_cost'] = df.groupby(['destination', 'transportation_type'])['transportation_cost'] \
                            .transform(lambda x: x.fillna(x.mean()))

In [11]:
# Add seasonality tags based on travel month

# convert start_date to datetime
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')

# extract travel month
df['travel_month'] = df['start_date'].dt.month

In [12]:
# Define a function to map months to season

def get_season(month):
    if month in [12,1,2]:
        return 'Winter'
    elif month in [3,4,5]:
        return 'Spring'
    elif month in [6,7,8]:
        return 'Summer'
    elif month in [9,10,11]:
        return 'Autumn'
    return 'Unknown'

df['season'] = df['travel_month'].apply(get_season)

In [13]:
# Add interest categories based on destinaton

# Simple map for destination interests
interest_map = {
    'London': 'History',
    'Phuket': 'Beach',
    'Bali': 'Beach',
    'New York': 'Shopping',
    'Tokyo': 'Culture',
    'Paris': 'Romance',
    'Sydney': 'Nature',
    'Rio de Janeiro': 'Beach',
    'Amsterdam': 'Art',
    'Dubai': 'Luxury',
    'Cancun': 'Beach',
    'Barcelona': 'Art',
    'Honolulu': 'Beach',
    'Berlin': 'History',
    'Marrakech': 'Culture',
    'Edinburgh': 'History',
    'Rome': 'Culture',
    'Bangkok': 'Street Life',
    'Hawaii': 'Beach',
    'Japan': 'Culture',
    'Thailand': 'Beach',
    'France': 'Romance',
    'Australia': 'Nature',
    'Brazil': 'Carnival',
    'Greece': 'History',
    'Egypt': 'History',
    'Mexico': 'Culture',
    'Italy': 'Art',
    'Spain': 'Culture',
    'Canada': 'Nature',
    'New York City': 'Shopping',
    'Vancouver': 'Nature',
    'Seoul': 'Tech & Culture',
    'Los Angeles': 'Entertainment',
    'Cape Town': 'Adventure',
    'Santorini': 'Romance',
    'Phnom Penh': 'History',
    'Athens': 'Ancient History',
    'Auckland': 'Nature',
    'Cairo': 'History',
    'Reykjavik': 'Nature'
}


# Map to a new column
df['interest_category'] = df['destination'].map(lambda x: interest_map.get(str(x).split(',')[0], 'General'))

In [14]:
df['traveler_age'].unique()

array([35., 28., 45., 29., 26., 42., 33., 25., 31., 39., 27., 36., 48.,
       32., 30., 40., 24., 34., 20., 37., 38., 55., 41., 23., 43., 46.,
       60., 50., 47.])

In [15]:
# Add synthetic travelers rows

fake = Faker()
new_rows = []

accommodation_cost_map = {
    'Hostel': (100,300),
    'Airbnb': (200,1000),
    'Hotel': (500,1200),
    'Resort': (800, 1500)
}

transportation_cost_map = {
    'Bus': (50,200),
    'Train': (100, 400),
    'Flight': (300, 1000)
}


for _ in range(100):
    destination = random.choice(list(interest_map.keys()))

    acc_type = random.choice(list(accommodation_cost_map.keys()))
    trans_type = random.choice(list(transportation_cost_map.keys()))

    acc_min, acc_max = accommodation_cost_map[acc_type]
    acc_cost = random.randint(acc_min, acc_max)

    trans_min, trans_max = transportation_cost_map[trans_type]
    trans_cost = random.randint(trans_min, trans_max)

    duration = random.choice([int(x) for x in list(df['duration_(days)'].unique()) if not np.isnan(x)])
    start_date = fake.date_between(start_date='-1y', end_date='today')
    end_date = start_date + pd.to_timedelta(duration, unit='D')

    new_rows.append({
        'trip_id': df['trip_id'].max() + len(new_rows) + 1,
        'destination' : destination,
        'start_date': start_date,
        'end_date': end_date,
        'duration_(days)': duration,
        'traveler_name': fake.name(),
        'traveler_age': random.randint(18,65),
        'traveler_gender': random.choice(['Male', 'Female']),
        'traveler_nationality': fake.country(),
        'accommodation_type': acc_type,
        'accommodation_cost': acc_cost,
        'transportation_type': trans_type,
        'transportation_cost': trans_cost,
        'travel_month': random.randint(1,12),
    })


In [16]:
# Append the new synthetic records
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   trip_id               236 non-null    int64  
 1   destination           236 non-null    object 
 2   start_date            236 non-null    object 
 3   end_date              236 non-null    object 
 4   duration_(days)       236 non-null    float64
 5   traveler_name         236 non-null    object 
 6   traveler_age          236 non-null    float64
 7   traveler_gender       236 non-null    object 
 8   traveler_nationality  236 non-null    object 
 9   accommodation_type    236 non-null    object 
 10  accommodation_cost    236 non-null    float64
 11  transportation_type   236 non-null    object 
 12  transportation_cost   236 non-null    float64
 13  travel_month          236 non-null    int64  
 14  season                136 non-null    object 
 15  interest_category     1

In [18]:
df.destination.unique()

array(['London, UK', 'Phuket, Thailand', 'Bali, Indonesia',
       'New York, USA', 'Tokyo, Japan', 'Paris, France',
       'Sydney, Australia', 'Rio de Janeiro, Brazil',
       'Amsterdam, Netherlands', 'Dubai, United Arab Emirates',
       'Cancun, Mexico', 'Barcelona, Spain', 'Honolulu, Hawaii',
       'Berlin, Germany', 'Marrakech, Morocco', 'Edinburgh, Scotland',
       'Paris', 'Bali', 'London', 'Tokyo', 'New York', 'Sydney', 'Rome',
       'Bangkok', 'Hawaii', 'Barcelona', 'Japan', 'Thailand', 'France',
       'Australia', 'Brazil', 'Greece', 'Egypt', 'Mexico', 'Italy',
       'Spain', 'Canada', 'New York City, USA', 'Bangkok, Thailand',
       'Vancouver, Canada', 'Sydney, AUS', 'Seoul, South Korea',
       'Los Angeles, USA', 'Rome, Italy', 'Cape Town', 'Cape Town, SA',
       'Sydney, Aus', 'Bangkok, Thai', 'Phuket, Thai', 'Dubai', 'Seoul',
       'Rio de Janeiro', 'Amsterdam', 'Phuket', 'Santorini', 'Phnom Penh',
       'Athens, Greece', 'Cape Town, South Africa',
       'Au

In [19]:
# Handle remaining missing values
df.season = df.travel_month.apply(get_season)
df['interest_category'] = df['destination'].map(lambda x: interest_map.get(str(x).split(',')[0], 'General'))

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   trip_id               236 non-null    int64  
 1   destination           236 non-null    object 
 2   start_date            236 non-null    object 
 3   end_date              236 non-null    object 
 4   duration_(days)       236 non-null    float64
 5   traveler_name         236 non-null    object 
 6   traveler_age          236 non-null    float64
 7   traveler_gender       236 non-null    object 
 8   traveler_nationality  236 non-null    object 
 9   accommodation_type    236 non-null    object 
 10  accommodation_cost    236 non-null    float64
 11  transportation_type   236 non-null    object 
 12  transportation_cost   236 non-null    float64
 13  travel_month          236 non-null    int64  
 14  season                236 non-null    object 
 15  interest_category     2

In [22]:
# Add a column for total cost of a trip
df['total_cost'] = df['accommodation_cost'] + df['transportation_cost']

In [24]:
df.to_csv('./cleaned_travel_dataset.csv')