In [None]:
!pip install fastparquet

In [None]:
import fastparquet as fp

In [None]:
import pandas as pd

# Read the raw trip export, then branch off an independent (deep) copy to work on,
# so changes made to the working frame do not affect the frame as loaded.
travel_trip_complete_df = pd.read_csv("travel_trip_complete.csv")
travel_trip_complete_safina_df = travel_trip_complete_df.copy(deep=True)

# Last expression of the cell: shown as the cell's output.
travel_trip_complete_safina_df

In [None]:
import pandas as pd

# Age segment boundaries. With right=False every bin is half-open: [lower, upper).
# The last edge is infinite so that the "61+" label is genuinely open-ended; a
# finite cap of 120 would leave any age of 120 or more unlabelled (NaN).
age_bins = [0, 26, 34, 41, 51, 61, float("inf")]
age_labels = [
    "18-25 Students-Early Professionals",   # [0, 26)  -> lower edge is 0, so ages below 18 land here too
    "26-33 Young Professionals",            # [26, 34)
    "34-40 Young Parents",                  # [34, 41)
    "41-50 Money & Energy Group",           # [41, 51)
    "51-60 Rich but Tired",                 # [51, 61)
    "61+ Retired-Elderly"                   # [61, inf)
]

# Attach the segment label to every traveler. Missing or negative ages fall
# outside all bins and therefore stay NaN.
travel_trip_complete_safina_df["traveler_age_segment"] = pd.cut(
    travel_trip_complete_safina_df["traveler_age"],
    bins=age_bins,
    labels=age_labels,
    right=False
)

# Number of travelers per segment, shown as the cell's output.
travel_trip_complete_safina_df["traveler_age_segment"].value_counts()

In [None]:
import pandas as pd
import numpy as np

# --- 0. Load the raw trip data ---
file_name = "travel_trip_complete.csv"

try:
    travel_trip_safina_df = pd.read_csv(file_name)
except FileNotFoundError:
    # Print a hint about the missing file, then let the original error propagate.
    print(f"Error: File '{file_name}' not found. Please ensure the file is in the current directory.")
    raise

# --- 1. Derive the cost metrics ---
# First coerce the three inputs to numbers (unparseable entries become NaN).
# Then build the total cost, blank out zero-day trips so the division below
# never divides by zero, and finally compute the cost per day (CPD).
numeric_columns = ['transportation_cost', 'accommodation_cost', 'duration_days']
travel_trip_safina_df = travel_trip_safina_df.assign(
    **{name: pd.to_numeric(travel_trip_safina_df[name], errors='coerce') for name in numeric_columns}
).assign(
    total_cost=lambda frame: frame['transportation_cost'] + frame['accommodation_cost'],
    duration_days=lambda frame: frame['duration_days'].replace(0, np.nan),
    cost_per_day=lambda frame: frame['total_cost'] / frame['duration_days'],
)

# --- 2. Rank destinations by mean CPD and keep the ten most expensive ---
# Rows lacking a CPD or a destination are excluded from the ranking only.
df_clean = travel_trip_safina_df.dropna(subset=['cost_per_day', 'destination'])
avg_cpd_by_destination = (
    df_clean
    .groupby('destination')['cost_per_day']
    .mean()
    .sort_values(ascending=False)
)
top_10_expensive_destinations = avg_cpd_by_destination.index[:10].tolist()

# Select every row of the full frame that belongs to one of those destinations.
is_top_destination = travel_trip_safina_df['destination'].isin(top_10_expensive_destinations)
plot_df_filtered = travel_trip_safina_df[is_top_destination]

# --- 3. Summary table: mean CPD and mean duration per destination ---
# Sorted from most to least expensive, then rounded for presentation.
plot_data = (
    plot_df_filtered
    .groupby('destination')
    .agg(avg_cpd=('cost_per_day', 'mean'), avg_duration=('duration_days', 'mean'))
    .reset_index()
    .sort_values(by='avg_cpd', ascending=False)
    .assign(
        avg_cpd=lambda table: table['avg_cpd'].astype(float).round(2),
        avg_duration=lambda table: table['avg_duration'].astype(float).round(1),
    )
)

# --- 4. Print the table as plain text, without the index column ---
print("\n--- Final Data Table (Top 10 Most Expensive Destinations) ---")
print(plot_data.to_string(index=False))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 0. Setup and Data Loading ---
file_name = "travel_trip_complete.csv"
try:
    # Load the data file
    travel_trip_safina_df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"Error: File '{file_name}' not found. Please ensure the file is in the current directory.")
    # Re-raise after printing the hint. Without this the cell would carry on and
    # either fail with a NameError on the next statement or, worse, silently
    # reuse a `travel_trip_safina_df` left in the kernel by an earlier cell.
    raise

# --- 1. Create Required Metrics (Cost Per Day) ---
# Coerce the cost/duration inputs to numbers; unparseable entries become NaN.
for col in ['transportation_cost', 'accommodation_cost', 'duration_days']:
    travel_trip_safina_df[col] = pd.to_numeric(travel_trip_safina_df[col], errors='coerce')

# Total cost = transportation + accommodation (NaN if either part is missing).
travel_trip_safina_df['total_cost'] = (
    travel_trip_safina_df['transportation_cost'] + travel_trip_safina_df['accommodation_cost']
)

# Cost Per Day (CPD). Zero-day trips are set to NaN first to prevent division by zero.
travel_trip_safina_df['duration_days'] = travel_trip_safina_df['duration_days'].replace(0, np.nan)
travel_trip_safina_df['cost_per_day'] = travel_trip_safina_df['total_cost'] / travel_trip_safina_df['duration_days']

# --- 2. Identify Top 10 Most Expensive Destinations by AVERAGE CPD ---
# Rows without a CPD or a destination are left out of the ranking.
df_clean = travel_trip_safina_df.dropna(subset=['cost_per_day', 'destination'])
avg_cpd_by_destination = (
    df_clean.groupby('destination')['cost_per_day'].mean().sort_values(ascending=False)
)
top_10_expensive_destinations = avg_cpd_by_destination.head(10).index.tolist()

# Filter the full DataFrame down to the top 10 destinations.
plot_df_filtered = travel_trip_safina_df[
    travel_trip_safina_df['destination'].isin(top_10_expensive_destinations)
]

# --- 3. Prepare Data for Table and Plot (Avg CPD & Avg Duration) ---
# One row per destination, ordered from most to least expensive; this order is
# also the left-to-right order of the bars below.
plot_data = plot_df_filtered.groupby('destination').agg(
    avg_cpd=('cost_per_day', 'mean'),
    avg_duration=('duration_days', 'mean')
).reset_index().sort_values(by='avg_cpd', ascending=False)

# Round values for clean presentation
plot_data['avg_cpd'] = plot_data['avg_cpd'].astype(float).round(2)
plot_data['avg_duration'] = plot_data['avg_duration'].astype(float).round(1)

# --- 4. Print Final Data Table ---
print("\n--- Final Data Table (Top 10 Most Expensive Destinations) ---")
print(plot_data.to_string(index=False))

# --- 5. Generate Graph (Bar Chart) ---
sns.set_style("whitegrid")

# Explicit figure/axes objects instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(12, 7))

# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version before changing this call.
sns.barplot(
    x='destination',
    y='avg_cpd',
    data=plot_data,
    palette='viridis',
    ax=ax
)

# Titles and axis labels
ax.set_title('Top 10 Most Expensive Destinations by Average Cost Per Day (CPD)', fontsize=16)
ax.set_xlabel('Destination', fontsize=12)
ax.set_ylabel('Average Cost Per Day (CPD) ($)', fontsize=12)

# Rotate x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Write the average CPD just above each bar (9 points above the bar top).
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'${height:,.0f}',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center',
                xytext=(0, 9),
                textcoords='offset points',
                fontsize=10)

# Clean up layout
fig.tight_layout()

# Save the plot next to the notebook
fig.savefig('top_10_expensive_destinations_cpd_bar_chart.png')
print("\nGraph saved as 'top_10_expensive_destinations_cpd_bar_chart.png'")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 0. Load the raw trip data ---
file_name = "travel_trip_complete.csv"
try:
    travel_trip_safina_df = pd.read_csv(file_name)
except FileNotFoundError:
    # Report the missing file, then let the original error propagate.
    print(f"Error: File '{file_name}' not found.")
    raise

# --- 1. Derive the cost metrics ---
# Coerce the inputs to numbers (bad entries become NaN), add the total cost,
# blank out zero-day trips to avoid dividing by zero, then compute cost per day.
numeric_columns = ['transportation_cost', 'accommodation_cost', 'duration_days']
travel_trip_safina_df = travel_trip_safina_df.assign(
    **{name: pd.to_numeric(travel_trip_safina_df[name], errors='coerce') for name in numeric_columns}
).assign(
    total_cost=lambda frame: frame['transportation_cost'] + frame['accommodation_cost'],
    duration_days=lambda frame: frame['duration_days'].replace(0, np.nan),
    cost_per_day=lambda frame: frame['total_cost'] / frame['duration_days'],
)

# --- 2. Ten destinations with the highest mean cost per day ---
df_clean = travel_trip_safina_df.dropna(subset=['cost_per_day', 'destination'])
avg_cpd_by_destination = (
    df_clean
    .groupby('destination')['cost_per_day']
    .mean()
    .sort_values(ascending=False)
)
top_10_expensive_destinations = avg_cpd_by_destination.index[:10].tolist()

is_top_destination = travel_trip_safina_df['destination'].isin(top_10_expensive_destinations)
plot_df_filtered = travel_trip_safina_df[is_top_destination]

# --- 3. One point per destination: mean CPD and mean duration, rounded ---
plot_data = (
    plot_df_filtered
    .groupby('destination')
    .agg(avg_cpd=('cost_per_day', 'mean'), avg_duration=('duration_days', 'mean'))
    .reset_index()
    .sort_values(by='avg_cpd', ascending=False)
    .assign(
        avg_cpd=lambda table: table['avg_cpd'].astype(float).round(2),
        avg_duration=lambda table: table['avg_duration'].astype(float).round(1),
    )
)

# --- 4. Scatter plot of mean CPD against mean duration ---
durations = plot_data['avg_duration']
daily_costs = plot_data['avg_cpd']

sns.set_style("whitegrid")
plt.figure(figsize=(12, 8))

sns.scatterplot(
    x='avg_duration', y='avg_cpd', data=plot_data,
    s=200, color='darkorange', edgecolor='k', alpha=0.7
)

plt.title('Average Cost Per Day vs. Average Trip Duration for Top 10 Most Expensive Destinations', fontsize=16, pad=20)
plt.xlabel('Average Duration (Days)', fontsize=12)
plt.ylabel('Average Cost Per Day (CPD) ($)', fontsize=12)

# Label each point with its destination, nudged 0.1 days to the right of the marker.
for place, mean_days, mean_cpd in zip(plot_data['destination'], durations, daily_costs):
    plt.annotate(place, (mean_days + 0.1, mean_cpd), fontsize=9, alpha=0.8)

# Least-squares straight line through the points; its slope goes into the legend.
z = np.polyfit(durations, daily_costs, 1)
p = np.poly1d(z)
slope = z[0]
plt.plot(durations, p(durations), "r--", label=f"Trend Line (Slope: {slope:.2f})")
plt.legend()

# Pad both axes to 90% of the minimum and 110% of the maximum plotted value.
plt.xlim(durations.min() * 0.9, durations.max() * 1.1)
plt.ylim(daily_costs.min() * 0.9, daily_costs.max() * 1.1)

plt.tight_layout()
plt.savefig('cpd_vs_duration_scatter_plot.png')

In [None]:
import pandas as pd
import numpy as np

# --- 0. Data Loading ---
file_name = "travel_trip_complete.csv"
try:
    travel_trip_safina_df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"Error: File '{file_name}' not found.")
    # Re-raise after printing the hint. Without this the cell would carry on and
    # either fail with a NameError on the next statement or silently reuse a
    # `travel_trip_safina_df` left in the kernel by an earlier cell.
    raise

# --- 1. Create Required Metrics (Cost Per Day) ---
# Coerce the cost/duration inputs to numbers; unparseable entries become NaN.
for col in ['transportation_cost', 'accommodation_cost', 'duration_days']:
    travel_trip_safina_df[col] = pd.to_numeric(travel_trip_safina_df[col], errors='coerce')

# Total cost = transportation + accommodation (NaN if either part is missing).
travel_trip_safina_df['total_cost'] = (
    travel_trip_safina_df['transportation_cost'] + travel_trip_safina_df['accommodation_cost']
)

# Cost Per Day (CPD). Zero-day trips are set to NaN first to prevent division by zero.
travel_trip_safina_df['duration_days'] = travel_trip_safina_df['duration_days'].replace(0, np.nan)
travel_trip_safina_df['cost_per_day'] = travel_trip_safina_df['total_cost'] / travel_trip_safina_df['duration_days']

# --- 2. Identify Top 10 Most Expensive Destinations by AVERAGE CPD ---
# Rows without a CPD or a destination are left out of the ranking.
df_clean = travel_trip_safina_df.dropna(subset=['cost_per_day', 'destination'])
avg_cpd_by_destination = (
    df_clean.groupby('destination')['cost_per_day'].mean().sort_values(ascending=False)
)
top_10_expensive_destinations = avg_cpd_by_destination.head(10).index.tolist()

# Filter the full DataFrame down to the top 10 destinations.
plot_df_filtered = travel_trip_safina_df[
    travel_trip_safina_df['destination'].isin(top_10_expensive_destinations)
]

# --- 3. Prepare Data for Table (Avg CPD & Avg Duration) ---
# One row per destination, ordered from most to least expensive.
plot_data = plot_df_filtered.groupby('destination').agg(
    avg_cpd=('cost_per_day', 'mean'),
    avg_duration=('duration_days', 'mean')
).reset_index().sort_values(by='avg_cpd', ascending=False)

# Round values for clean presentation
plot_data['avg_cpd'] = plot_data['avg_cpd'].astype(float).round(2)
plot_data['avg_duration'] = plot_data['avg_duration'].astype(float).round(1)

# --- 4. Print Final Data Table ---
print("\n--- Final Data Table (Top 10 Most Expensive Destinations) ---")
# Plain-text rendering of the table without the index column.
print(plot_data.to_string(index=False))