In [1]:
import pandas as pd

df = pd.read_csv("travel_recommendations_10k.csv")

print(df.columns)
df

Index(['user_id', 'age', 'gender', 'hobby', 'budget', 'travel_style',
       'destination_city', 'destination_country', 'activities', 'climate',
       'travel_rating'],
      dtype='object')


Unnamed: 0,user_id,age,gender,hobby,budget,travel_style,destination_city,destination_country,activities,climate,travel_rating
0,1,23,Male,History,Low,Group,Rome,Italy,"Ancient Ruins, Museums",Mediterranean,4.2
1,2,29,Male,Foodie,Medium,Couple,Bangkok,Thailand,"Street Food, Markets",Tropical,3.7
2,3,43,Male,History,Low,Couple,Cairo,Egypt,"Pyramids, Museums",Desert,3.2
3,4,28,Female,Beaches,Medium,Solo,Phuket,Thailand,"Island Hopping, Beaches",Tropical,4.3
4,5,28,Male,Foodie,Medium,Couple,Lyon,France,"Gourmet Dining, Cafes",Mild,4.5
...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,21,Male,Wildlife,Medium,Family,Borneo,Malaysia,"Rainforest Tours, Wildlife Watching",Rainforest,4.4
9996,9997,21,Male,Hiking,High,Group,Banff,Canada,"Lake Trails, Snow Hiking",Cold,3.8
9997,9998,41,Male,Foodie,Medium,Solo,Bangkok,Thailand,"Street Food, Markets",Tropical,4.0
9998,9999,40,Male,History,High,Family,Athens,Greece,"Historical Sites, Cultural Tours",Warm,3.9


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Step 1: Load the Dataset
# Replace 'travel_data.csv' with the name of your dataset file
df = pd.read_csv('travel_recommendations_10k.csv')

# Step 2: Handle Missing Values
# Define numerical and categorical columns
numerical_columns = ['age', 'travel_rating']  # Add other numerical columns as needed
categorical_columns = ['gender', 'hobby', 'budget', 'travel_style', 'destination_city', 'destination_country', 'activities', 'climate']

# Create imputers for missing values
imputer_num = SimpleImputer(strategy='mean')  # Fill missing numerical values with mean
imputer_cat = SimpleImputer(strategy='most_frequent')  # Fill missing categorical values with mode

# Apply imputers
df[numerical_columns] = imputer_num.fit_transform(df[numerical_columns])
df[categorical_columns] = imputer_cat.fit_transform(df[categorical_columns])

# Step 3: Categorical Encoding
# Use OneHotEncoder to encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df[categorical_columns])

# Convert encoded data back to a DataFrame
encoded_columns = encoder.get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns)

# Drop original categorical columns and append encoded columns
df = df.drop(columns=categorical_columns).reset_index(drop=True)
df = pd.concat([df, df_encoded], axis=1)

# Step 4: Feature Scaling
# Scale numerical columns using StandardScaler
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Step 5: Save and Display the Output Dataset
# Save the cleaned dataset to a CSV file for further processing
df.to_csv('cleaned_travel_data.csv', index=False)

# Display a sample of the preprocessed dataset
print("Preprocessed dataset sample:")
print(df.head())


df

Preprocessed dataset sample:
   user_id       age  travel_rating  gender_Female  gender_Male  \
0        1 -1.226765       0.419392            0.0          1.0   
1        2 -0.602372      -0.593339            0.0          1.0   
2        3  0.854544      -1.606069            0.0          1.0   
3        4 -0.706438       0.621938            1.0          0.0   
4        5 -0.706438       1.027030            0.0          1.0   

   gender_Non-binary  hobby_Art  hobby_Beaches  hobby_Foodie  hobby_Hiking  \
0                0.0        0.0            0.0           0.0           0.0   
1                0.0        0.0            0.0           1.0           0.0   
2                0.0        0.0            0.0           0.0           0.0   
3                0.0        0.0            1.0           0.0           0.0   
4                0.0        0.0            0.0           1.0           0.0   

   ...  climate_Desert  climate_Mediterranean  climate_Mild  \
0  ...             0.0              

Unnamed: 0,user_id,age,travel_rating,gender_Female,gender_Male,gender_Non-binary,hobby_Art,hobby_Beaches,hobby_Foodie,hobby_Hiking,...,climate_Desert,climate_Mediterranean,climate_Mild,climate_Mountainous,climate_Rainforest,climate_Savannah,climate_Temperate,climate_Tropical,climate_Varied,climate_Warm
0,1,-1.226765,0.419392,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,-0.602372,-0.593339,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,0.854544,-1.606069,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,-0.706438,0.621938,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,-0.706438,1.027030,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,-1.434896,0.824484,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9996,9997,-1.434896,-0.390793,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,9998,0.646413,0.014300,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9998,9999,0.542347,-0.188246,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('travel_recommendations_10k.csv')

# Step 1: Handle Missing Values
numerical_columns = ['age', 'travel_rating']  # Adjust based on dataset
categorical_columns = ['gender', 'hobby', 'budget', 'travel_style', 'activities', 'climate']

# Retain 'destination_city' and 'destination_country' separately for recommendations
metadata_columns = ['destination_city', 'destination_country']

imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Apply imputers
df[numerical_columns] = imputer_num.fit_transform(df[numerical_columns])
df[categorical_columns] = imputer_cat.fit_transform(df[categorical_columns])

# Step 2: Categorical Encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(df[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns)

# Append encoded features back, but retain metadata columns
df_encoded = pd.concat([df[metadata_columns].reset_index(drop=True), df_encoded], axis=1)

# Step 3: Feature Scaling
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Step 4: Dimensionality Reduction (PCA)
pca = PCA(n_components=50)
pca_features = pca.fit_transform(df_encoded.drop(columns=metadata_columns))  # Exclude metadata from PCA
pca_columns = [f"PCA_{i}" for i in range(pca_features.shape[1])]
df_pca = pd.DataFrame(pca_features, columns=pca_columns)

# Combine PCA features, numerical columns, and metadata
df_final = pd.concat([df[metadata_columns].reset_index(drop=True), df[numerical_columns].reset_index(drop=True), df_pca], axis=1)

# Save preprocessed dataset
df_final.to_csv('final_travel_data.csv', index=False)
print("Final preprocessed dataset sample:")
print(df_final.head())

# Step 5: User Input and Recommendations
def recommend_destinations(user_input, top_n=5):
    """
    Recommends travel destinations based on user input.

    Args:
        user_input (dict): A dictionary containing user attributes.
        top_n (int): Number of top recommendations to return.

    Returns:
        pd.DataFrame: Top recommended destinations.
    """
    # Create a DataFrame for user input
    user_df = pd.DataFrame([user_input])

    # Preprocess user input (match preprocessing steps)
    user_df[numerical_columns] = scaler.transform(user_df[numerical_columns])
    user_encoded = encoder.transform(user_df[categorical_columns])
    user_encoded = pd.DataFrame(user_encoded, columns=encoded_columns)
    user_pca = pca.transform(user_encoded)

    # Combine user PCA features with numerical data
    user_features = pd.DataFrame(user_pca, columns=pca_columns)
    user_full = pd.concat([user_features, user_df[numerical_columns].reset_index(drop=True)], axis=1)

    # Compute cosine similarity
    similarity_scores = cosine_similarity(user_full, df_final[pca_columns + numerical_columns])
    df_final['similarity_score'] = similarity_scores[0]

    # Get top recommendations
    recommendations = df_final.sort_values(by='similarity_score', ascending=False).head(top_n)
    return recommendations[['destination_city', 'destination_country', 'similarity_score']]

# Example user input
user_input = {
    'age': 25, 
    'travel_rating': 4.5,
    'gender': 'Female',
    'hobby': 'Adventure',
    'budget': 'Mid-Range',
    'travel_style': 'Adventurous',
    'activities': 'Hiking',
    'climate': 'Cold'
}

# Get recommendations
top_recommendations = recommend_destinations(user_input)
print("Top Travel Recommendations:")
print(top_recommendations)


Final preprocessed dataset sample:
  destination_city destination_country       age  travel_rating     PCA_0  \
0             Rome               Italy -1.226765       0.419392  0.662318   
1          Bangkok            Thailand -0.602372      -0.593339  0.744092   
2            Cairo               Egypt  0.854544      -1.606069  0.678791   
3           Phuket            Thailand -0.706438       0.621938 -0.719790   
4             Lyon              France -0.706438       1.027030  0.732694   

      PCA_1     PCA_2     PCA_3     PCA_4     PCA_5  ...        PCA_40  \
0  0.775436 -0.114792  0.280262 -0.173265  0.038328  ...  1.967541e-15   
1 -0.506964  0.721521 -0.098211  0.735560 -0.130278  ...  7.272866e-16   
2  0.831508 -0.069022  0.218154  0.713622 -0.123107  ... -8.477061e-16   
3 -0.546588  1.200946 -0.582319 -0.329891  0.700320  ...  2.403817e-16   
4 -0.557609  0.083905  0.402533  0.749739 -0.097830  ...  1.236625e-15   

         PCA_41        PCA_42        PCA_43        PCA_44

In [4]:
df_final.shape

(10000, 55)