# **Feature Engineering**

In [20]:
import pandas as pd
import numpy as np

# Lift the column cap so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

# Read the cleaned master table (presumably written by an earlier cleaning step).
master_csv_path = "../data/processed/master_cleaned.csv"
master_df = pd.read_csv(master_csv_path)

# Report the dimensions, then preview the first rows as the cell output.
print(master_df.shape)
master_df.head()

(52890, 19)


Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,AttractionId,Rating,VisitMode,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent,AttractionCityId,AttractionType,Attraction,AttractionAddress
0,3,70456,2022,10,640,5,Couples,5.0,21.0,163.0,4341.0,Guildford,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
1,8,7567,2022,10,640,5,Friends,2.0,8.0,48.0,464.0,Ontario,Canada,Northern America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
2,9,79069,2022,10,640,5,Family,2.0,9.0,54.0,774.0,Brazil,Brazil,South America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
3,10,31019,2022,10,640,3,Family,5.0,17.0,135.0,583.0,Zurich,Switzerland,Central Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
4,15,43611,2022,10,640,3,Couples,5.0,21.0,163.0,1396.0,Manchester,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"


Time-Based Features

In [21]:
# VisitDate
# Only year and month are available, so every visit is pinned to the first
# day of its month.
visit_year_text = master_df['VisitYear'].astype(str)
visit_month_text = master_df['VisitMonth'].astype(str)
master_df['VisitDate'] = pd.to_datetime(visit_year_text + '-' + visit_month_text + '-01')

In [22]:
# Quarter
# Calendar quarter (1-4) read off the synthetic first-of-month VisitDate.
visit_dates = master_df['VisitDate']
master_df['Quarter'] = visit_dates.dt.quarter

In [23]:
# Season
# Explicit month -> season table. Every valid month is listed so that an
# unexpected value cannot silently fall through to a default season.
_SEASON_BY_MONTH = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
}


def get_season(month):
    """Return the season label for a calendar month number.

    The mapping is Dec-Feb -> 'Winter', Mar-May -> 'Spring',
    Jun-Aug -> 'Summer', Sep-Nov -> 'Autumn'.

    NOTE(review): this is the Northern-Hemisphere convention; the sample rows
    in this notebook show an attraction in Indonesia, so confirm the labels
    mean what the model is expected to learn from them.

    Parameters
    ----------
    month : int
        Month number in the range 1-12 (integer-valued floats also match).

    Returns
    -------
    str
        One of 'Winter', 'Spring', 'Summer', 'Autumn'.

    Raises
    ------
    ValueError
        If ``month`` is not a month number 1-12 (e.g. 0, 13, NaN, or a
        string). Previously such values were silently labelled 'Autumn'.
    """
    try:
        return _SEASON_BY_MONTH[month]
    except (KeyError, TypeError):
        # KeyError: out-of-range / NaN / wrong type; TypeError: unhashable input.
        raise ValueError(f"month must be an integer in 1..12, got {month!r}") from None

# Label each visit with the season of its VisitMonth (element-wise lookup).
master_df['Season'] = master_df['VisitMonth'].map(get_season)

User-Level Features

In [24]:
# Total Visits
# Number of transactions recorded per user (one row per UserId).
user_visits = (
    master_df.groupby('UserId')['TransactionId']
    .count()
    .reset_index(name='User_Total_Visits')
)

# Drop any copy of the feature left by a previous run of this cell before
# merging; otherwise a re-run would produce User_Total_Visits_x / _y columns.
master_df = (
    master_df.drop(columns='User_Total_Visits', errors='ignore')
    .merge(user_visits, on='UserId', how='left')
)

In [25]:
# Average Rating
# Mean of all ratings given by each user (one row per UserId).
# NOTE(review): the mean includes the current row's own Rating. If Rating is
# later used as the prediction target this leaks it (for a single-visit user
# the average equals the rating) - confirm against the modelling step.
user_avg_rating = (
    master_df.groupby('UserId')['Rating']
    .mean()
    .reset_index(name='User_Avg_Rating')
)

# Drop any copy of the feature left by a previous run of this cell before
# merging; otherwise a re-run would produce User_Avg_Rating_x / _y columns.
master_df = (
    master_df.drop(columns='User_Avg_Rating', errors='ignore')
    .merge(user_avg_rating, on='UserId', how='left')
)

Attraction-Level Features


In [26]:
# Attraction Popularity
# Number of transactions recorded per attraction (one row per AttractionId).
attraction_popularity = (
    master_df.groupby('AttractionId')['TransactionId']
    .count()
    .reset_index(name='Attraction_Popularity')
)

# Drop any copy of the feature left by a previous run of this cell before
# merging; otherwise a re-run would produce Attraction_Popularity_x / _y columns.
master_df = (
    master_df.drop(columns='Attraction_Popularity', errors='ignore')
    .merge(attraction_popularity, on='AttractionId', how='left')
)

In [27]:
# Attraction Average Rating
# Mean of all ratings received by each attraction (one row per AttractionId).
# NOTE(review): the mean includes the current row's own Rating; if Rating is
# the prediction target, confirm this is acceptable or compute it on the
# training rows only.
attraction_avg_rating = (
    master_df.groupby('AttractionId')['Rating']
    .mean()
    .reset_index(name='Attraction_Avg_Rating')
)

# Drop any copy of the feature left by a previous run of this cell before
# merging; otherwise a re-run would produce Attraction_Avg_Rating_x / _y columns.
master_df = (
    master_df.drop(columns='Attraction_Avg_Rating', errors='ignore')
    .merge(attraction_avg_rating, on='AttractionId', how='left')
)

Behavioral Feature

In [28]:
# User Preferred Attraction Type
# Step 1: count each user's transactions per attraction type.
type_visit_counts = (
    master_df.groupby(['UserId', 'AttractionType'])['TransactionId']
    .count()
    .reset_index()
)

# Step 2: for every user, locate the row holding their highest count.
# idxmax returns the first maximum, so ties resolve to whichever type comes
# first in the grouped (sorted) order - effectively alphabetical for string
# labels, not a deliberate preference rule.
top_row_labels = type_visit_counts.groupby('UserId')['TransactionId'].idxmax()

# Step 3: keep just the user and their winning type, under the feature name.
user_fav_type = type_visit_counts.loc[top_row_labels, ['UserId', 'AttractionType']]
user_fav_type.columns = ['UserId', 'User_Favorite_Type']

# Drop any copy of the feature left by a previous run of this cell before
# merging; otherwise a re-run would produce User_Favorite_Type_x / _y columns.
master_df = (
    master_df.drop(columns='User_Favorite_Type', errors='ignore')
    .merge(user_fav_type, on='UserId', how='left')
)

In [29]:
# Preview the first rows to confirm the engineered columns were appended.
master_df.head(n=5)

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,AttractionId,Rating,VisitMode,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent,AttractionCityId,AttractionType,Attraction,AttractionAddress,VisitDate,Quarter,Season,User_Total_Visits,User_Avg_Rating,Attraction_Popularity,Attraction_Avg_Rating,User_Favorite_Type
0,3,70456,2022,10,640,5,Couples,5.0,21.0,163.0,4341.0,Guildford,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",2022-10-01,4,Autumn,1,5.0,13192,4.267207,Nature & Wildlife Areas
1,8,7567,2022,10,640,5,Friends,2.0,8.0,48.0,464.0,Ontario,Canada,Northern America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",2022-10-01,4,Autumn,1,5.0,13192,4.267207,Nature & Wildlife Areas
2,9,79069,2022,10,640,5,Family,2.0,9.0,54.0,774.0,Brazil,Brazil,South America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",2022-10-01,4,Autumn,1,5.0,13192,4.267207,Nature & Wildlife Areas
3,10,31019,2022,10,640,3,Family,5.0,17.0,135.0,583.0,Zurich,Switzerland,Central Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",2022-10-01,4,Autumn,2,3.0,13192,4.267207,Nature & Wildlife Areas
4,15,43611,2022,10,640,3,Couples,5.0,21.0,163.0,1396.0,Manchester,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia",2022-10-01,4,Autumn,3,3.0,13192,4.267207,Nature & Wildlife Areas


Encoding Categorical Variables

In [30]:
# Work on an independent copy so encoding and scaling leave master_df untouched.
model_df = master_df.copy(deep=True)

In [31]:
# Label-valued columns to one-hot encode, listed in the order they are
# passed to the encoder.
categorical_cols = ['VisitMode', 'Season', 'Continent', 'AttractionType', 'User_Favorite_Type']

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, which becomes the implicit baseline category.
model_df = pd.get_dummies(
    data=model_df,
    columns=categorical_cols,
    drop_first=True,
)

Select Numerical Features

In [32]:
# Columns handed to the scaler. Any numeric column not named here
# (for example Quarter and the *Id columns) keeps its raw values.
numerical_cols = [
    'VisitMonth', 'VisitYear',                          # calendar fields
    'User_Total_Visits', 'User_Avg_Rating',             # per-user aggregates
    'Attraction_Popularity', 'Attraction_Avg_Rating',   # per-attraction aggregates
]

Scaling (Standardization)

In [33]:
from sklearn.preprocessing import StandardScaler

# Standardize the selected numeric columns in place (fit and transform on
# the same rows).
# NOTE(review): the scaler is fitted on every row of model_df. If a
# train/test split happens downstream, fit on the training rows only and
# reuse the fitted scaler, otherwise test-set statistics leak into training.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(model_df[numerical_cols])
model_df[numerical_cols] = scaled_values

Final Model Dataset

In [34]:
# Remove row/user/attraction identifiers, the derived date, and the text
# label columns that were not encoded.
# NOTE(review): ContinentId, RegionId, CountryId, CityId and AttractionCityId
# are not dropped and stay as raw numeric codes - confirm that is intended.
non_feature_cols = [
    'TransactionId', 'UserId', 'AttractionId',
    'VisitDate',
    'CityName', 'Country', 'Region',
    'Attraction', 'AttractionAddress',
]
model_df = model_df.drop(columns=non_feature_cols)

In [37]:
# Preview the encoded, scaled table before saving.
model_df.head(n=5)

Unnamed: 0,VisitYear,VisitMonth,Rating,ContinentId,RegionId,CountryId,CityId,AttractionCityId,Quarter,User_Total_Visits,User_Avg_Rating,Attraction_Popularity,Attraction_Avg_Rating,VisitMode_Couples,VisitMode_Family,VisitMode_Friends,VisitMode_Solo,Season_Spring,Season_Summer,Season_Winter,Continent_America,Continent_Asia,Continent_Australia & Oceania,Continent_Europe,AttractionType_Ballets,AttractionType_Beaches,AttractionType_Caverns & Caves,AttractionType_Flea & Street Markets,AttractionType_Historic Sites,AttractionType_History Museums,AttractionType_National Parks,AttractionType_Nature & Wildlife Areas,AttractionType_Neighborhoods,AttractionType_Points Of Interest & Landmarks,AttractionType_Religious Sites,AttractionType_Spas,AttractionType_Speciality Museums,AttractionType_Volcanos,AttractionType_Water Parks,AttractionType_Waterfalls,User_Favorite_Type_Ballets,User_Favorite_Type_Beaches,User_Favorite_Type_Caverns & Caves,User_Favorite_Type_Flea & Street Markets,User_Favorite_Type_Historic Sites,User_Favorite_Type_History Museums,User_Favorite_Type_National Parks,User_Favorite_Type_Nature & Wildlife Areas,User_Favorite_Type_Neighborhoods,User_Favorite_Type_Points Of Interest & Landmarks,User_Favorite_Type_Religious Sites,User_Favorite_Type_Spas,User_Favorite_Type_Speciality Museums,User_Favorite_Type_Volcanos,User_Favorite_Type_Water Parks,User_Favorite_Type_Waterfalls
0,3.258693,0.992612,5,5.0,21.0,163.0,4341.0,1,4,-0.484049,1.016263,1.613029,0.373286,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
1,3.258693,0.992612,5,2.0,8.0,48.0,464.0,1,4,-0.484049,1.016263,1.613029,0.373286,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2,3.258693,0.992612,5,2.0,9.0,54.0,774.0,1,4,-0.484049,1.016263,1.613029,0.373286,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,3.258693,0.992612,3,5.0,17.0,135.0,583.0,1,4,-0.195395,-1.397088,1.613029,0.373286,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
4,3.258693,0.992612,3,5.0,21.0,163.0,1396.0,1,4,0.09326,-1.397088,1.613029,0.373286,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [38]:
# Final sanity check: table dimensions, then the total number of missing cells.
print(model_df.shape)
missing_cell_count = model_df.isnull().sum().sum()
print(missing_cell_count)

(52890, 56)
0


In [39]:
import os

# Ensure the output directory exists (no error if it is already there).
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)

# Write the model-ready table; index=False keeps the row index out of the file.
model_ready_path = "../data/processed/model_ready.csv"
model_df.to_csv(model_ready_path, index=False)

print("Saved successfully!")

Saved successfully!
