# **Import Packages**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# **Data Extraction from Excel**

In [2]:
transactions = pd.read_excel("Transaction.xlsx")
users = pd.read_excel("User.xlsx")
cities = pd.read_excel("City.xlsx")
types = pd.read_excel("Type.xlsx")
visit_modes = pd.read_excel("Mode.xlsx")
continents = pd.read_excel("Continent.xlsx")
countries = pd.read_excel("Country.xlsx")
regions = pd.read_excel("Region.xlsx")
items = pd.read_excel("Item.xlsx")

# **Data Combining**

In [3]:
df = transactions.merge(users, on="UserId", how="left")

In [4]:
df = df.merge(items[["Attraction","AttractionId", "AttractionTypeId",]], on="AttractionId", how="left")
df

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,Attraction,AttractionTypeId
0,3,70456,2022,10,2,640,5,5,21,163,4341.0,Sacred Monkey Forest Sanctuary,63
1,8,7567,2022,10,4,640,5,2,8,48,464.0,Sacred Monkey Forest Sanctuary,63
2,9,79069,2022,10,3,640,5,2,9,54,774.0,Sacred Monkey Forest Sanctuary,63
3,10,31019,2022,10,3,640,3,5,17,135,583.0,Sacred Monkey Forest Sanctuary,63
4,15,43611,2022,10,2,640,3,5,21,163,1396.0,Sacred Monkey Forest Sanctuary,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52925,211227,87100,2018,9,2,1297,4,5,21,159,7460.0,Yogyakarta Palace,44
52926,211238,88112,2016,2,2,1297,5,5,17,133,6164.0,Yogyakarta Palace,44
52927,211239,88112,2016,2,2,1297,4,5,17,133,6164.0,Yogyakarta Palace,44
52928,211240,88112,2016,2,2,1297,4,5,17,133,6164.0,Yogyakarta Palace,44


In [5]:
df["Visit_YearMonth"] = df["VisitYear"] * 100 + df["VisitMonth"]

In [6]:
df.isnull().sum()

TransactionId       0
UserId              0
VisitYear           0
VisitMonth          0
VisitModeId         0
AttractionId        0
Rating              0
ContinentId         0
RegionId            0
CountryId           0
CityId              8
Attraction          0
AttractionTypeId    0
Visit_YearMonth     0
dtype: int64

In [7]:
df.head()

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,Attraction,AttractionTypeId,Visit_YearMonth
0,3,70456,2022,10,2,640,5,5,21,163,4341.0,Sacred Monkey Forest Sanctuary,63,202210
1,8,7567,2022,10,4,640,5,2,8,48,464.0,Sacred Monkey Forest Sanctuary,63,202210
2,9,79069,2022,10,3,640,5,2,9,54,774.0,Sacred Monkey Forest Sanctuary,63,202210
3,10,31019,2022,10,3,640,3,5,17,135,583.0,Sacred Monkey Forest Sanctuary,63,202210
4,15,43611,2022,10,2,640,3,5,21,163,1396.0,Sacred Monkey Forest Sanctuary,63,202210


In [8]:
df = df.merge(continents, on="ContinentId", how="left")

In [9]:
df = df.merge(regions[['RegionId', 'Region']], on='RegionId', how='left')

In [10]:
df = df.merge(countries[['CountryId', 'Country']], on='CountryId', how='left')

In [11]:
df = df.merge(cities[['CityId', 'City']], on='CityId', how='left').drop_duplicates()

In [12]:
df = df.merge(types, on="AttractionTypeId", how="left")

In [13]:
df = df.merge(visit_modes, on="VisitModeId", how="left")

In [14]:
df.head()

Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,VisitModeId,AttractionId,Rating,ContinentId,RegionId,CountryId,CityId,Attraction,AttractionTypeId,Visit_YearMonth,Continent,Region,Country,City,AttractionType,VisitMode
0,3,70456,2022,10,2,640,5,5,21,163,4341.0,Sacred Monkey Forest Sanctuary,63,202210,Europe,Western Europe,United Kingdom,Guildford,Nature & Wildlife Areas,Couples
1,8,7567,2022,10,4,640,5,2,8,48,464.0,Sacred Monkey Forest Sanctuary,63,202210,America,Northern America,Canada,Ontario,Nature & Wildlife Areas,Friends
2,9,79069,2022,10,3,640,5,2,9,54,774.0,Sacred Monkey Forest Sanctuary,63,202210,America,South America,Brazil,Brazil,Nature & Wildlife Areas,Family
3,10,31019,2022,10,3,640,3,5,17,135,583.0,Sacred Monkey Forest Sanctuary,63,202210,Europe,Central Europe,Switzerland,Zurich,Nature & Wildlife Areas,Family
4,15,43611,2022,10,2,640,3,5,21,163,1396.0,Sacred Monkey Forest Sanctuary,63,202210,Europe,Western Europe,United Kingdom,Manchester,Nature & Wildlife Areas,Couples


# **Data Preprocessing**

In [15]:
custom_order = ['TransactionId','UserId','ContinentId','Continent','RegionId','Region','CountryId','Country','CityId','City','AttractionId','Attraction','AttractionTypeId','AttractionType','VisitYear','VisitMonth','Visit_YearMonth','VisitModeId','VisitMode','Rating']
df = df[custom_order + [col for col in df.columns if col not in custom_order]].reset_index(drop=True)

In [16]:
df.head()

Unnamed: 0,TransactionId,UserId,ContinentId,Continent,RegionId,Region,CountryId,Country,CityId,City,AttractionId,Attraction,AttractionTypeId,AttractionType,VisitYear,VisitMonth,Visit_YearMonth,VisitModeId,VisitMode,Rating
0,3,70456,5,Europe,21,Western Europe,163,United Kingdom,4341.0,Guildford,640,Sacred Monkey Forest Sanctuary,63,Nature & Wildlife Areas,2022,10,202210,2,Couples,5
1,8,7567,2,America,8,Northern America,48,Canada,464.0,Ontario,640,Sacred Monkey Forest Sanctuary,63,Nature & Wildlife Areas,2022,10,202210,4,Friends,5
2,9,79069,2,America,9,South America,54,Brazil,774.0,Brazil,640,Sacred Monkey Forest Sanctuary,63,Nature & Wildlife Areas,2022,10,202210,3,Family,5
3,10,31019,5,Europe,17,Central Europe,135,Switzerland,583.0,Zurich,640,Sacred Monkey Forest Sanctuary,63,Nature & Wildlife Areas,2022,10,202210,3,Family,3
4,15,43611,5,Europe,21,Western Europe,163,United Kingdom,1396.0,Manchester,640,Sacred Monkey Forest Sanctuary,63,Nature & Wildlife Areas,2022,10,202210,2,Couples,3


In [17]:
mode_pair = (
    df.groupby('Country')[['City', 'CityId']]
    .apply(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.Series([None, None], index=['City', 'CityId']))
)
df = df.merge(mode_pair, on='Country', suffixes=('', '_Mode'))

df['City'] = df['City'].fillna(df['City_Mode'])
df['CityId'] = df['CityId'].fillna(df['CityId_Mode'])

df.drop(columns=['City_Mode', 'CityId_Mode'], inplace=True)


In [18]:
df['Region'] = df['Region'].fillna('Undefined')

In [19]:
df.isnull().sum()

TransactionId       0
UserId              0
ContinentId         0
Continent           0
RegionId            0
Region              0
CountryId           0
Country             0
CityId              0
City                0
AttractionId        0
Attraction          0
AttractionTypeId    0
AttractionType      0
VisitYear           0
VisitMonth          0
Visit_YearMonth     0
VisitModeId         0
VisitMode           0
Rating              0
dtype: int64

# **Feature Engineering**

In [20]:
user_stats = df.groupby('UserId')['Rating'].agg(['mean', 'count']).rename(columns={
    'mean': 'user_avg_rating',
    'count': 'user_rating_count'
})
df = df.merge(user_stats, on='UserId', how='left')

attraction_stats = df.groupby('AttractionId')['Rating'].agg(['mean', 'count']).rename(columns={
    'mean': 'attraction_avg_rating',
    'count': 'attraction_visit_count'
})
df = df.merge(attraction_stats, on='AttractionId', how='left')

city_stats = df.groupby('CityId')['Rating'].agg(['mean', 'count']).rename(columns={
    'mean': 'city_avg_rating',
    'count': 'city_visit_count'
})
df = df.merge(city_stats, on='CityId', how='left')


In [21]:
# Total visits per user
user_visits = df.groupby('UserId')['TransactionId'].count().rename('user_total_visits')
df = df.merge(user_visits, on='UserId')

# User preferred attraction type (most frequent)
user_pref_type = df.groupby(['UserId', 'AttractionTypeId']).size().reset_index(name='count')
user_pref_type = user_pref_type.loc[user_pref_type.groupby('UserId')['count'].idxmax()][['UserId', 'AttractionTypeId']]
user_pref_type = user_pref_type.rename(columns={'AttractionTypeId': 'user_preferred_attraction_type'})
df = df.merge(user_pref_type, on='UserId')

In [22]:
# Cyclic encode VisitMonth
df['visit_month_sin'] = np.sin(2 * np.pi * df['VisitMonth']/12)
df['visit_month_cos'] = np.cos(2 * np.pi * df['VisitMonth']/12)
    
# Ratio feature: attraction visits relative to city visits
df['attraction_visit_ratio'] = df['attraction_visit_count'] / df['city_visit_count']

# Engagement ratio
df['month_ratio'] = df['visit_month_sin'] * df['attraction_visit_ratio']
df['month_city'] = df['visit_month_cos'] * df['city_visit_count']
df['rating_ratio'] = df['user_avg_rating'] * df['attraction_avg_rating']

In [23]:
# Sort by available chronological columns
df = df.sort_values(by=["UserId", "VisitYear", "TransactionId"])

# Attraction historical average
df["Attraction_Avg_Rating_Hist"] = (
    df.groupby("AttractionId")["Rating"]
      .transform(lambda x: x.expanding().mean().shift())
)

# User's historical average for this attraction type
df["User_Type_Avg_Rating_Hist"] = (
    df.groupby(["UserId", "AttractionTypeId"])["Rating"]
      .transform(lambda x: x.expanding().mean().shift())
)

# Fill with global average
global_avg = df["Rating"].mean()
df["Attraction_Avg_Rating_Hist"] = df["Attraction_Avg_Rating_Hist"].fillna(global_avg)
df["User_Type_Avg_Rating_Hist"] = df["User_Type_Avg_Rating_Hist"].fillna(global_avg)


In [24]:
df

Unnamed: 0,TransactionId,UserId,ContinentId,Continent,RegionId,Region,CountryId,Country,CityId,City,...,user_total_visits,user_preferred_attraction_type,visit_month_sin,visit_month_cos,attraction_visit_ratio,month_ratio,month_city,rating_ratio,Attraction_Avg_Rating_Hist,User_Type_Avg_Rating_Hist
3820,5661,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.000000e+00,1466.444444,-3.591753e-13,9.000000e+00,19.913068,4.157699,4.157699
31972,67652,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.000000e+00,646.111111,-1.582516e-13,9.000000e+00,19.401777,4.157699,4.157699
32777,68777,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.000000e+00,646.111111,-1.582516e-13,9.000000e+00,19.401777,5.000000,5.000000
8330,12109,16,3,Asia,14,South East Asia,101,Indonesia,3098.0,Jakarta,...,10,63,-2.449294e-16,1.000000e+00,10.231008,-2.505874e-15,1.290000e+03,20.055304,4.000000,4.157699
9702,14015,16,3,Asia,14,South East Asia,101,Indonesia,3098.0,Jakarta,...,10,63,-8.660254e-01,-5.000000e-01,10.231008,-8.860313e+00,-6.450000e+02,20.055304,3.500000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44961,147511,88185,3,Asia,12,Middle East,80,Jordan,2534.0,Amman,...,5,76,5.000000e-01,-8.660254e-01,76.805556,3.840278e+01,-3.117691e+01,14.343797,3.414978,4.157699
29117,49105,88187,3,Asia,12,Middle East,88,Turkey,2604.0,Istanbul,...,1,76,-1.000000e+00,-1.836970e-16,45.391892,-4.539189e+01,-1.359358e-14,21.097053,4.219178,4.157699
12604,18240,88189,5,Europe,17,Central Europe,131,Romania,6129.0,Bucharest,...,2,13,1.224647e-16,-1.000000e+00,131.980000,1.616289e-14,-1.000000e+02,14.934801,4.267106,4.157699
24811,40525,88189,5,Europe,17,Central Europe,131,Romania,6129.0,Bucharest,...,2,13,1.224647e-16,-1.000000e+00,30.440000,3.727825e-15,-1.000000e+02,13.917214,3.976668,4.157699


In [25]:
df = df[~((df["City"] == df["Country"]) & (df["City"] != "Singapore"))]

In [26]:
print("Rows after drop:", len(df))

Rows after drop: 51712


In [27]:
df1=df.copy()
df1.head()

Unnamed: 0,TransactionId,UserId,ContinentId,Continent,RegionId,Region,CountryId,Country,CityId,City,...,user_total_visits,user_preferred_attraction_type,visit_month_sin,visit_month_cos,attraction_visit_ratio,month_ratio,month_city,rating_ratio,Attraction_Avg_Rating_Hist,User_Type_Avg_Rating_Hist
3820,5661,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.0,1466.444444,-3.591753e-13,9.0,19.913068,4.157699,4.157699
31972,67652,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.0,646.111111,-1.582516e-13,9.0,19.401777,4.157699,4.157699
32777,68777,14,5,Europe,20,Southern Europe,155,Portugal,220.0,Lagos,...,3,72,-2.449294e-16,1.0,646.111111,-1.582516e-13,9.0,19.401777,5.0,5.0
8330,12109,16,3,Asia,14,South East Asia,101,Indonesia,3098.0,Jakarta,...,10,63,-2.449294e-16,1.0,10.231008,-2.505874e-15,1290.0,20.055304,4.0,4.157699
9702,14015,16,3,Asia,14,South East Asia,101,Indonesia,3098.0,Jakarta,...,10,63,-0.8660254,-0.5,10.231008,-8.860313,-645.0,20.055304,3.5,3.0


In [28]:
df=df.drop([
    "TransactionId",
    "UserId",
    "Continent",
    "Region",
    "Country",
    "City",
    "Attraction",
    "AttractionType",
    "VisitYear",
    "VisitMonth",
    "VisitMode"
]
, axis=1)
df

Unnamed: 0,ContinentId,RegionId,CountryId,CityId,AttractionId,AttractionTypeId,Visit_YearMonth,VisitModeId,Rating,user_avg_rating,...,user_total_visits,user_preferred_attraction_type,visit_month_sin,visit_month_cos,attraction_visit_ratio,month_ratio,month_city,rating_ratio,Attraction_Avg_Rating_Hist,User_Type_Avg_Rating_Hist
3820,5,20,155,220.0,640,63,201812,4,4,4.666667,...,3,72,-2.449294e-16,1.000000e+00,1466.444444,-3.591753e-13,9.000000e+00,19.913068,4.157699,4.157699
31972,5,20,155,220.0,748,72,201812,4,5,4.666667,...,3,72,-2.449294e-16,1.000000e+00,646.111111,-1.582516e-13,9.000000e+00,19.401777,4.157699,4.157699
32777,5,20,155,220.0,748,72,201812,4,5,4.666667,...,3,72,-2.449294e-16,1.000000e+00,646.111111,-1.582516e-13,9.000000e+00,19.401777,5.000000,5.000000
8330,3,14,101,3098.0,640,63,201512,2,3,4.700000,...,10,63,-2.449294e-16,1.000000e+00,10.231008,-2.505874e-15,1.290000e+03,20.055304,4.000000,4.157699
9702,3,14,101,3098.0,640,63,201508,4,4,4.700000,...,10,63,-8.660254e-01,-5.000000e-01,10.231008,-8.860313e+00,-6.450000e+02,20.055304,3.500000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44961,3,12,80,2534.0,369,13,201605,2,4,4.200000,...,5,76,5.000000e-01,-8.660254e-01,76.805556,3.840278e+01,-3.117691e+01,14.343797,3.414978,4.157699
29117,3,12,88,2604.0,824,76,201609,2,5,5.000000,...,1,76,-1.000000e+00,-1.836970e-16,45.391892,-4.539189e+01,-1.359358e-14,21.097053,4.219178,4.157699
12604,5,17,131,6129.0,640,63,201906,2,4,3.500000,...,2,13,1.224647e-16,-1.000000e+00,131.980000,1.616289e-14,-1.000000e+02,14.934801,4.267106,4.157699
24811,5,17,131,6129.0,650,13,201906,2,3,3.500000,...,2,13,1.224647e-16,-1.000000e+00,30.440000,3.727825e-15,-1.000000e+02,13.917214,3.976668,4.157699


In [29]:
df.describe()

Unnamed: 0,ContinentId,RegionId,CountryId,CityId,AttractionId,Visit_YearMonth,VisitModeId,Rating,user_avg_rating,user_rating_count,...,user_total_visits,user_preferred_attraction_type,visit_month_sin,visit_month_cos,attraction_visit_ratio,month_ratio,month_city,rating_ratio,Attraction_Avg_Rating_Hist,User_Type_Avg_Rating_Hist
count,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,...,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0,51712.0
mean,3.591623,14.531211,106.302986,3389.225112,760.396291,201641.85309,2.945815,4.157894,4.157894,2.686494,...,2.686494,53.440246,-0.04704639,-0.01967124,729.736289,-48.37332,-12.89606,17.350609,4.162395,4.157378
std,1.074418,4.168512,35.705739,2323.6777,211.373871,172.786377,1.000533,0.970866,0.828955,3.49572,...,3.49572,28.999419,0.7059578,0.7064294,1977.6107,1524.692,731.1233,3.888608,0.302619,0.360837
min,1.0,1.0,1.0,1.0,369.0,201301.0,1.0,1.0,1.0,1.0,...,1.0,2.0,-1.0,-1.0,0.010127,-13198.0,-2765.0,3.41519,1.0,1.0
25%,3.0,13.0,93.0,1434.0,640.0,201509.0,2.0,4.0,4.0,1.0,...,1.0,13.0,-0.8660254,-0.8660254,3.947467,-16.62871,-49.36345,15.244253,3.997535,4.157699
50%,4.0,15.0,109.0,3156.0,737.0,201610.0,3.0,4.0,4.058824,2.0,...,2.0,63.0,-2.449294e-16,-1.83697e-16,38.421053,-1.28804e-15,-2.571758e-15,17.102662,4.211765,4.157699
75%,4.0,16.0,115.0,4940.0,841.0,201802.0,4.0,5.0,5.0,3.0,...,3.0,76.0,0.5,0.5,388.176471,7.179181,32.25945,20.787618,4.281366,4.157699
max,5.0,21.0,164.0,9142.0,1297.0,202210.0,5.0,5.0,5.0,59.0,...,59.0,93.0,1.0,1.0,13198.0,13198.0,2765.0,23.493976,5.0,5.0


In [30]:
df.isnull().sum()

ContinentId                       0
RegionId                          0
CountryId                         0
CityId                            0
AttractionId                      0
AttractionTypeId                  0
Visit_YearMonth                   0
VisitModeId                       0
Rating                            0
user_avg_rating                   0
user_rating_count                 0
attraction_avg_rating             0
attraction_visit_count            0
city_avg_rating                   0
city_visit_count                  0
user_total_visits                 0
user_preferred_attraction_type    0
visit_month_sin                   0
visit_month_cos                   0
attraction_visit_ratio            0
month_ratio                       0
month_city                        0
rating_ratio                      0
Attraction_Avg_Rating_Hist        0
User_Type_Avg_Rating_Hist         0
dtype: int64

In [31]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category').cat.codes

In [32]:
df.dtypes

ContinentId                         int64
RegionId                            int64
CountryId                           int64
CityId                            float64
AttractionId                        int64
AttractionTypeId                     int8
Visit_YearMonth                     int64
VisitModeId                         int64
Rating                              int64
user_avg_rating                   float64
user_rating_count                   int64
attraction_avg_rating             float64
attraction_visit_count              int64
city_avg_rating                   float64
city_visit_count                    int64
user_total_visits                   int64
user_preferred_attraction_type      int64
visit_month_sin                   float64
visit_month_cos                   float64
attraction_visit_ratio            float64
month_ratio                       float64
month_city                        float64
rating_ratio                      float64
Attraction_Avg_Rating_Hist        

# **MODEL TRAINING - Regression**

## **Linear Regression Model**

In [33]:
from sklearn.model_selection import train_test_split
import joblib

In [34]:
X = df.drop(columns=["Rating"])
y = df["Rating"]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
model = LinearRegression()

In [38]:
model.fit(X_train, y_train)

In [39]:
y_pred = model.predict(X_test)

In [40]:
y_pred_series = pd.Series(y_pred, index=y_test.index)
comparison = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_series
})

print(comparison.head())


       Actual  Predicted
33981       5   4.968065
3631        4   4.029312
824         4   4.056947
3997        5   5.018217
38622       4   3.863467


In [41]:
from sklearn.metrics import r2_score,mean_squared_error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.3f}")
print(f"R^2 Score: {r2:.2f}")

RMSE: 0.495
R^2 Score: 0.75


## **Random Forest Model**

In [42]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
model1 = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

In [44]:
model1.fit(X_train, y_train)

In [45]:
y1_pred = model1.predict(X_test)

In [46]:
y1_pred_series = pd.Series(y1_pred, index=y_test.index)
comparison = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_series
})

print(comparison.head())


       Actual  Predicted
33981       5   4.968065
3631        4   4.029312
824         4   4.056947
3997        5   5.018217
38622       4   3.863467


In [47]:
joblib.dump(model1, 'regressor.pkl')

['regressor.pkl']

## **MODEL EVALUATION**

In [48]:
from sklearn.metrics import r2_score,mean_squared_error
mse = mean_squared_error(y_test, y1_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y1_pred)
print(f"RMSE: {rmse:.3f}")
print(f"R^2 Score: {r2:.2f}")

RMSE: 0.492
R^2 Score: 0.75


### **Feature Importance**

In [49]:
importances = model1.feature_importances_
features = X.columns

# Create a DataFrame for visualization
feat_imp_df = pd.DataFrame({"Feature": features, "Importance": importances})
feat_imp_df = feat_imp_df.sort_values(by="Importance", ascending=False)
feat_imp_df

Unnamed: 0,Feature,Importance
21,rating_ratio,0.575855
8,user_avg_rating,0.179293
22,Attraction_Avg_Rating_Hist,0.035484
19,month_ratio,0.021754
6,Visit_YearMonth,0.02172
23,User_Type_Avg_Rating_Hist,0.019249
20,month_city,0.019227
3,CityId,0.016222
12,city_avg_rating,0.01564
18,attraction_visit_ratio,0.01335


## **XG Boost Model**

In [50]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    n_estimators=200,   # start with 200 trees
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

In [51]:
xgb_model.fit(X_train, y_train)

In [52]:
y_pred_xgb = xgb_model.predict(X_test)

## **MODEL EVALUATION**

In [53]:
from sklearn.metrics import r2_score,mean_squared_error
mse = mean_squared_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_xgb)
print(f"RMSE: {rmse:.4f}")
print(f"R^2 Score: {r2:.2f}")

RMSE: 0.4741
R^2 Score: 0.77


### **Feature Importance**

In [54]:
importances = xgb_model.feature_importances_
features = X.columns

# Create a DataFrame for visualization
feat_imp_df = pd.DataFrame({"Feature": features, "Importance": importances})
feat_imp_df = feat_imp_df.sort_values(by="Importance", ascending=False)
feat_imp_df

Unnamed: 0,Feature,Importance
21,rating_ratio,0.673192
8,user_avg_rating,0.167495
15,user_preferred_attraction_type,0.015143
23,User_Type_Avg_Rating_Hist,0.012987
11,attraction_visit_count,0.011721
10,attraction_avg_rating,0.011171
9,user_rating_count,0.010412
5,AttractionTypeId,0.008165
19,month_ratio,0.007656
4,AttractionId,0.007519


# **MODEL TRAINING-Classification**

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X1 = df.drop(columns=['VisitModeId', 'Rating'])
y1 = df['VisitModeId']

In [57]:
from imblearn.over_sampling import SMOTE

In [58]:
smote = SMOTE(random_state=42)
X1_res, y1_res = smote.fit_resample(X1, y1)

print("Class distribution before SMOTE:")
print(y.value_counts())
print("\nClass distribution after SMOTE:")
print(pd.Series(y1_res).value_counts())

Class distribution before SMOTE:
Rating
5    23402
4    17534
3     7550
2     1991
1     1235
Name: count, dtype: int64

Class distribution after SMOTE:
VisitModeId
4    21089
2    21089
3    21089
5    21089
1    21089
Name: count, dtype: int64


In [59]:
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1_res, y1_res, test_size=0.25, random_state=42, stratify=y1_res)

## **Logistics Regression Model**

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
model2 = LogisticRegression(solver='saga', max_iter=10000)
model2.fit(X1_train, y1_train)

In [62]:
y2_pred = model2.predict(X1_test)

In [63]:
y2_pred_series = pd.Series(y2_pred, index=y1_test.index)
comparison = pd.DataFrame({
    'Actual': y1_test,
    'Predicted': y2_pred_series
})

print(comparison.tail())


       Actual  Predicted
84315       4          5
2962        3          2
2632        5          3
26001       4          1
39034       2          3


In [64]:
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score,recall_score
cm=confusion_matrix(y1_test,y2_pred)
cm

array([[3354, 1013,  410,   41,  455],
       [1356, 1980, 1020,   74,  842],
       [1298, 1292, 1859,  131,  692],
       [1988, 1394, 1085,   74,  732],
       [1997, 1559,  691,   41,  984]])

In [65]:
print(accuracy_score(y1_test,y2_pred))

0.3129883923829755


## **Random Forest Classifier Model**

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [67]:
model3 = RandomForestClassifier(
    n_estimators=700,                
    max_depth=25,                   
    min_samples_split=5,             
    min_samples_leaf=1,              
    max_features=0.5,                
    class_weight='balanced_subsample',  # Weights computed per bootstrap sample
    random_state=42,                 
    n_jobs=-1                       
)

In [68]:
model3.fit(X1_train, y1_train)

In [69]:
y3_pred = model3.predict(X1_test)

In [70]:
y3_pred_series = pd.Series(y3_pred, index=y1_test.index)

# Now you can do:
comparison = pd.DataFrame({
    'Actual': y1_test,
    'Predicted': y3_pred_series
})

print(comparison.tail())


       Actual  Predicted
84315       4          4
2962        3          3
2632        5          2
26001       4          4
39034       2          2


In [71]:
joblib.dump(model1, 'classification.pkl')

['classification.pkl']

## **MODEL EVALUATION**

In [73]:
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score,recall_score
cm=confusion_matrix(y1_test,y3_pred)

In [74]:
cm

array([[5204,   20,   14,   17,   18],
       [  36, 3822,  770,  492,  152],
       [  86, 1195, 3096,  523,  372],
       [ 122, 1008,  545, 2987,  611],
       [ 100,  399,  157,  245, 4371]])

In [75]:
print(accuracy_score(y1_test,y3_pred))

0.7389424171155451


In [76]:
from sklearn.metrics import precision_score

# For multiclass precision
print("Precision (macro):", precision_score(y1_test, y3_pred, average='macro'))
print("Precision (micro):", precision_score(y1_test, y3_pred, average='micro'))
print("Precision (weighted):", precision_score(y1_test, y3_pred, average='weighted'))


Precision (macro): 0.739716680552388
Precision (micro): 0.7389424171155451
Precision (weighted): 0.7397227149299191


In [77]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Calculate precision, recall, and F1-score with different averaging methods
precision_macro = precision_score(y1_test, y3_pred, average='macro')
recall_macro = recall_score(y1_test, y3_pred, average='macro')
f1_macro = f1_score(y1_test, y3_pred, average='macro')

print(f"Precision (macro-average): {precision_macro:.4f}")
print(f"Recall (macro-average):    {recall_macro:.4f}")
print(f"F1-score (macro-average):  {f1_macro:.4f}")

# Weighted average (accounts for class imbalance)
precision_weighted = precision_score(y1_test, y3_pred, average='weighted')
recall_weighted = recall_score(y1_test, y3_pred, average='weighted')
f1_weighted = f1_score(y1_test, y3_pred, average='weighted')

print(f"\nPrecision (weighted-average): {precision_weighted:.4f}")
print(f"Recall (weighted-average):    {recall_weighted:.4f}")
print(f"F1-score (weighted-average):  {f1_weighted:.4f}")

# Full classification report (precision, recall, f1-score per class)
print("\nClassification Report:\n")
print(classification_report(y1_test, y3_pred))


Precision (macro-average): 0.7397
Recall (macro-average):    0.7389
F1-score (macro-average):  0.7358

Precision (weighted-average): 0.7397
Recall (weighted-average):    0.7389
F1-score (weighted-average):  0.7358

Classification Report:

              precision    recall  f1-score   support

           1       0.94      0.99      0.96      5273
           2       0.59      0.72      0.65      5272
           3       0.68      0.59      0.63      5272
           4       0.70      0.57      0.63      5273
           5       0.79      0.83      0.81      5272

    accuracy                           0.74     26362
   macro avg       0.74      0.74      0.74     26362
weighted avg       0.74      0.74      0.74     26362



In [78]:
from xgboost import XGBClassifier

In [79]:
X2 = df.drop(columns=['VisitModeId', 'Rating'])
y2 = df['VisitModeId'] - 1 

In [80]:
smote = SMOTE(random_state=42)
X2_res, y2_res = smote.fit_resample(X2, y2)

print("Class distribution before SMOTE:")
print(y2.value_counts())
print("\nClass distribution after SMOTE:")
print(pd.Series(y2_res).value_counts())

Class distribution before SMOTE:
VisitModeId
1    21089
2    14899
3    10689
4     4417
0      618
Name: count, dtype: int64

Class distribution after SMOTE:
VisitModeId
3    21089
1    21089
2    21089
4    21089
0    21089
Name: count, dtype: int64


In [81]:
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_res, y2_res, test_size=0.25, random_state=42, stratify=y2_res
)

In [82]:
xgbmodel2 = XGBClassifier(
    n_estimators=700,
    max_depth=25,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1
)

In [83]:
xgbmodel2.fit(X2_train, y2_train)

In [85]:
y4_pred = xgbmodel2.predict(X2_test) + 1
y2_test = y2_test + 1

In [86]:
print(classification_report(y2_test, y4_pred))

              precision    recall  f1-score   support

           1       0.97      0.99      0.98      5273
           2       0.60      0.72      0.66      5272
           3       0.69      0.62      0.66      5272
           4       0.73      0.64      0.69      5273
           5       0.85      0.85      0.85      5272

    accuracy                           0.77     26362
   macro avg       0.77      0.77      0.77     26362
weighted avg       0.77      0.77      0.77     26362



# **Recommendation Task**

In [93]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [94]:
df1 = df1[['UserId', 'AttractionId', 'Attraction', 'AttractionType', 'City', 'Rating']].dropna()
df1['UserId'] = df1['UserId'].astype(int)
df1['AttractionId'] = df1['AttractionId'].astype(int)

In [95]:
cf_matrix = df1.pivot_table(index='UserId', columns='AttractionId', values='Rating', aggfunc='mean').fillna(0)

In [96]:
sparse_cf = csr_matrix(cf_matrix.values)

In [97]:
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(sparse_cf)

In [98]:
def recommend_cf(user_id, top_n=5):
    if user_id not in cf_matrix.index:
        return f"❌ User {user_id} not found in Collaborative Filtering data"
    
    # Get the index of the user in cf_matrix
    user_idx = cf_matrix.index.get_loc(user_id)
    
    # Find nearest neighbors
    distances, indices = model.kneighbors(sparse_cf[user_idx], n_neighbors=top_n+1)  # +1 because first neighbor is the user itself
    
    neighbors = cf_matrix.index[indices.flatten()[1:]]  # exclude the user itself
    sim_scores = 1 - distances.flatten()[1:]           # cosine similarity = 1 - distance
    
    # Weighted sum of ratings from neighbors
    weighted_scores = cf_matrix.loc[neighbors].T.dot(sim_scores) / sim_scores.sum()
    
    # Remove items already rated by the user
    already_rated = cf_matrix.loc[user_id][cf_matrix.loc[user_id] > 0].index
    recommendations = weighted_scores.drop(already_rated, errors='ignore').sort_values(ascending=False).head(top_n)
    
    return recommendations

In [99]:
def display_recommendations(series, df1):
    if isinstance(series, str):
        return series
    rec_df = series.reset_index()
    rec_df.columns = ['AttractionId', 'Score']
    merged = rec_df.merge(df1[['AttractionId', 'Attraction']].drop_duplicates(), on='AttractionId', how='left')
    result = merged[['Attraction', 'Score']].sort_values(by='Score', ascending=False).reset_index(drop=True)
    result.index += 1
    return result

In [100]:
user_input = input("🔢 Enter User ID: ")

try:
    user_id = int(user_input.strip())
    print(f"User ID entered: {user_id}")

    if user_id not in df1['UserId'].values:
        print(f"❌ User ID {user_id} not found in dataset.")
    else:
        print(f"Is user in CF matrix index? {user_id in cf_matrix.index}\n")

        try:
            print("📌 Collaborative Filtering Recommendations:")
            print(display_recommendations(recommend_cf(user_id), df1).to_string(index=True, index_names=False))

        except Exception as e:
            print(f"❌ An error occurred during recommendation: {e}")

except ValueError:
    print("❌ Please enter a valid numeric User ID.")

🔢 Enter User ID:  70456


User ID entered: 70456
Is user in CF matrix index? True

📌 Collaborative Filtering Recommendations:
          Attraction  Score
1  Kuta Beach - Bali    0.0
2     Nusa Dua Beach    0.0
3        Sanur Beach    0.0
4     Seminyak Beach    0.0
5   Tanah Lot Temple    0.0


In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [102]:
df1_unique = df1.drop_duplicates(subset='AttractionId').reset_index(drop=True)

In [103]:
df1_unique['combined_features'] = (
    df1_unique['AttractionType'].fillna('') + ' ' +
    df1_unique['City'].fillna('') + ' ' +
    df1_unique['Attraction'].fillna('')
)

In [104]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df1_unique['combined_features'])

In [105]:
from sklearn.neighbors import NearestNeighbors
nn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
nn_model.fit(tfidf_matrix)

In [106]:
def recommend_content(user_id, top_n=5):
    user_data = df1[df1['UserId'] == user_id]
    if user_data.empty:
        return "No data for this user."

    high_rated = user_data[user_data['Rating'] >= 4]
    if high_rated.empty:
        return "No high-rated attractions found for this user."

    visited_ids = set(user_data['AttractionId'])
    recommended = []

    for _, row in high_rated.iterrows():
        matches = df1_unique[df1_unique['AttractionId'] == row['AttractionId']]
        if matches.empty:
            continue
        idx = matches.index[0]

        distances, indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=top_n + 10)

        for i, dist in zip(indices[0], distances[0]):
            rec_row = df1_unique.iloc[i]
            aid = rec_row['AttractionId']

            if aid not in visited_ids and aid not in [r['AttractionId'] for r in recommended]:
                recommended.append({
                    'AttractionId': aid,
                    'Attraction': rec_row['Attraction'],
                    'AttractionType': rec_row['AttractionType'],
                    'City': rec_row['City'],
                    'SimilarityScore': round(1 - dist, 3)
                })

            if len(recommended) >= top_n:
                break
        if len(recommended) >= top_n:
            break

    recommendations_df = pd.DataFrame(recommended)
    if recommendations_df.empty:
        return "No recommendations found."

    recommendations_df.insert(0, 'Rank', range(1, len(recommendations_df) + 1))
    return recommendations_df


In [107]:
user_id = int(input("Enter User ID: "))
top_n = int(input("How many recommendations do you want? "))

# Generate recommendations
recommendations = recommend_content(user_id, top_n=top_n)

# Display output
if isinstance(recommendations, str):
    print(recommendations)  # In case of "No data" or "No high-rated attractions"
else:
    print(recommendations.to_string(index=False))


Enter User ID:  70456
How many recommendations do you want?  5


 Rank  AttractionId                         Attraction                 AttractionType     City  SimilarityScore
    1           975                       Sempu Island        Nature & Wildlife Areas Surabaya            0.384
    2           748            Tegalalang Rice Terrace Points of Interest & Landmarks    Lagos            0.131
    3           841                      Waterbom Bali                    Water Parks  Jakarta            0.000
    4           824                     Uluwatu Temple                Religious Sites  Jakarta            0.000
    5           888 Bromo Tengger Semeru National Park                 National Parks  Jakarta            0.000


In [108]:
def recommend_hybrid(user_id, top_n=5):
    # Get results from collaborative and content-based filtering
    cf_res = recommend_cf(user_id, top_n * 2)  # Get more to allow some overlap
    cb_res = recommend_content(user_id, top_n * 2)

    # Return error message if either is invalid (string)
    if isinstance(cf_res, str) or isinstance(cb_res, str):
        return "❌ Hybrid not possible due to missing data."

    # Convert content-based DataFrame to Series like CF
    cb_series = cb_res.set_index('AttractionId')['SimilarityScore']

    # Combine by summing scores, fill missing with 0
    combined = cf_res.add(cb_series, fill_value=0)

    # Return top-N
    return combined.sort_values(ascending=False).head(top_n)


In [109]:
def display_hybrid_recommendations(series, df1):
    if isinstance(series, str):
        return series

    rec_df = series.reset_index()
    rec_df.columns = ['AttractionId', 'Score']
    merged = rec_df.merge(df1[['AttractionId', 'Attraction']].drop_duplicates(), on='AttractionId', how='left')
    result = merged[['Attraction', 'Score']].sort_values(by='Score', ascending=False).reset_index(drop=True)
    result.index += 1
    return result

In [110]:
user_id = int(input("Enter User ID: "))
top_n = int(input("How many hybrid recommendations? "))

hybrid_recs = recommend_hybrid(user_id, top_n)

print("\n📌 Hybrid Recommendations:")
print(display_hybrid_recommendations(hybrid_recs, df1))

Enter User ID:  70456
How many hybrid recommendations?  5



📌 Hybrid Recommendations:
                Attraction  Score
1             Sempu Island  0.384
2  Tegalalang Rice Terrace  0.131
3        Kuta Beach - Bali  0.000
4           Nusa Dua Beach  0.000
5           Seminyak Beach  0.000
