<a href="https://colab.research.google.com/github/Bhavadharani275/Mini_Project_4/blob/main/Tourism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the data

In [None]:
import pandas as pd

# Folder that holds the source workbooks; change this one constant to relocate the data.
DATA_DIR = "/content/drive/MyDrive"

# Load all 9 Excel files
transactions = pd.read_excel(f"{DATA_DIR}/Transaction.xlsx")
item = pd.read_excel(f"{DATA_DIR}/Item.xlsx")
users = pd.read_excel(f"{DATA_DIR}/User.xlsx")
mode = pd.read_excel(f"{DATA_DIR}/Mode.xlsx")
continent = pd.read_excel(f"{DATA_DIR}/Continent.xlsx")
region = pd.read_excel(f"{DATA_DIR}/Region.xlsx")
country = pd.read_excel(f"{DATA_DIR}/Country.xlsx")
city = pd.read_excel(f"{DATA_DIR}/City.xlsx")
# NOTE(review): Type is loaded but never merged below, so only 8 of the 9 tables
# actually reach the master file - confirm whether the attraction type name is wanted.
Type = pd.read_excel(f"{DATA_DIR}/Type.xlsx")

# Rename the transaction column so it can be joined with the mode table on 'VisitModeId'
transactions = transactions.rename(columns={'VisitMode': 'VisitModeId'})

# Merge step-by-step (left joins keep every transaction row)

# Join transactions with users
merged_df = transactions.merge(users, on='UserId', how='left')

# Join with mode
merged_df = merged_df.merge(mode, on='VisitModeId', how='left')

# Join with item
merged_df = merged_df.merge(item, on='AttractionId', how='left')

# Join with city
merged_df = merged_df.merge(city[['CityId', 'CityName']], on='CityId', how='left')

# Join with country
merged_df = merged_df.merge(country[['CountryId', 'Country']], on='CountryId', how='left')

# Join with region
merged_df = merged_df.merge(region[['RegionId', 'Region']], on='RegionId', how='left')

# Join with continent
merged_df = merged_df.merge(continent[['ContinentId', 'Continent']], on='ContinentId', how='left')

# Sort the merged data by RegionId (ascending) and renumber the rows
merged_df = merged_df.sort_values(by='RegionId', ascending=True).reset_index(drop=True)

# Save final dataset
merged_df.to_excel("Tourism_Master_Data.xlsx", index=False)

print("All 9 Excel files combined into one master file successfully!")


In [None]:
merged_df.head(5)

In [None]:
merged_df.shape

# Cleaning process

In [None]:
# Rows that exactly repeat an earlier row
is_repeat = merged_df.duplicated()
duplicates_shows = merged_df[is_repeat]
print(duplicates_shows)

In [None]:
merged_df.dtypes

In [None]:
merged_df.info()

In [None]:
# Count the missing values in every column
missing_value_counts = merged_df.isna().sum()
missing_value_counts

In [None]:
# Preview the rows whose CityName is missing
city_missing = merged_df['CityName'].isna()
show = merged_df[city_missing]
show.head(10)

In [None]:
# Keep only the rows that have a CityName
merged_df = merged_df[merged_df['CityName'].notna()]

In [None]:
merged_df.info()

In [None]:
merged_df['CityName'].unique()

In [None]:
merged_df['CityName'].value_counts()

In [None]:
merged_df['Region'].unique()

In [None]:
# Discard the rows whose Region is the '-' placeholder
merged_df = merged_df.loc[merged_df['Region'].ne('-')]

In [None]:
merged_df['Region'].unique()

In [None]:
merged_df['Rating'].unique()

In [None]:
# Lowest and highest rating present
rating_values = merged_df['Rating']
print(rating_values.min(), rating_values.max())

# Distinct visit modes present
visit_modes = merged_df['VisitMode'].unique()
print(visit_modes)


# Outlier detection

In [None]:
# Quartiles of the rating column and the spread between them
Q1, Q3 = merged_df['Rating'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Tukey fences: 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows whose rating falls outside the fences
too_low = merged_df['Rating'] < lower_bound
too_high = merged_df['Rating'] > upper_bound
outliers = merged_df[too_low | too_high]
print(outliers)


In [None]:
# Outlier Detection in Ratings
import plotly.express as px
import plotly.graph_objects as go
# Single box plot of all ratings
fig = px.box(data_frame=merged_df, y='Rating', title='Outlier Detection in Ratings')
fig.show()


In [None]:
# Rating spread per visit mode, one coloured box per mode
mode_box_options = dict(
    x='VisitMode',
    y='Rating',
    color='VisitMode',
    title='Outlier Detection by Visit Mode',
)
fig = px.box(merged_df, **mode_box_options)
fig.show()


In [None]:
# Rating spread per continent, one coloured box per continent
continent_box_options = dict(
    x='Continent',
    y='Rating',
    color='Continent',
    title='Rating Outliers Across Continents',
)
fig = px.box(merged_df, **continent_box_options)
fig.show()


# Aggregate user-level features

In [None]:
# Mean rating given by each user (Series indexed by UserId)
ratings_by_user = merged_df.groupby('UserId')['Rating']
user_avg_rating = ratings_by_user.mean()
user_avg_rating

In [None]:
# Average rating per user, as a two-column frame (UserId, AvgRating)
user_avg_rating = (
    merged_df.groupby('UserId')['Rating']
    .mean()
    .reset_index(name='AvgRating')
)

In [None]:
# Number of transactions recorded for each user
user_total_visits = (
    merged_df.groupby('UserId')['TransactionId']
    .count()
    .reset_index(name='TotalVisits')
)

In [None]:
# Number of distinct attractions each user visited
user_unique_attractions = (
    merged_df.groupby('UserId')['AttractionId']
    .nunique()
    .reset_index(name='UniqueAttractions')
)

In [None]:
# Average rating per visit mode: one row per user, one 'AvgRating_<mode>' column per mode
user_mode_avg = (
    merged_df.pivot_table(index='UserId', columns='VisitMode', values='Rating', aggfunc='mean')
    .add_prefix('AvgRating_')
    .rename_axis(columns=None)  # drop the leftover 'VisitMode' label on the column index
    .reset_index()
)

In [None]:
# merge all these aggregations into a single dataframe:
from functools import reduce

# Left-join the per-user tables one after another, starting from the average rating
profile_parts = [user_avg_rating, user_total_visits, user_unique_attractions, user_mode_avg]
user_profiles = profile_parts[0]
for part in profile_parts[1:]:
    user_profiles = pd.merge(user_profiles, part, on='UserId', how='left')

In [None]:
# Preferred visit mode = the mode each user used most often
mode_counts = (
    merged_df.groupby(['UserId', 'VisitMode']).size()
    .reset_index(name='VisitCount')
)
# Highest count first within each user, then keep that first row
mode_ranked = mode_counts.sort_values(['UserId', 'VisitCount'], ascending=[True, False])
preferred_mode = (
    mode_ranked.drop_duplicates('UserId')
    .rename(columns={'VisitMode': 'PreferredVisitMode'})
)

# Attach it to the user profile table
user_profiles = pd.merge(
    user_profiles,
    preferred_mode[['UserId', 'PreferredVisitMode']],
    on='UserId',
    how='left',
)


In [None]:
user_profiles

In [None]:
# Mean rating for every (user, city) pair
user_city_avg = (
    merged_df.groupby(['UserId', 'CityName'])['Rating']
    .mean()
    .reset_index(name='AvgRating_City')
)


In [None]:
# Mean rating for every (user, attraction type) pair
user_type_avg = (
    merged_df.groupby(['UserId', 'AttractionTypeId'])['Rating']
    .mean()
    .reset_index(name='AvgRating_AttractionTypeId')
)


In [None]:
# How many distinct cities and attraction types appear for each user
visits_by_user = merged_df.groupby('UserId')

user_city_count = visits_by_user['CityName'].nunique().reset_index(name='TotalCitiesVisited')

user_type_count = (
    visits_by_user['AttractionTypeId'].nunique()
    .reset_index(name='TotalAttractionTypesVisited')
)


In [None]:
# Most visited city and attraction type per user
def _most_frequent_per_user(frame, category_col, label):
    """Return one row per user holding the `category_col` value that occurs most often.

    The category column is renamed to `label`; the winning count is kept in 'VisitCount'.
    """
    counts = frame.groupby(['UserId', category_col]).size().reset_index(name='VisitCount')
    ranked = counts.sort_values(['UserId', 'VisitCount'], ascending=[True, False])
    return ranked.drop_duplicates('UserId').rename(columns={category_col: label})


most_city = _most_frequent_per_user(merged_df, 'CityName', 'MostVisitedCity')

most_type = _most_frequent_per_user(merged_df, 'AttractionTypeId', 'MostVisitedAttractionTypeId')


In [None]:
# Combine All User-Level Features
from functools import reduce

# Left-join the city/type tables on UserId, starting from the city count
extended_parts = [
    user_city_count,
    user_type_count,
    most_city[['UserId', 'MostVisitedCity']],
    most_type[['UserId', 'MostVisitedAttractionTypeId']],
]
user_profile_extended = extended_parts[0]
for part in extended_parts[1:]:
    user_profile_extended = pd.merge(user_profile_extended, part, on='UserId', how='left')


In [None]:
user_profile_extended

In [None]:
# Full per-user table: base profile plus the city/type extension
avg_df = user_profiles.merge(user_profile_extended, on='UserId', how='left')
avg_df.head()

# EDA

## Visualize User Distribution Across Continents, Countries, and Regions

In [None]:
import plotly.express as px

# Rows per continent (each row of merged_df is one transaction)
continent_count = (
    merged_df['Continent'].value_counts()
    .rename_axis('Continent')
    .reset_index(name='UserCount')
)

continent_bar_options = dict(
    x='Continent',
    y='UserCount',
    title='User Distribution Across Continents',
    color='UserCount',
    text='UserCount',
)
fig = px.bar(continent_count, **continent_bar_options)
fig.show()


In [None]:
# Rows per country, most frequent first; only the first 15 are plotted
country_count = (
    merged_df['Country'].value_counts()
    .rename_axis('Country')
    .reset_index(name='UserCount')
)

top_countries = country_count.head(15)
fig = px.bar(
    top_countries,
    x='Country',
    y='UserCount',
    color='UserCount',
    text='UserCount',
    title='Top 15 Countries by User Visits',
)
fig.show()


In [None]:
# Rows per region, most frequent first; only the first 15 are plotted
Region_count = (
    merged_df['Region'].value_counts()
    .rename_axis('Region')
    .reset_index(name='UserCount')
)

top_regions_by_visits = Region_count.head(15)
fig = px.bar(
    top_regions_by_visits,
    x='Region',
    y='UserCount',
    color='UserCount',
    text='UserCount',
    title='Top 15 Region by User Visits',
)
fig.show()


## Explore Attraction Types and Their Popularity Based on Ratings

In [None]:
# Number of ratings and mean rating per attraction type, best-rated first
attraction_popularity = (
    merged_df.groupby('AttractionTypeId')['Rating']
    .agg(['count', 'mean'])
    .reset_index()
    .sort_values(by='mean', ascending=False)
)

# Bar height = mean rating, colour = number of ratings behind it
fig = px.bar(
    attraction_popularity,
    x='AttractionTypeId',
    y='mean',
    color='count',
    text='mean',
    title='Attraction Type Popularity Based on Average Ratings',
)
fig.update_traces(texttemplate='%{text:.2f}')
fig.show()


## Investigate Correlation Between VisitMode and User Demographics

In [None]:
# Row count for every (visit mode, region) combination
mode_region_groups = merged_df.groupby(['VisitMode', 'Region'])
visit_mode_region = mode_region_groups.size().reset_index(name='Count')

# Inner ring = visit mode, outer ring = region within that mode
fig = px.sunburst(
    visit_mode_region,
    path=['VisitMode', 'Region'],
    values='Count',
    title='Visit Mode vs Region Distribution',
)
fig.show()


In [None]:
import plotly.graph_objects as go
# Visit-mode x continent table of row counts (absent combinations become 0)
pivot = merged_df.pivot_table(
    index='VisitMode',
    columns='Continent',
    values='UserId',
    aggfunc='count',
    fill_value=0,
)

heat_trace = go.Heatmap(z=pivot.values, x=pivot.columns, y=pivot.index, colorscale='Viridis')
fig = go.Figure(data=heat_trace)
fig.update_layout(title='Visit Mode vs Continent Heatmap')
fig.show()


## Analyze Distribution of Ratings Across Attractions and Regions

In [None]:
# Rating spread per region, one coloured box per region
region_box_options = dict(
    x='Region',
    y='Rating',
    color='Region',
    title='Distribution of Ratings by Region',
)
fig = px.box(merged_df, **region_box_options)
fig.show()


In [None]:
# Rating spread per attraction type id
type_box_options = dict(
    x='AttractionTypeId',
    y='Rating',
    color='AttractionTypeId',
    title='Distribution of Ratings Across Attraction Types',
)
fig = px.box(merged_df, **type_box_options)
fig.show()


## Correlation Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of the numeric columns
num_cols = ['Rating', 'VisitYear', 'VisitMonth']
corr = merged_df.loc[:, num_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Between Numeric Variables')
plt.show()


## Rating Trends Over Time

In [None]:
import plotly.express as px

# Mean rating for every (year, month) that occurs in the data
rating_trend = merged_df.groupby(['VisitYear', 'VisitMonth'], as_index=False)['Rating'].mean()

# 'year-month' label used on the x axis
year_text = rating_trend['VisitYear'].astype(str)
month_text = rating_trend['VisitMonth'].astype(str)
rating_trend['VisitDate'] = year_text.str.cat(month_text, sep='-')

fig = px.line(
    rating_trend,
    x='VisitDate',
    y='Rating',
    markers=True,
    title='Average Rating Trend Over Time',
)
fig.show()


## Top Rated Attractions

In [None]:
# Ten attractions with the highest mean rating
top_attractions = (
    merged_df.groupby('Attraction')['Rating']
    .mean()
    .reset_index()
    .sort_values(by='Rating', ascending=False)
    .head(10)
)

fig = px.bar(
    top_attractions,
    x='Attraction',
    y='Rating',
    text='Rating',
    color='Rating',
    title='Top 10 Highest Rated Attractions',
)
fig.update_traces(texttemplate='%{text:.2f}')
fig.show()


## Visit Mode Preference by Region


In [None]:
# Row counts per visit mode, with one bar per region placed side by side
histogram_options = dict(
    x='VisitMode',
    color='Region',
    barmode='group',
    title='Visit Mode Preference Across Regions',
)
fig = px.histogram(merged_df, **histogram_options)
fig.show()


## Geographic Analysis — City Popularity Map

In [None]:
# Aggregate first: the map places markers by country name, so plotting every
# transaction row stacks all of a country's markers on one spot and the colour
# shown is just whichever row is drawn last. One marker per country shows the
# mean rating (colour) and how many ratings it is based on (size).
country_ratings = (
    merged_df.groupby('Country')['Rating']
    .agg(Rating='mean', Visits='count')
    .reset_index()
)

fig = px.scatter_geo(country_ratings,
                     locations="Country",
                     locationmode='country names',
                     color="Rating",
                     size="Visits",
                     hover_name="Country",
                     title="Average Attraction Ratings by Location")
fig.show()


## Correlation Between Numeric Variables

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of the numeric columns
# NOTE(review): this cell repeats the earlier "Correlation Heatmap" section.
num_cols = ['Rating', 'VisitYear', 'VisitMonth']
corr = merged_df.loc[:, num_cols].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Between Numeric Variables')
plt.show()


In [None]:
merged_df.to_excel("Tourism_Data.xlsx", index=False)

In [None]:
tourism_df = merged_df.copy()

In [None]:
tourism_df.head(5)

# Objective

## 1. Regression: Predicting Attraction Ratings

In [None]:
# Mean rating of each attraction over the whole dataset
attraction_avg = (
    tourism_df.groupby('Attraction')['Rating']
    .mean()
    .reset_index()
    .rename(columns={'Rating': 'Attraction_AvgRating'})
)

# Drop a previously merged copy first so re-running this cell does not create
# 'Attraction_AvgRating_x' / '_y' columns and break the target lookup later on.
tourism_df = (
    tourism_df.drop(columns='Attraction_AvgRating', errors='ignore')
    .merge(attraction_avg, on='Attraction', how='left')
)

In [None]:
# Predictor columns for the regression model
features = ['Continent', 'Region', 'Country', 'CityName']
features += ['VisitYear', 'VisitMonth', 'AttractionTypeId']
features += ['VisitMode']

# Column the regression model is trained to predict
target = 'Attraction_AvgRating'


In [None]:
tourism_df['VisitMode'].value_counts()

In [None]:
# Data Cleaning
# Rows without a target value cannot be used for training.
tourism_df = tourism_df.dropna(subset=[target])

# Fill gaps only in text columns. A frame-wide fillna('Unknown') would also write
# the string into numeric columns, turning them into object dtype.
text_cols = tourism_df.select_dtypes(include=['object']).columns
tourism_df[text_cols] = tourism_df[text_cols].fillna('Unknown')


In [None]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

label_cols = ['Continent', 'Region', 'Country', 'CityName', 'VisitMode']

# One fitted encoder per column, kept in a dict so integer codes can be mapped
# back to labels later (e.g. label_encoders['VisitMode'].classes_). Refitting a
# single shared encoder would discard every mapping except the last.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    tourism_df[col] = le.fit_transform(tourism_df[col])
    label_encoders[col] = le


In [None]:
# Split Data
from sklearn.model_selection import train_test_split

# Predictors and target for the regression task
X = tourism_df.loc[:, features]
y = tourism_df.loc[:, target]

# 80/20 hold-out split with a fixed seed
regression_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = regression_split


### Train Regression Models

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Fit on the training split (fit returns the estimator) and predict the hold-out split
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

import pickle

# Save the fitted model for the Streamlit app
with open("Linear_Regression.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)


In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed, fitted on the training split
forest_settings = dict(n_estimators=100, random_state=42)
rf = RandomForestRegressor(**forest_settings).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Save the fitted model for the Streamlit app
with open("Random_Forest_Regression.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)


### Evaluate Models

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(y_true, y_pred):
    """Print MAE, MSE, RMSE and the R-squared score of `y_pred` against `y_true`.

    Each metric is printed on its own line, rounded to two decimals. Returns None.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
    r2 = r2_score(y_true, y_pred)

    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R¬≤ Score: {r2:.2f}")

# Print the metric block for each regressor, Linear Regression first
regression_reports = (
    ("üîπ Linear Regression Results", y_pred_lr),
    ("\nüîπ Random Forest Results", y_pred_rf),
)
for heading, predictions in regression_reports:
    print(heading)
    evaluate(y_test, predictions)


### Plot Actual vs Predicted values

In [None]:
import pandas as pd
import plotly.express as px

# True values next to each regressor's predictions
compare_df = pd.DataFrame({'Actual': y_test,
                           'Linear Regression': y_pred_lr,
                           'Random Forest': y_pred_rf})

# Long format: one row per (actual value, model, predicted value)
regressor_names = ['Linear Regression', 'Random Forest']
compare_df = pd.melt(compare_df,
                     id_vars=['Actual'],
                     value_vars=regressor_names,
                     var_name='Model',
                     value_name='Predicted')

# Scatter of predicted against actual, one colour and symbol per model
marker_colors = {'Linear Regression': 'blue', 'Random Forest': 'orange'}
fig = px.scatter(
    compare_df,
    x='Actual',
    y='Predicted',
    color='Model',
    symbol='Model',
    color_discrete_map=marker_colors,
    title='Actual vs Predicted Ratings: Linear Regression vs Random Forest',
)

# Dashed y = x reference line spanning the full range of both axes
axis_values = compare_df[['Actual', 'Predicted']]
min_val = axis_values.min().min()
max_val = axis_values.max().max()
fig.add_shape(
    type="line",
    x0=min_val, y0=min_val,
    x1=max_val, y1=max_val,
    xref='x', yref='y',
    line={'color': 'red', 'dash': 'dash'},
)

fig.update_layout(
    template='plotly_white',
    xaxis_title='Actual Ratings',
    yaxis_title='Predicted Ratings',
    width=800,
    height=600,
)

fig.show()


## 2. Classification: User Visit Mode Prediction

In [None]:
# Predictors and target for the visit-mode classifier
classification_features = [
    'Continent', 'Region', 'Country', 'CityName',
    'VisitYear', 'VisitMonth', 'AttractionTypeId',
]
X = tourism_df[classification_features]
y = tourism_df['VisitMode']


In [None]:
tourism_df['VisitMode'].value_counts()

In [None]:
# Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy: X was selected from tourism_df, and assigning
# into that selection triggers pandas' SettingWithCopyWarning.
X = X.copy()

# Any feature column that is still text gets its own freshly fitted encoder
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Separate encoder for the target so `le.classes_` describes the visit-mode
# labels only
le = LabelEncoder()
y = le.fit_transform(y)  # Encode target labels


In [None]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on y, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# oversampling
from imblearn.over_sampling import SMOTE
from collections import Counter

# Class counts in the training split before resampling
print("Before SMOTE:", Counter(y_train))

# Resample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Class counts after resampling
print("After SMOTE:", Counter(y_train_res))


### Train Classification Models

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Forest trained on the resampled training data (fit returns the estimator)
rf_smote = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)

# Predictions for the untouched test split
y_pred_rf = rf_smote.predict(X_test)

# Save the fitted model for the Streamlit app
with open("Random_Forest_classification.pkl", "wb") as model_file:
    pickle.dump(rf_smote, model_file)


In [None]:
# XGBoost
from xgboost import XGBClassifier

# Train model
# Hyper-parameters of the boosted-tree classifier
xgb_settings = {
    'n_estimators': 300,
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'mlogloss',
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_settings)

# Train on the resampled data, predict the untouched test split
xgb_model.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_model.predict(X_test)

# Save the fitted model for the Streamlit app
with open("XGBoost_classification.pkl", "wb") as model_file:
    pickle.dump(xgb_model, model_file)



In [None]:
# LightGBM
from lightgbm import LGBMClassifier

# Hyper-parameters of the LightGBM classifier
lgbm_settings = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 8,
    'num_leaves': 40,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'class_weight': 'balanced',
    'random_state': 42,
}
lgbm_model = LGBMClassifier(**lgbm_settings)

# Train on the resampled data, predict the untouched test split
lgbm_model.fit(X_train_res, y_train_res)
y_pred_lgbm = lgbm_model.predict(X_test)

# Save the fitted model for the Streamlit app
with open("LightGBM_classification.pkl", "wb") as model_file:
    pickle.dump(lgbm_model, model_file)

### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Test-set accuracy of each classifier (these names are reused by the chart below)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
lgb_accuracy = accuracy_score(y_test, y_pred_lgbm)

# Heading, accuracy and per-class report for every model, in a fixed order
classifier_reports = [
    ("üîπ Random Forest Classifier", rf_accuracy, y_pred_rf),
    ("\nüîπ XGBoost", xgb_accuracy, y_pred_xgb),
    ("\nüîπ LightGBM", lgb_accuracy, y_pred_lgbm),
]
for heading, model_accuracy, predictions in classifier_reports:
    print(heading)
    print("Accuracy:", model_accuracy)
    print(classification_report(y_test, predictions))

In [None]:
# Best and worst model
from sklearn.metrics import accuracy_score

# Store accuracies dynamically
# Accuracy of each classifier, keyed by display name
predictions_by_model = (
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
    ("LightGBM", y_pred_lgbm),
)
results = {name: accuracy_score(y_test, preds) for name, preds in predictions_by_model}

# Highest and lowest accuracy (the first entry wins a tie)
best_model, _ = max(results.items(), key=lambda entry: entry[1])
worst_model, _ = min(results.items(), key=lambda entry: entry[1])

print(f"üèÜ Best Model: {best_model} (Accuracy: {results[best_model]:.4f})")
print(f"‚ö†Ô∏è Worst Model: {worst_model} (Accuracy: {results[worst_model]:.4f})")


#### Plot model evaluations

In [None]:
import plotly.graph_objects as go

# Accuracies computed in the evaluation cell, keyed by display name
model_results = dict([
    ("Random Forest", rf_accuracy),
    ("XGBoost", xgb_accuracy),
    ("LightGBM", lgb_accuracy),
])

# Parallel lists for the x and y axes
models = [name for name in model_results]
accuracy = [model_results[name] for name in models]

# One bar per model, labelled with its accuracy as a percentage
bar_labels = [f"{a*100:.2f}%" for a in accuracy]
bar_style = dict(
    color=['#FF6F61', '#6B5B95', '#88B04B', '#45B8AC'],
    line=dict(color='black', width=1)
)
accuracy_bars = go.Bar(
    x=models,
    y=accuracy,
    text=bar_labels,
    textposition='auto',
    marker=bar_style,
    hovertemplate="<b>%{x}</b><br>Accuracy: %{y:.3f}<extra></extra>"
)

fig = go.Figure()
fig.add_trace(accuracy_bars)

# Fix the y axis to [0, 1] so bar heights are comparable at a glance
fig.update_layout(
    title="Model Accuracy Comparison",
    xaxis_title="Model",
    yaxis_title="Accuracy",
    yaxis=dict(range=[0, 1]),
    template="plotly_white",
    font=dict(size=14),
    hoverlabel=dict(bgcolor="white", font_size=14, font_family="Arial")
)

fig.show()


### Plot Actual vs Predicted values

In [None]:
# True labels next to each classifier's predictions (all are encoded class ids)
df_compare = pd.DataFrame({
    'Actual': y_test,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
    'LightGBM': y_pred_lgbm
})

model_names = ['Random Forest', 'XGBoost', 'LightGBM']

# Plot the actual distribution as well: this section compares actual and
# predicted values, and without the 'Actual' bars there is no baseline.
series_names = ['Actual'] + model_names
fig = go.Figure()

for series_name in series_names:
    # How often each class id occurs in this series, ordered by class id
    class_counts = df_compare[series_name].value_counts().sort_index()
    fig.add_trace(go.Bar(
        x=class_counts.index.astype(str),
        y=class_counts.values,
        name=series_name
    ))

fig.update_layout(
    title="üîπ Predicted Visit Mode Distribution Across Models",
    xaxis_title="Visit Mode",
    yaxis_title="Count",
    barmode='group',
    template="plotly_white",
    width=900,
    height=500
)

fig.show()


## 3. Recommendations: Personalized Attraction Suggestions

### K-Means clustering


In [None]:
# Prepare the Data
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns used for clustering
features = ['Continent', 'Region', 'Country', 'CityName', 'AttractionTypeId', 'Rating']

# Encode only the columns that are still text. The regression section already
# replaced these columns with integer codes; pushing those codes through
# astype(str) + LabelEncoder again orders them as strings ('10' before '2'),
# which reshuffles the codes and gives a different result on every re-run.
encoder = LabelEncoder()
for col in ['Continent', 'Region', 'Country', 'CityName']:
    if pd.api.types.is_numeric_dtype(tourism_df[col]):
        continue
    tourism_df[col] = encoder.fit_transform(tourism_df[col].astype(str))

# Standardise every clustering column to zero mean and unit variance
scaler = StandardScaler()
X = scaler.fit_transform(tourism_df[features])


In [None]:
# Apply K-Means Clustering
from sklearn.cluster import KMeans
import plotly.graph_objects as go

# Fit K-Means for K = 2..9 and record the inertia of each fit
K = range(2, 10)
inertia = []
for n_clusters in K:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    inertia.append(kmeans.inertia_)

# NOTE(review): `kmeans` here is the model from the last loop pass (9 clusters),
# so that is the model written to disk by this cell.
with open("K_Means_Clustering.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)

# Inertia against K as an interactive line chart
elbow_curve = go.Scatter(
    x=list(K),
    y=inertia,
    mode='lines+markers',
    name='Inertia',
    marker=dict(size=8, color='royalblue'),
    line=dict(width=2)
)

fig = go.Figure()
fig.add_trace(elbow_curve)

fig.update_layout(
    title="üìâ Elbow Method for Optimal Number of Clusters (K)",
    xaxis_title="Number of Clusters (K)",
    yaxis_title="Inertia",
    template="plotly_white",
    hovermode="x unified"
)

fig.show()


In [None]:
# Train K-Means and Assign Clusters

import pickle

# Final model: 4 clusters, fitted on the scaled feature matrix
kmeans = KMeans(n_clusters=4, random_state=42)
tourism_df['Cluster'] = kmeans.fit_predict(X)

# Persist THIS model for the Streamlit app. The elbow cell writes the same file
# with the last model of its search loop (9 clusters); overwriting it here
# makes the app load the 4-cluster model actually used for the analysis.
with open("K_Means_Clustering.pkl", "wb") as f:
    pickle.dump(kmeans, f)


In [None]:
# Mean rating inside each cluster
cluster_summary = tourism_df[['Cluster', 'Rating']].groupby('Cluster').mean()
print(cluster_summary)

# Number of rows assigned to each cluster
cluster_sizes = tourism_df['Cluster'].value_counts()
print(cluster_sizes)

### Recommendations

In [None]:
# Mean rating of every attraction inside every cluster
cluster_attraction_means = (
    tourism_df.groupby(['Cluster', 'Attraction'], as_index=False)['Rating'].mean()
)

# Within each cluster, best-rated attractions first
top_attractions = cluster_attraction_means.sort_values(
    ['Cluster', 'Rating'], ascending=[True, False]
)

# First five rows of each cluster = its five best-rated attractions
top_recommendations = top_attractions.groupby('Cluster').head(5)
print(top_recommendations)


In [None]:
import plotly.express as px

# 3-D view of the rows (continent code, region code, rating), coloured by cluster
scatter_options = dict(
    x='Continent',
    y='Region',
    z='Rating',
    color='Cluster',
    hover_data=['AttractionTypeId', 'Country', 'CityName'],
    title='User Clusters Based on Travel Behavior (K-Means)',
)
fig = px.scatter_3d(tourism_df, **scatter_options)
fig.show()


# Streamlit app

In [None]:
# %%writefile app.py

In [None]:
code = """
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

# ===========================
# Load Data and Models
# ===========================
# Read the exported tourism table; the decorator lets Streamlit cache the result.
@st.cache_data
def load_data():
    workbook_path = "Tourism_Data.xlsx"
    return pd.read_excel(workbook_path)

# Load every pickled model and return them in a dict keyed by display name.
@st.cache_resource
def load_models():
    model_paths = {
        "RandomForest_Regression": "Random_Forest_Regression.pkl",
        "Linear_Regression": "Linear_Regression.pkl",
        "RandomForest_Classification": "Random_Forest_classification.pkl",
        "XGBoost_Classification": "XGBoost_classification.pkl",
        "LightGBM_Classification": "LightGBM_classification.pkl",
        "K_Means_Clustering": "K_Means_Clustering.pkl"
    }

    # NOTE: unpickling runs code stored in the file; only load files you created.
    def _unpickle(path):
        with open(path, "rb") as handle:
            return pickle.load(handle)

    return {name: _unpickle(path) for name, path in model_paths.items()}


# ===========================
# Streamlit Layout
# ===========================
# Page configuration comes first: Streamlit documents set_page_config as the
# first Streamlit command of a page, so it must run before the cached loaders
# below (which may render a spinner while they work).
st.set_page_config(page_title="üèùÔ∏è Tourism Analytics Dashboard", layout="wide")
st.title("üèùÔ∏è Tourism Experience Analytics Dashboard")

# ===========================
# Initialize
# ===========================
data = load_data()
models_dict = load_models()

# ===========================
# Radio Button for Model Selection
# ===========================
st.sidebar.header("‚öôÔ∏è Model Selection")
# The chosen entry decides which of the three dashboards is rendered below
dashboard_options = ["Regression", "Classification", "Recommendation"]
model_type = st.sidebar.radio("Select Dashboard Type", dashboard_options)

# ===========================
# Sidebar - Common Inputs
# ===========================
st.sidebar.header("User Input")

# Sidebar dropdown listing the sorted distinct values of one data column
def _sidebar_choice(label, column):
    return st.sidebar.selectbox(label, sorted(data[column].unique()))

continent = _sidebar_choice("üåç Continent", 'Continent')
region = _sidebar_choice("üìç Region", 'Region')
country = _sidebar_choice("üè≥Ô∏è Country", 'Country')
city = _sidebar_choice("üèôÔ∏è City", 'CityName')
visit_year = st.sidebar.slider("üóìÔ∏è Visit Year", 2015, 2025, 2023)
visit_month = st.sidebar.slider("üìÜ Visit Month", 1, 12, 6)

# Visit mode is what the classifier predicts, so it is not asked for in that view
visit_mode = _sidebar_choice("üõí Visit Mode", 'VisitMode') if model_type != "Classification" else None

attraction = _sidebar_choice("üé° Attraction", 'Attraction')
# Type id of the chosen attraction, taken from its first matching row
attraction_rows = data["Attraction"] == attraction
selected_type_id = data.loc[attraction_rows, "AttractionTypeId"].values[0]

# Prepare encoded input
# One-row frame describing the sidebar selections
selection = {
    'Continent': [continent],
    'Region': [region],
    'Country': [country],
    'CityName': [city],
    'VisitYear': [visit_year],
    'VisitMonth': [visit_month],
    'AttractionTypeId': [selected_type_id]
}
user_input = pd.DataFrame(selection)

# No rating is entered by the user, so the dataset-wide mean is used
# (the frame built above never contains a 'Rating' column)
user_input['Rating'] = data['Rating'].mean()

# Add Attraction name for recommendation system
user_input["Attraction"] = attraction

# Visit mode is only present when the sidebar asked for it
if visit_mode is not None:
    user_input['VisitMode'] = [visit_mode]

# ===========================
# Encode categorical columns safely
# ===========================
from sklearn.preprocessing import LabelEncoder

# Encode exactly the columns the training notebook label-encoded.
# AttractionTypeId is NOT in this list: the models were trained on the raw
# numeric id, and looking the numeric id up among string classes never matched,
# so the model always received -1 for it.
cat_cols = ['Continent', 'Region', 'Country', 'CityName']
if visit_mode is not None:
    cat_cols.append('VisitMode')

# Fit one encoder per column on the full dataset and keep it
encoded_data = data.copy()

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Missing values become 'Unknown', mirroring the notebook's cleaning step,
    # so the integer codes line up with the ones used in training
    encoded_data[col] = le.fit_transform(encoded_data[col].fillna('Unknown').astype(str))
    encoders[col] = le

# Code of a single label, or -1 when the label was not seen while fitting
def _encode_label(encoder, value):
    value = str(value)
    if value in encoder.classes_:
        return encoder.transform([value])[0]
    return -1

# Replace the user's labels with their integer codes
for col in cat_cols:
    user_input[col] = [_encode_label(encoders[col], value) for value in user_input[col]]

# Predict with `model`, first narrowing `user_input` to the columns the model
# reports in `feature_names_in_` (when it has that attribute).
def safe_predict(model, user_input):
    model_features = getattr(model, "feature_names_in_", None)  # sklearn>=1.0
    if model_features is not None:
        # Select the training columns, in training order
        user_input = user_input[model_features]
    # Models without the attribute receive the frame unchanged
    return model.predict(user_input)

# ===========================
# REGRESSION
# ===========================
if model_type == "Regression":
    st.subheader("üìà Regression Models")

    # Which of the two saved regressors to use
    regression_options = ["RandomForest_Regression", "Linear_Regression"]
    model_choice = st.radio("Select Regression Model", regression_options, horizontal=True)

    model = models_dict[model_choice]

    # Predict only when the button is pressed; the value is shown rounded to an integer
    if st.button("üîÆ Predict Regression Output"):
        pred = safe_predict(model, user_input)[0]
        st.success(f"### üéØ Predicted Value ({model_choice}): {int(round(pred))}‚≠ê")

    # Visualizations
    st.subheader("üåÜ Tourism Trends Visualizations")
    region_means = data.groupby("Region")["Rating"].mean()
    top_regions = region_means.sort_values(ascending=False).head(10).reset_index()
    fig2 = px.bar(top_regions, x="Region", y="Rating", title="üèÖ Top Regions by Average Rating")
    st.plotly_chart(fig2, use_container_width=True)

# ===========================
# CLASSIFICATION
# ===========================
elif model_type == "Classification":
    st.subheader("ü§ñ Classification Models")

    model_choice = st.radio(
        "Select Classification Model",
        ["XGBoost_Classification", "LightGBM_Classification", "RandomForest_Classification"],
        horizontal=True
    )

    model = models_dict[model_choice]

    if st.button("üß≠ Predict Visit Mode"):
        prediction = safe_predict(model, user_input)[0]
        # The classifiers output LabelEncoder codes, and LabelEncoder numbers
        # the labels in sorted order. Rebuild that order from the data rather
        # than hard-coding a code-to-name table that can disagree with it.
        mode_labels = LabelEncoder().fit(data['VisitMode'].fillna('Unknown').astype(str)).classes_
        mode_code = int(prediction)
        if 0 <= mode_code < len(mode_labels):
            predicted_mode = mode_labels[mode_code]
        else:
            predicted_mode = 'Unknown'
        st.success(f"### üß≥ Predicted Visit Mode ({model_choice}): {predicted_mode}")

    # Visualizations
    st.subheader("üåç Region Insights")
    # Distinct users per region, ten largest regions, biggest first
    region_visit = data.groupby("Region")["UserId"].nunique().reset_index(name="VisitorCount")
    region_visit = region_visit.nlargest(10, "VisitorCount")
    region_visit = region_visit.sort_values(by="VisitorCount", ascending=False)
    fig2 = px.bar(region_visit, x="Region", y="VisitorCount", title="üë• Top Regions by Visitor Count")
    st.plotly_chart(fig2, use_container_width=True)

# ===========================
# RECOMMENDATION SYSTEM
# ===========================
else:
    st.subheader("üéØ Personalized Attraction Recommendation System")

    kmeans_model = models_dict["K_Means_Clustering"]
    scaler = StandardScaler()

    X_features = ['Continent', 'Region', 'Country', 'CityName', 'AttractionTypeId', 'Rating']

    # Build the user's row from the RAW sidebar selections. `user_input` has
    # already been label-encoded earlier in the script; encoding it a second
    # time matched none of the text classes, so every selection fell back to
    # the first class and the user's choices were ignored.
    user_row = pd.DataFrame({
        'Continent': [continent],
        'Region': [region],
        'Country': [country],
        'CityName': [city],
        'AttractionTypeId': [selected_type_id],
        'Rating': [data['Rating'].mean()],
    })

    # Make a copy for encoding
    encoded_data = data.copy()
    encoder_dict = {}

    # Encode the text columns; numeric columns are used as they are
    for col in X_features:
        if pd.api.types.is_numeric_dtype(encoded_data[col]):
            continue
        le = LabelEncoder()
        # Missing values become 'Unknown', mirroring the notebook's cleaning step
        encoded_data[col] = le.fit_transform(encoded_data[col].fillna('Unknown').astype(str))
        encoder_dict[col] = le

        # The dropdown options come from this same column, so the value is
        # normally known; fall back to the first class if it is not
        value = str(user_row.at[0, col])
        if value not in le.classes_:
            value = le.classes_[0]
        user_row[col] = le.transform([value])

    # Scale training and user data
    X_scaled = scaler.fit_transform(encoded_data[X_features])
    user_scaled = scaler.transform(user_row[X_features])

    # Predict user cluster
    user_cluster = kmeans_model.predict(user_scaled)[0]

    # Assign cluster labels to the main dataset
    encoded_data['PredictedCluster'] = kmeans_model.predict(X_scaled)
    data['PredictedCluster'] = encoded_data['PredictedCluster']

    # Best-rated attractions among the rows in the user's cluster
    recommendations = (
        data[data['PredictedCluster'] == user_cluster]
        .groupby('Attraction')['Rating']
        .mean()
        .reset_index()
        .sort_values('Rating', ascending=False)
        .head(7)
    )

    recommendations = recommendations.reset_index(drop=True)
    recommendations.columns = ["üèñÔ∏è Attraction", "‚≠ê Average Rating"]

    st.write("üèùÔ∏è **Top Recommended Attractions for You:**")
    st.dataframe(recommendations)

    # ===========================
    # Tabs for Visualizations
    # ===========================
    tab_titles = ["‚≠ê Top Attractions", "üèÖ Top Regions", "üß≠ User Segments"]
    tab1, tab2, tab3 = st.tabs(tab_titles)

    with tab1:
        # Ten attractions with the highest mean rating
        attraction_means = data.groupby('Attraction')['Rating'].mean()
        top_attractions = attraction_means.nlargest(10).reset_index()
        fig1 = px.bar(
            top_attractions,
            x='Attraction',
            y='Rating',
            text_auto='.2f',
            title="‚≠ê Top Rated Attractions"
        )
        st.plotly_chart(fig1, use_container_width=True)


    with tab2:
        # Ten regions with the highest mean rating
        region_means = data.groupby('Region')['Rating'].mean()
        region_rating = region_means.sort_values(ascending=False).head(10).reset_index()
        fig2 = px.bar(
            region_rating,
            x='Region',
            y='Rating',
            title="üèÖ Top Regions by Average Rating"
        )
        st.plotly_chart(fig2, use_container_width=True)


    with tab3:
        # Plot the un-encoded `data` frame so axis labels stay readable names
        segment_options = dict(
            x='Continent', y='Region', z='Rating',
            color='PredictedCluster',
            title='User Segments (K-Means Clusters)',
            hover_data=['Attraction', 'Country', 'CityName']
        )
        fig3 = px.scatter_3d(data, **segment_options)
        st.plotly_chart(fig3, use_container_width=True)


st.success("‚úÖ Dashboard loaded successfully!")


"""
# The app source contains non-ASCII characters (emoji labels), so write it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(code)

In [None]:
!pip install streamlit pyngrok plotly

# Run

In [None]:
# An ngrok authtoken is needed to expose the Streamlit app; get the token from the ngrok site.
# NOTE(review): no authtoken is configured in this cell - presumably it is set elsewhere; confirm.

from pyngrok import ngrok

# Open a tunnel to port 8501 (the port Streamlit is started on below)
public_url = ngrok.connect(8501)
print(public_url)

# Run your Streamlit app (this shell command keeps the cell running while the app is up)
!streamlit run app.py --server.port 8501 --server.address 0.0.0.0