In [None]:
#Installing packages

!pip install scikit-surprise
# Uninstall current NumPy completely
!pip uninstall -y numpy
# Reinstall specific version
!pip install numpy==1.23.5 --force-reinstall
!pip install streamlit
!pip install pyngrok

# **Required Packages**

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import  DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sqlite3





# **Data collection**

In [6]:
# Reading data from drive

# Folder that holds every workbook; kept in one place so that moving the
# dataset means editing a single line instead of nine.
DATA_DIR = '/content/drive/MyDrive/mini_project_4_db'

city_df = pd.read_excel(f'{DATA_DIR}/City.xlsx')

continent_df = pd.read_excel(f'{DATA_DIR}/Continent.xlsx')

country_df = pd.read_excel(f'{DATA_DIR}/Country.xlsx')

item_df = pd.read_excel(f'{DATA_DIR}/Updated_Item.xlsx')

mode_df = pd.read_excel(f'{DATA_DIR}/Mode.xlsx')

region_df = pd.read_excel(f'{DATA_DIR}/Region.xlsx')

transaction_df = pd.read_excel(f'{DATA_DIR}/Transaction.xlsx')

type_df = pd.read_excel(f'{DATA_DIR}/Type.xlsx')

user_df = pd.read_excel(f'{DATA_DIR}/User.xlsx')


In [7]:
# Merging Dataframes

# Every step inner-joins one more lookup table onto the transaction rows.
df_merged1 = transaction_df.merge(user_df, how='inner', on='UserId')
df_merged2 = df_merged1.merge(region_df[['Region', 'RegionId']], how='inner', on='RegionId')
df_merged3 = df_merged2.merge(item_df, how='inner', on='AttractionId')
df_merged4 = df_merged3.merge(country_df[['CountryId', 'Country']], how='inner', on='CountryId')
df_merged5 = df_merged4.merge(continent_df, how='inner', on='ContinentId')
df_merged6 = df_merged5.merge(city_df[['CityId', 'CityName']], how='inner', on='CityId')
# The 'VisitMode' column is renamed to 'VisitModeId' so it can serve as the
# join key against mode_df in the last step.
df_merged7 = (
    df_merged6
    .merge(type_df, how='inner', on='AttractionTypeId')
    .rename(columns={'VisitMode': 'VisitModeId'})
)
main_data = df_merged7.merge(mode_df, how='inner', on='VisitModeId')

In [None]:
# Count the missing values in each column of the merged frame

main_data.isna().sum()

In [None]:
# Summary statistics of the merged frame (shown as the cell result)
main_data.describe()

In [None]:
# Column names, dtypes and non-null counts of the merged frame
main_data.info()

# **Transferring Data to SQL**

In [None]:
# Persist the merged frame to a local SQLite file (the .db file is created on first use).
# sqlite3 is already imported with the other packages at the top of the notebook.
conn = sqlite3.connect('my_database.db')
try:
    # 'replace' keeps this cell idempotent: with 'append', every re-run of the
    # notebook added another full copy of the rows to tourist_data.
    main_data.to_sql('tourist_data', conn, if_exists='replace', index=False)
finally:
    # Release the database file even when the export fails.
    conn.close()

# **Pre-processing**

In [8]:
# Remove the column that is not used for modelling

main_data_ml = main_data.drop(columns=['AttractionAddress'])

In [9]:
# Label-encode every object-dtype column, keeping one fitted encoder per column

categorical_cols = list(main_data_ml.select_dtypes(include='object').columns)
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    # The text column is overwritten by its integer codes.
    main_data_ml[col] = encoder.fit_transform(main_data_ml[col])

In [None]:
# Pickle the dict of fitted encoders to label_encoder.pkl

with open("label_encoder.pkl", 'wb') as encoder_file:
    encoder_file.write(pickle.dumps(label_encoders))

# **EDA**

In [None]:
#Distribution across continents, countries, and regions.

In [None]:
distribution_continent= main_data['Continent'].value_counts().reset_index(name='Visit_count')
fig = px.bar(distribution_continent,'Continent','Visit_count',title="Distribution across Continent",color='Continent')
fig.show()

In [None]:
distribution_country = main_data['Country'].value_counts().reset_index(name='Visit_count')
fig = px.bar(distribution_country,'Country','Visit_count',title="Distribution across country",color='Country')
fig.show()


In [None]:
distribution_region= main_data['Region'].value_counts().reset_index(name='Visit_count')
fig = px.bar(distribution_region,'Region','Visit_count',title="Distribution across Region",color='Region')
fig.show()

In [None]:
#attraction types and their popularity based on user ratings

popular_att = main_data[['AttractionType','Rating']]

In [None]:
pop_att = popular_att.groupby("AttractionType")['Rating'].value_counts().reset_index(name='No_Users')

In [None]:
rating = px.bar(pop_att,'AttractionType','No_Users',hover_data='Rating',color='Rating')
rating.show()

In [None]:
#Correlation

In [None]:
correlation = main_data_ml[['UserId','ContinentId','Continent', 'VisitYear', 'VisitMonth','RegionId','Region', 'AttractionCityId', 'AttractionTypeId',
                         'Attraction','AttractionType','CountryId','Country','CityId', 'CityName','VisitModeId','VisitMode','Rating']]


In [None]:
# Plot heatmap for correlation
plt.figure(figsize=(20,20))
sns.heatmap(correlation.corr(), annot=True,vmax=1,vmin=-1)
plt.title("Correlation Heatmap")
plt.show()

In [None]:
#Distribution of ratings across different attractions and regions

In [None]:
att_reg_rat = main_data[['Attraction','Region','Rating']]


In [None]:
rating_filter = att_reg_rat.groupby(['Attraction','Region'])['Rating'].value_counts().reset_index(name='ratted_users').sort_values(by="ratted_users", ascending=False)

In [None]:
fig = px.bar(rating_filter,'Region','Rating',hover_data=['Attraction','ratted_users'],color='Rating')
fig.show()

# **Machine Learning**

# *Regression*

In [None]:
reg_mod_df = main_data_ml[[ 'VisitMonth', 'CityName','Attraction','VisitMode','Rating']]

In [None]:
# Features and target

x =reg_mod_df.drop(columns='Rating',axis=1)
y =reg_mod_df['Rating']

In [None]:
# Splitting data into train and test

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
#LGBMRegressor
lgm = LGBMRegressor()
lgm.fit(x_train,y_train)

In [None]:
#LinearRegression

lr=LinearRegression()
lr.fit(x_train,y_train)

In [None]:
#Ridge,Lasso

ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=1.0)
ridge_model.fit(x_train, y_train)
lasso_model.fit(x_train, y_train)

In [None]:
#KNeighborsRegressor

KNN=KNeighborsRegressor(n_neighbors=20)
KNN.fit(x_train,y_train)

In [None]:
#DecisionTreeRegressor model

dtr=DecisionTreeRegressor()
dtr.fit(x_train,y_train)

In [None]:
#RandomForestRegressor
rfc=RandomForestRegressor(n_estimators=1000, max_depth=100, min_samples_split=100, min_samples_leaf=20,  max_features='sqrt', bootstrap=True)
rfc.fit(x_train,y_train)

In [None]:
#Hyperparameter tuning

# Candidate values explored for each regressor.
linear_param={'fit_intercept':[True,False]}

decision_param={"splitter":['best', 'random'],"max_depth":[None,5,10,15,20],"min_samples_split":[2,3,4,5,6,7,8,10]}

knn_param={'n_neighbors':[5,7,9,12,15,18,20],"weights":['uniform','distance']}

#Creating multiple models

grid_search_lr=GridSearchCV(lr,linear_param,cv=5)
grid_search_dtr=GridSearchCV(dtr,decision_param,cv=5)
grid_search_KNN=GridSearchCV(KNN,knn_param,cv=5)

#Training multiple models

grid_search_lr.fit(x_train,y_train)
grid_search_dtr.fit(x_train,y_train)
grid_search_KNN.fit(x_train,y_train)

# Surface what each search selected; previously the searches were fitted but
# their outcome was never shown anywhere in the notebook.
for search_name, search in [
    ("LinearRegression", grid_search_lr),
    ("DecisionTreeRegressor", grid_search_dtr),
    ("KNeighborsRegressor", grid_search_KNN),
]:
    print(f"{search_name}: best params = {search.best_params_}, best CV score = {search.best_score_:.4f}")

In [None]:
# Evaluation metrics

def _r2_and_mae(fitted_model):
    """Predict the test split once and return ``(r2, mae)`` for those predictions.

    The previous version ran every model over ``x_test`` twice (``predict``
    and ``score``) and never used the ``score`` results.
    """
    predictions = fitted_model.predict(x_test)
    return r2_score(y_test, predictions), mean_absolute_error(y_test, predictions)

r2_lr, linear_mae = _r2_and_mae(lr)
r2_rm, rm_mae = _r2_and_mae(ridge_model)
r2_lm, lm_mae = _r2_and_mae(lasso_model)
r2_knn, knn_mae = _r2_and_mae(KNN)
r2_dtr, dtr_mae = _r2_and_mae(dtr)
r2_rfc, rfc_mae = _r2_and_mae(rfc)
r2_lgm, lgm_mae = _r2_and_mae(lgm)

print(f"Linear regression \n r2 score : {r2_lr}\n mae : {linear_mae} \nRidge model \n r2 score : {r2_rm}\n mae : {rm_mae}  \nlasso_model \n r2 score : {r2_lm} \n mae : {lm_mae} \nKNN model \n r2 score : {r2_knn}\n mae : {knn_mae} \nDecision tree  \n r2 score : {r2_dtr}\n mae : {dtr_mae} \nRandom forest \n r2 score : {r2_rfc} \n mae : {rfc_mae} \nLGM \n r2 score : {r2_lgm}\n mae : {lgm_mae}")

In [None]:
#classification
# Treats the rating as a class label and tunes four classifiers on the same
# train split used for the regressors.

# Two sub-grids: the default solver does not accept penalty='l1', so the l1
# candidates are paired with the 'saga' solver (and a higher iteration cap).
# The l2 candidates keep the estimator's default settings.
log_param_reg=[
            {'C': [0.1,1, 10], 'penalty': ['l2']},
            {'C': [0.1,1, 10], 'penalty': ['l1'], 'solver': ['saga'], 'max_iter': [1000]},
        ]

dtc_param_reg={
            'max_depth': [None,10, 20],
            'min_samples_split': [2, 5]
        }

knnc_param_reg={
            'n_neighbors': [5,10,20],
            'weights': ['uniform', 'distance']}
rfc_param_reg ={ 'n_estimators': [50,100,200],
            'max_depth': [None,10,20],
            "min_samples_split":[5,6,7,8,10]}
#Creating multiple models

re_log=LogisticRegression()
# Tree-based models are seeded so the search picks the same winner on every run.
reg_dtc=DecisionTreeClassifier(random_state=0)
reg_knnc=KNeighborsClassifier()
reg_rfc=RandomForestClassifier(random_state=0)

grid_search_log_reg=GridSearchCV(re_log,log_param_reg,cv=5)
grid_search_dtc_reg=GridSearchCV(reg_dtc,dtc_param_reg,cv=5)
grid_search_KNNC_reg=GridSearchCV(reg_knnc,knnc_param_reg,cv=5)
grid_search_rfc_reg=GridSearchCV(reg_rfc,rfc_param_reg,cv=5)


#Training multiple models

grid_search_log_reg.fit(x_train,y_train)
grid_search_dtc_reg.fit(x_train,y_train)
grid_search_KNNC_reg.fit(x_train,y_train)
grid_search_rfc_reg.fit(x_train,y_train)

In [None]:
# Classification reports of the tuned rating classifiers on the test split

def _predict_and_report(search):
    """Return the test-split predictions of ``search`` and their classification report."""
    predicted = search.predict(x_test)
    return predicted, classification_report(y_test, predicted)

y_pred_log_reg, log_reg_rep = _predict_and_report(grid_search_log_reg)
y_pred_knnc_reg, knnc_reg_rep = _predict_and_report(grid_search_KNNC_reg)
y_pred_dtc_reg, dtc_reg_rep = _predict_and_report(grid_search_dtc_reg)
y_pred_rfc_reg, rfclass_reg_rep = _predict_and_report(grid_search_rfc_reg)

print(f"Logistic regression \n{log_reg_rep} \nKNN Classifier \n {knnc_reg_rep} \nDecision tree Classifier \n {dtc_reg_rep} \nRandom forest Classifier \n {rfclass_reg_rep} ")

In [None]:
#Finalized model for classification

Final_RFC_ratings=RandomForestClassifier(max_depth=10, min_samples_split=6, n_estimators=50)
Final_RFC_ratings.fit(x_train,y_train)

In [None]:
#Final report

y_pred_FRR=Final_RFC_ratings.predict(x_test)
print(classification_report(y_test,y_pred_FRR))

In [None]:
# Save the model

with open("RFC_Ratings.pkl",'wb') as RFC_file:
    pickle.dump(Final_RFC_ratings,RFC_file)

# *Classification*

In [None]:
class_data = main_data_ml[['VisitYear','VisitMonth','Rating','Attraction','AttractionType','CityName','Country','VisitMode']]


In [None]:
# Features and target

x_class = class_data.drop('VisitMode',axis=1)
y_class = class_data['VisitMode']

In [None]:
# Splitting data into train and test

x_train_class,x_test_class,y_train_class,y_test_class = train_test_split(x_class,y_class,test_size=0.2,random_state=42)

In [None]:
# Hyperparameter Tuning

# Two sub-grids: the default solver does not accept penalty='l1', so the l1
# candidates are paired with the 'saga' solver (and a higher iteration cap).
# The l2 candidates keep the estimator's default settings.
log_param=[
            {'C': [0.1, 1, 10], 'penalty': ['l2']},
            {'C': [0.1, 1, 10], 'penalty': ['l1'], 'solver': ['saga'], 'max_iter': [1000]},
        ]

dtc_param={
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }

knnc_param={
            'n_neighbors': [3, 5, 7,10,12,15,20],
            'weights': ['uniform', 'distance']}
rfc_param ={ 'n_estimators': [50, 100],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]}
#Creating multiple models

g_log=LogisticRegression()
# Tree-based models are seeded (same value as the split) so the search is repeatable.
g_dtc=DecisionTreeClassifier(random_state=42)
g_knnc=KNeighborsClassifier()
g_rfc=RandomForestClassifier(random_state=42)

grid_search_log=GridSearchCV(g_log,log_param,cv=5)
grid_search_dtc=GridSearchCV(g_dtc,dtc_param,cv=5)
grid_search_KNNC=GridSearchCV(g_knnc,knnc_param,cv=5)
grid_search_rfc=GridSearchCV(g_rfc,rfc_param,cv=5)

#Training multiple models

grid_search_log.fit(x_train_class,y_train_class)
grid_search_dtc.fit(x_train_class,y_train_class)
grid_search_KNNC.fit(x_train_class,y_train_class)
grid_search_rfc.fit(x_train_class,y_train_class)

In [None]:
#KNeighborsClassifier
knn_class=KNeighborsClassifier(n_neighbors=20, weights='distance')
knn_class.fit(x_train_class,y_train_class)

In [None]:
#DecisionTreeClassifier

dtc=DecisionTreeClassifier(max_depth=10)
dtc.fit(x_train_class,y_train_class)

In [None]:
# RandomForestClassifier

gs_rfc=RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50)
gs_rfc.fit(x_train_class,y_train_class)

In [None]:
# Evaluation metrics

# `log_reg` was never defined in this notebook, so this cell failed with a
# NameError on a fresh kernel. The fitted logistic-regression model for this
# task is the grid search `grid_search_log`; predict() on it uses the best
# estimator it found.
y_pred_log=grid_search_log.predict(x_test_class)
log_rep=classification_report(y_test_class,y_pred_log)


y_pred_knnc=knn_class.predict(x_test_class)
knnc_rep=classification_report(y_test_class,y_pred_knnc)


y_pred_dtc=dtc.predict(x_test_class)
dtc_rep=classification_report(y_test_class,y_pred_dtc)


y_pred_rfclass=gs_rfc.predict(x_test_class)
rfclass_rep=classification_report(y_test_class,y_pred_rfclass)

print(f"Logistic regression \n{log_rep} \nKNN Classifier \n {knnc_rep} \nDecision tree Classifier \n {dtc_rep} \nRandom forest Classifier \n {rfclass_rep} ")

In [None]:
#Finalized model

from sklearn.neighbors import KNeighborsClassifier
knn_class=KNeighborsClassifier(n_neighbors=20, weights='distance')
knn_class.fit(x_train_class,y_train_class)

In [None]:
# Save the model

with open("KNNClassifier_VM.pkl",'wb') as KNN_file:
    pickle.dump(knn_class,KNN_file)

# *Recommendation*

In [None]:
#Collaborative Filtering

In [None]:
#user related recommentation

coll_rec = main_data[['UserId','CityId','Rating','CityName','Country']]

In [None]:
# Load into Surprise format
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(coll_rec[['UserId','CityId','Rating']], reader)
train_data=data_surprise.build_full_trainset()

model = SVD()
model.fit(train_data)

model_results = cross_validate(model,data_surprise,measures=['RMSE','MAE'],cv=5,verbose=True)


In [None]:
with open("SVD.pkl",'wb') as SVD_file:
    pickle.dump(model,SVD_file)

In [None]:
def recommend_svd(user_id, n=10):
    """Recommend up to ``n`` places the user has no rating for yet.

    Reads the notebook-level ``coll_rec`` frame and the fitted ``model``.

    Parameters
    ----------
    user_id : value matching ``coll_rec['UserId']``.
    n : maximum number of CityIds to recommend (default 10).

    Returns
    -------
    DataFrame with the columns ``CityName`` and ``Country``, de-duplicated and
    ordered from the highest to the lowest predicted rating.
    """
    # A set gives O(1) membership tests while filtering the candidate ids.
    rated = set(coll_rec.loc[coll_rec['UserId'] == user_id, 'CityId'])
    unrated = [city for city in coll_rec['CityId'].unique() if city not in rated]

    preds = sorted(
        (model.predict(user_id, cid) for cid in unrated),
        key=lambda pred: pred.est,
        reverse=True,
    )
    top_cities = [pred.iid for pred in preds[:n]]

    # isin() returns rows in frame order, which would discard the ranking;
    # remember each id's position and sort by it before returning.
    rank_of = {cid: position for position, cid in enumerate(top_cities)}
    places = coll_rec.loc[
        coll_rec['CityId'].isin(top_cities), ['CityId', 'CityName', 'Country']
    ].drop_duplicates()
    places = places.assign(_rank=places['CityId'].map(rank_of)).sort_values('_rank', kind='stable')
    return places[['CityName', 'Country']].drop_duplicates()


In [None]:
#Content-Based Filtering

In [None]:
content_rec=main_data[['CityName','VisitMode','AttractionType','Attraction','Country']].value_counts().reset_index()

In [None]:
content_rec['rec_data'] =  content_rec['AttractionType'] + ' ' + content_rec['Attraction'] + ' ' + content_rec['Country']

In [None]:
#Vectorizer

tfid = TfidfVectorizer()

x_rec = tfid.fit_transform(content_rec['rec_data'] )

In [None]:
#cosine_similarity

co_sim = cosine_similarity(x_rec,x_rec)

In [None]:
def recommend_place(AttractionType):
    """Return the places most similar to the first row of the given attraction type.

    Reads the notebook-level ``content_rec`` frame and the ``co_sim``
    similarity matrix built from it.

    Parameters
    ----------
    AttractionType : value to look up in ``content_rec['AttractionType']``.

    Returns
    -------
    DataFrame with the columns ``CityName``, ``VisitMode`` and ``Country`` for
    the (at most) 50 rows with the highest similarity to the first matching
    row, best match first. An empty frame with the same columns is returned
    when no row has the requested type.
    """
    output_columns = ['CityName', 'VisitMode', 'Country']

    # Positional row numbers are used throughout, because co_sim and .iloc are
    # both positional; this stays correct whatever index content_rec carries.
    matches = (content_rec['AttractionType'] == AttractionType).to_numpy().nonzero()[0]
    if len(matches) == 0:
        # Unknown type: return an empty result rather than raising IndexError.
        return content_rec.iloc[0:0][output_columns]

    seed_position = matches[0]
    ranked = sorted(
        enumerate(co_sim[seed_position]),
        key=lambda pair: pair[1],
        reverse=True,
    )[:50]
    return content_rec.iloc[[position for position, _ in ranked]][output_columns]

# **Streamlit**

In [None]:
!pip install streamlit
!pip install pyngrok

In [25]:
%%writefile app.py
import streamlit as st
import pandas as pd
import pickle
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

@st.cache_data
def load_data(csv_path):
    """Read the tourism CSV.

    Streamlit re-executes this script on every widget interaction; the cache
    decorator makes those reruns reuse the parsed frame instead of reading the
    file again.
    """
    return pd.read_csv(csv_path)


col1, col2 = st.columns([0.7,0.7])

with col1:
    st.image("/content/drive/MyDrive/mini_project_4_db/m4_pickle_files/Tourism template.jpg",width=600)
with col2:
    st.image("/content/drive/MyDrive/mini_project_4_db/m4_pickle_files/images_temp.jpg",width=600)

st.title('🤖Smart Travel Recommender✨')
# NOTE(review): no cell visible in this notebook writes data.csv — confirm it
# is exported to this Drive folder before the app is started.
data=load_data('/content/drive/MyDrive/mini_project_4_db/m4_pickle_files/data.csv')
fil_data = data[['Continent','Region','Country','Attraction','VisitMonth','VisitMode','CityName','Rating']]
# Mean rating per combination of the grouping columns; 'Region' takes part in
# the grouping but is not kept in the directory table.
main_menu = (
    fil_data
    .groupby(['Continent','Region','Country','Attraction','VisitMonth','VisitMode','CityName'])['Rating']
    .mean()
    .reset_index()
    .drop(columns='Region')
)

st.write("Discover your next dream destination! Our smart system recommends the best tourist spots based on your travel style, location, and ratings.")
st.subheader("How It Works")
st.markdown("""
1. 🎯 Select your preferences (Country, City, Mode of Visit).
2. 🌍 View top-rated tourist places.
3. 🧳 Save your favorite destinations and start planning!
""")
st.subheader("Key Features")
st.markdown("""
- 🚀 Real-time destination suggestions.
- 🏖️ Personalized based on your choices.
- 📊 Data-driven recommendations.
- 🌟 Explore attractions by ratings, region, and season.
""")

st.sidebar.header("Recommender🌍")
# A non-empty label (hidden visually) avoids Streamlit's empty-label warning;
# the sidebar header above already serves as the visible caption.
options=st.sidebar.selectbox(
    'Recommender type',
    ['Select_Type','Collaborative Filtering','Content-Based'],
    label_visibility='collapsed',
)

if options=='Collaborative Filtering':
    #user related recommendation
    coll_rec = data[['UserId','CityId','Rating','CityName','Country']]

    UserId = st.selectbox('UserId',data['UserId'].unique())

    @st.cache_resource
    def load_svd_model(model_path):
        """Unpickle the trained SVD model once and reuse it across script reruns."""
        # pickle.load can execute arbitrary code from the file: only point this
        # at a model file you produced yourself.
        with open(model_path,'rb') as model_file:
            return pickle.load(model_file)

    # NOTE(review): the notebook pickles the model as "SVD.pkl" in the working
    # directory — confirm the file is copied to this Drive folder.
    svd_model = load_svd_model('/content/drive/MyDrive/mini_project_4_db/m4_pickle_files/SVD.pkl')

    def recommend_svd(user_id, n=10):
        """Return CityName/Country for up to ``n`` CityIds the user has not rated,
        ordered from the highest to the lowest predicted rating."""
        # A set gives O(1) membership tests while filtering the candidate ids.
        rated = set(coll_rec.loc[coll_rec['UserId'] == user_id, 'CityId'])
        unrated = [city for city in coll_rec['CityId'].unique() if city not in rated]

        preds = sorted(
            (svd_model.predict(user_id, cid) for cid in unrated),
            key=lambda pred: pred.est,
            reverse=True,
        )
        top_cities = [pred.iid for pred in preds[:n]]

        # isin() returns rows in frame order; sort by rank to keep the best first.
        rank_of = {cid: position for position, cid in enumerate(top_cities)}
        places = coll_rec.loc[
            coll_rec['CityId'].isin(top_cities), ['CityId','CityName','Country']
        ].drop_duplicates()
        places = places.assign(_rank=places['CityId'].map(rank_of)).sort_values('_rank', kind='stable')
        return places[['CityName', 'Country']].drop_duplicates()

    # The trained model is loaded from disk, so no Surprise dataset/trainset is
    # built here (the previous version built one on every rerun and never used it).
    predict_coll=st.button('Predict', key='predict_collaborative')
    if predict_coll:
        coll_output = recommend_svd(UserId)
        st.header("Recommended Places")
        st.write(coll_output)

if options=='Content-Based':
    # Imported here so scikit-learn is only loaded when this mode is selected.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    @st.cache_resource
    def build_similarity(frame):
        """Build the place catalogue and its pairwise cosine-similarity matrix.

        Cached so the TF-IDF fit and the similarity matrix are not recomputed
        on every script rerun. Returns ``(catalogue, similarity)``; callers
        must treat both as read-only because the cached objects are shared.
        """
        catalogue = frame[['CityName','VisitMode','AttractionType','Attraction','Country']].value_counts().reset_index()
        catalogue['rec_data'] =  catalogue['AttractionType'] + ' ' + catalogue['Attraction'] + ' ' + catalogue['Country']
        tfidf_matrix = TfidfVectorizer().fit_transform(catalogue['rec_data'])
        return catalogue, cosine_similarity(tfidf_matrix, tfidf_matrix)

    content_rec, co_sim = build_similarity(data)

    def recommend_place(AttractionType):
        """Return CityName/VisitMode/Country of the (at most) 50 catalogue rows
        most similar to the first row of the given attraction type; an empty
        frame when no row has that type."""
        output_columns = ['CityName','VisitMode','Country']
        # Positional row numbers, because co_sim and .iloc are both positional.
        matches = (content_rec['AttractionType']==AttractionType).to_numpy().nonzero()[0]
        if len(matches) == 0:
            return content_rec.iloc[0:0][output_columns]
        ranked = sorted(enumerate(co_sim[matches[0]]),key=lambda pair:pair[1],reverse=True)[:50]
        return content_rec.iloc[[position for position, _ in ranked]][output_columns]

    att=data['AttractionType'].unique()
    att_input=st.selectbox('SelectAttractionType',att)
    predict_content = st.button('Predict', key='predict_content')
    if predict_content:
        att_output=recommend_place(att_input)
        st.header("Recommended Places")
        st.write(att_output)

st.sidebar.header("Tourist Place Directory")

# Option lists for the sidebar filters. Only the three lists that feed a
# widget are built; the continent and attraction lists were never used.
country_list = sorted(main_menu['Country'].unique())
visitmonth_list = sorted(main_menu['VisitMonth'].unique())
visitmode_list = main_menu['VisitMode'].unique()


country = st.sidebar.selectbox("Country",country_list)
visitmonth = st.sidebar.selectbox("VisitMonth",visitmonth_list)
visitmode = st.sidebar.selectbox("VisitMode",visitmode_list)

show = st.sidebar.button('Show Places')
if show:
    # Keep the rows that match all three sidebar selections.
    selected_rows = (
        main_menu['Country'].eq(country)
        & main_menu['VisitMonth'].eq(visitmonth)
        & main_menu['VisitMode'].eq(visitmode)
    )
    filtered_data = main_menu[selected_rows]
    st.header("Recommended Places")
    st.write(filtered_data)


Overwriting app.py


In [None]:
# Install the localtunnel npm package that the launch command below runs via npx.
!npm install localtunnel

In [None]:
# One shell line, three commands:
#   1. `streamlit run /content/app.py` runs in the background; stdout and
#      stderr are redirected to /content/logs.txt.
#   2. `npx localtunnel --port 8501` runs in the background for port 8501
#      (presumably the port Streamlit listens on; no port is set explicitly — confirm).
#   3. `curl ipv4.icanhazip.com` prints that service's response
#      (presumably this machine's public IPv4 address — confirm).
!streamlit run /content/app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com