In [6]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# ------------------------------
# Load and Preprocess Dataset
# ------------------------------
# Read the raw data and keep only the modelling columns: the two predictors
# (Rating, Revenue) and the target (Visitors). Selecting by name already
# discards 'Location' and every other column, so no separate drop is needed.
# .copy() yields an independent frame that is safe to assign into below.
df = pd.read_csv('tourism_dataset.csv')
df = df[['Rating', 'Revenue', 'Visitors']].copy()

# Fail early with a clear message: log1p yields NaN/-inf for values <= -1, and
# missing values would otherwise only surface later as an estimator error.
log_columns = ['Revenue', 'Visitors']
if df.isna().any().any():
    raise ValueError("tourism_dataset.csv has missing values in Rating/Revenue/Visitors")
if (df[log_columns] <= -1).any().any():
    raise ValueError("Revenue and Visitors must be greater than -1 to apply log1p")

# Log-transform visitors and revenue (log(1 + x))
df[log_columns] = np.log1p(df[log_columns])

# Define features and target
X = df[['Rating', 'Revenue']]
y = df['Visitors']

# Split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Min-max scale features and target. Both scalers are fitted on the training
# split only, so test values may fall slightly outside the training range.
scaler_X = MinMaxScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

# NOTE: the `_std` suffix is kept for compatibility with later cells; the values
# are min-max scaled, not standardized. MinMaxScaler expects 2-D input, hence
# the reshape to a single column and the flatten back to 1-D.
scaler_y = MinMaxScaler()
y_train_std = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_std = scaler_y.transform(y_test.values.reshape(-1, 1)).flatten()

# ------------------------------
# Train Models
# ------------------------------

def fit_and_score(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation split, and report its MSE.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_fit, y_fit)``.
    label : str
        Display name used in the printed report line.
    X_fit, y_fit : array-like
        Training features and target.
    X_eval, y_eval : array-like
        Held-out features and target used for scoring.

    Returns
    -------
    tuple
        ``(predictions, mse)`` where ``predictions`` is ``model.predict(X_eval)``
        and ``mse`` is the mean squared error of those predictions against ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    print(f"{label} MSE: {mse}")
    return predictions, mse


# All three models share the same splits. The target is the min-max-scaled
# log1p(Visitors), so every MSE below is expressed in that scaled space.
splits = (X_train, y_train_std, X_test, y_test_std)

# Linear Regression
lin_reg = LinearRegression()
y_pred_lin, mse_lin = fit_and_score(lin_reg, "Linear Regression", *splits)

# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
y_pred_dt, mse_dt = fit_and_score(dt, "Decision Tree", *splits)

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf, mse_rf = fit_and_score(rf, "Random Forest", *splits)

# ------------------------------
# Compare Models and Select the Best
# ------------------------------
# One row per candidate: (display name, fitted estimator, test-set MSE).
# Order matters: on an exact tie the earliest entry wins.
candidates = [
    ("Linear Regression", lin_reg, mse_lin),
    ("Decision Tree", dt, mse_dt),
    ("Random Forest", rf, mse_rf),
]

# Name -> MSE lookup, in the same order as the candidate list.
mse_scores = {label: score for label, _, score in candidates}

# Lowest test MSE wins; unpack the winning row into its name and estimator.
best_model_name, best_model, _ = min(candidates, key=lambda row: row[2])

print(f"Best Model: {best_model_name} with MSE: {mse_scores[best_model_name]:.6f}")


Linear Regression MSE: 0.023743330450740754
Decision Tree MSE: 0.04532157486247677
Random Forest MSE: 0.027552087377501616
Best Model: Linear Regression with MSE: 0.023743


In [51]:
import joblib

# Persist the selected estimator together with both fitted scalers.
# The pickles do not capture the log1p transform applied to Revenue and
# Visitors during preprocessing, so inference code must apply log1p to
# Revenue before scaler_X and undo the target with scaler_y.inverse_transform
# followed by expm1.
artifacts = {
    'best_model.pkl': best_model,
    'scaler_X.pkl': scaler_X,
    'scaler_y.pkl': scaler_y,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Models saved successfully!")


Models saved successfully!


In [52]:
# Download the saved artifacts through the browser when running in Google Colab.
# `google.colab` only exists inside the Colab runtime, so the import is guarded
# to keep "Restart & Run All" working in a plain Jupyter environment.
try:
    from google.colab import files
except ImportError:
    files = None  # not running in Colab

ARTIFACT_FILES = ("best_model.pkl", "scaler_X.pkl", "scaler_y.pkl")

if files is not None:
    for artifact in ARTIFACT_FILES:
        files.download(artifact)
else:
    # The artifacts were written with relative paths, so they are already in
    # the current working directory; there is nothing to download.
    print("Not running in Google Colab; artifacts remain in the working directory.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>