In [None]:
# Import libraries for data analysis and visualization (pandas, numpy, matplotlib, seaborn)
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the full tourism CSV export into a Pandas DataFrame.
# NOTE(review): the location below is an absolute, machine-specific path.
tourism_csv_path = r"D:\TRANSACTION PROJECT\Full Tourism Data.csv"
Tourism_df = pd.read_csv(tourism_csv_path)

In [None]:
# Preview the top two records of the tourism dataset
Tourism_df.head(n=2)

In [None]:
# Total number of rows that repeat an earlier row (keep='first' is the pandas default)
Tourism_df.duplicated(keep='first').sum()

In [None]:
# Per-column count of missing entries in the tourism dataset
Tourism_df.isnull().sum()

In [None]:
# Apply SMOTE to balance the dataset after encoding categorical features
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

# Target: integer-encode the 'VisitModeName' labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(Tourism_df['VisitModeName'])

# Features: remove the excluded columns (the target is among them), then
# one-hot encode whatever categorical columns remain. drop_first=True drops
# the first level of each encoded column.
X = pd.get_dummies(
    Tourism_df.drop(columns=[
        'TransactionId', 'Rating', 'ContenentId', 'RegionId',
        'CountryId', 'CityId', 'AttractionAddress',
        'AttractionTypeId', 'VisitModeName',
    ]),
    drop_first=True,
)

# Oversample with SMOTE; the fixed seed keeps the synthetic rows reproducible.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Wrap the resampled labels in a one-column frame and attach them to the
# resampled features.
y_smote = pd.Series(y_smote, name='VisitModeName').to_frame()
balanced_tourism_data = pd.concat([X_smote, y_smote], axis=1)

# Report the class counts before and after resampling.
for heading, labels in (
    ("Before SMOTE:", pd.Series(y)),
    ("\nAfter SMOTE:", y_smote['VisitModeName']),
):
    print(heading)
    print(labels.value_counts())


In [None]:
# Preview the first five rows of the SMOTE-balanced dataset
balanced_tourism_data.head(n=5)

In [None]:
# Import libraries for data preprocessing, model training, and evaluation (train_test_split, OneHotEncoder, StandardScaler, DecisionTreeClassifier, and classification metrics)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [None]:
# Train and evaluate Decision Tree and Random Forest classifiers on balanced data
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Split the ORIGINAL encoded data (80% Train, 20% Test), stratified on the label.
# The split must happen before oversampling: SMOTE builds synthetic rows by
# interpolating between real rows, so resampling the whole dataset first puts
# near-copies of test rows into the training set and inflates every score.
# Splitting first also keeps the real class distribution in the test set.
# `X` / `y` are the one-hot encoded features and label-encoded target built
# in the SMOTE cell above.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 1b: Balance the training portion only
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

# Step 2: Train Decision Tree Model
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# Step 3: Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Make Predictions on the untouched test split
dt_pred = dt_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

# Step 5: Evaluate Models
def evaluate_model(model_name, y_true, y_pred):
    """Print accuracy plus weighted precision, recall and F1 for one model.

    Args:
        model_name: Label shown in the report header.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        None. The scores are written to stdout with four decimal places.
    """
    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # The remaining metrics share the same call shape and 'weighted' averaging.
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for metric_label, metric_fn in weighted_metrics:
        print(f"{metric_label}: {metric_fn(y_true, y_pred, average='weighted'):.4f}")

# Report test-split metrics for both fitted classifiers
for model_label, model_predictions in (
    ("Decision Tree", dt_pred),
    ("Random Forest", rf_pred),
):
    evaluate_model(model_label, y_test, model_predictions)


In [91]:
# Splitting into training and testing sets.
# The split is made on the original encoded data (`X`, `y` from the SMOTE
# cell) and stratified on the label, and only the training portion is then
# oversampled. Splitting data that was already resampled would leak
# SMOTE-interpolated neighbours of test rows into training and would score
# the models on an artificial class distribution.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)

In [92]:
# Fit a depth-limited decision tree on the training split
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Predict the encoded visit-mode label for every test row
dt_pred = dt_model.predict(X_test)

In [None]:
# Show the decision tree's predicted (label-encoded) classes for the test split
print(dt_pred)

In [None]:
# Score the decision tree predictions against the test labels.
# Precision and recall use average='weighted' because the target is multi-class.
accuracy = accuracy_score(y_test, dt_pred)
precision = precision_score(y_test, dt_pred, average='weighted')
recall = recall_score(y_test, dt_pred, average='weighted')

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_label}: {metric_value:.4f}")



In [43]:
import joblib

In [None]:
# Persist the fitted decision tree to disk with joblib.
# NOTE(review): the target is an absolute, machine-specific path.
dt_model_path = r"D:\TRANSACTION PROJECT\dc_model.pkl"
joblib.dump(dt_model, dt_model_path)

In [95]:
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Train Random Forest Model (same approach)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# Flatten the labels to shape (n_samples,). `y_train` can arrive as a
# one-column DataFrame, for which RandomForestClassifier.fit emits a
# DataConversionWarning ("A column-vector y was passed when a 1d array was
# expected"). np.ravel leaves an already 1-D array unchanged.
rf_model.fit(X_train, np.ravel(y_train))

# Predict using Random Forest
rf_pred = rf_model.predict(X_test)


In [None]:
# Display the random forest's predicted (label-encoded) classes for the test split
rf_pred

In [None]:
# Score the random forest predictions against the test labels.
# Precision and recall use average='weighted' because the target is multi-class.
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred, average='weighted')
recall = recall_score(y_test, rf_pred, average='weighted')

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_label}: {metric_value:.4f}")

In [99]:
import xgboost

In [100]:
from xgboost import XGBClassifier

# Steps 1-2: Configure the XGBoost classifier and fit it on the training split
# (fit returns the estimator itself, so the fitted model is bound directly).
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.2,
    random_state=42,
).fit(X_train, y_train)

# Step 3: Predict labels for the test split
xgb_pred = xgb_model.predict(X_test)

In [None]:
# Step 4: Evaluate the Model
# `evaluate_model` is already defined in the Decision Tree / Random Forest
# training cell earlier in this notebook. Redefining it here only created a
# second copy that silently shadows the first and can drift from it, so the
# existing function is reused.
evaluate_model("XGBoost", y_test, xgb_pred)