In [13]:
# --- CELL 1: IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Advanced Machine Learning Libraries
from sklearn.feature_extraction.text import TfidfVectorizer # For understanding text descriptions
from sklearn.metrics.pairwise import cosine_similarity # For finding similarities between cities
from sklearn.cluster import KMeans # For grouping cities by budget (Budget vs Luxury)
from sklearn.preprocessing import StandardScaler # To scale prices for fair comparison

print("‚úÖ Libraries Imported Successfully!")

‚úÖ Libraries Imported Successfully!


In [14]:
# --- CELL 2: LOAD RAW DATASETS ---
print("üöÄ Loading Data...")

# Read the five raw CSV files from the working directory, in a fixed order,
# and bind each resulting DataFrame to its notebook-level name.
df_city, df_hotel, df_food, df_places, df_transport = (
    pd.read_csv(csv_name)
    for csv_name in ('City.csv', 'Hotel.csv', 'Food.csv', 'Places.csv', 'Transport.csv')
)

print("‚úÖ Data Loaded Successfully!")
# Report (rows, columns) for every frame as a quick sanity check of the load.
print(
    f"Cities: {df_city.shape}, Hotels: {df_hotel.shape}, Food: {df_food.shape}",
    f"Places: {df_places.shape}, Transport: {df_transport.shape}",
    sep="\n",
)

üöÄ Loading Data...
‚úÖ Data Loaded Successfully!
Cities: (534, 8), Hotels: (1102, 13), Food: (1000, 4)
Places: (381, 18), Transport: (756, 4)


In [15]:
# --- CELL 3: DATA CLEANING & STANDARDIZATION ---
print("üßπ Starting Data Cleaning Process...")

def clean_text(text):
    """Normalise one cell value: trim surrounding whitespace and Title-Case it.

    Args:
        text: The value to normalise. Only ``str`` instances are transformed.

    Returns:
        The stripped, title-cased string when ``text`` is a ``str``; otherwise
        ``text`` itself, unchanged (so NaN / numeric cells pass straight through).
    """
    # Guard clause: anything that is not a string is returned untouched.
    if not isinstance(text, str):
        return text
    trimmed = text.strip()
    return trimmed.title()

# 1. Clean Column Names (Remove spaces like ' City ' -> 'City')
for df in [df_city, df_hotel, df_food, df_places, df_transport]:
    df.columns = df.columns.str.strip()

# 2. Clean String Data in Key Columns (Apply the function)
# Each entry pairs a frame with the text columns that are normalised via
# clean_text (strip + Title Case).
key_text_columns = [
    (df_city, ['City', 'State']),
    (df_hotel, ['City', 'Hotel_Name']),
    (df_food, ['State']),
    (df_places, ['City', 'State']),
    (df_transport, ['From_State', 'To_State']),
]
for frame, columns in key_text_columns:
    for column in columns:
        frame[column] = frame[column].apply(clean_text)

# 3. Remove Duplicates (Strict Internship Rule)
# De-duplication runs after text normalisation so rows that only differed in
# spacing/casing of the cleaned columns collapse together. The index is rebuilt
# afterwards: dropping rows leaves gaps in the index, and later cells that treat
# `df.index` as a row position (e.g. to look up rows of a similarity matrix)
# would otherwise point at the wrong row.
df_city = df_city.drop_duplicates().reset_index(drop=True)
df_hotel = df_hotel.drop_duplicates().reset_index(drop=True)
df_food = df_food.drop_duplicates().reset_index(drop=True)
df_places = df_places.drop_duplicates().reset_index(drop=True)
df_transport = df_transport.drop_duplicates().reset_index(drop=True)

print("‚úÖ Data Cleaning & Deduplication Complete.")

üßπ Starting Data Cleaning Process...
‚úÖ Data Cleaning & Deduplication Complete.


In [16]:
# --- CELL 4: HANDLING MISSING VALUES ---

# 1. Hotels: Fill missing prices with the median price of that specific city
# If the city has no other priced hotels, use the global median.
# `transform('median')` returns one value per hotel row (its city's median), so it
# aligns with the price column. Cities where every price is missing yield NaN
# there, which the second fillna covers with the global median.
city_median_price = df_hotel.groupby('City')['Hotel_Price'].transform('median')
df_hotel['Hotel_Price'] = (
    df_hotel['Hotel_Price']
    .fillna(city_median_price)
    .fillna(df_hotel['Hotel_Price'].median())
)

# 2. Places: Fill missing ratings with a neutral 4.0
df_places['Google review rating'] = df_places['Google review rating'].fillna(4.0)

# 3. City: Fill missing descriptions with a generic placeholder sentence so the
# column contains no NaN values.
df_city['City_desc'] = df_city['City_desc'].fillna("Beautiful destination to visit.")

print("‚úÖ Missing Values Handled.")

‚úÖ Missing Values Handled.


In [17]:
# --- CELL 5: TRAINING THE RECOMMENDATION ENGINE (CONTENT-BASED) ---
print("üß† Training NLP Recommendation Model...")

# Initialize TF-IDF Vectorizer (Stop words removes 'the', 'is', 'and')
tfidf = TfidfVectorizer(stop_words='english')

# Fit and Transform the City Descriptions (one matrix row per row of df_city)
tfidf_matrix = tfidf.fit_transform(df_city['City_desc'])

# Compute Cosine Similarity (The Angle between vectors = Similarity)
# Result is square: entry [i, j] compares the i-th and j-th rows of df_city.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a mapping of City Name to row position in cosine_sim (for easy lookup later).
# Positions (0..n-1) are used rather than df_city.index, because the index may
# have gaps after de-duplication while the matrix is strictly positional.
# Duplicates are dropped on the city NAME (the Series index); calling
# Series.drop_duplicates() would compare the position values instead, which are
# always unique, and so would remove nothing. The first occurrence of a city wins.
city_positions = pd.Series(np.arange(len(df_city)), index=df_city['City'])
indices = city_positions[~city_positions.index.duplicated(keep='first')]

print("‚úÖ Similarity Model Trained.")

üß† Training NLP Recommendation Model...
‚úÖ Similarity Model Trained.


In [18]:
# --- CELL 6: UNSUPERVISED LEARNING (K-MEANS CLUSTERING) ---
print("üìä Performing K-Means Clustering on Budget...")

# 1. Aggregate Hotel Prices by City (Average cost to stay in each city)
city_prices = df_hotel.groupby('City')['Hotel_Price'].mean().reset_index()

# 2. Merge this price info into the main City dataset
# Left join keeps every city; cities without any hotel rows get NaN and are then
# assigned the median of the per-city average prices.
df_city_ml = pd.merge(df_city, city_prices, on='City', how='left')
df_city_ml['Hotel_Price'] = df_city_ml['Hotel_Price'].fillna(df_city_ml['Hotel_Price'].median())

# 3. Prepare Data for Clustering (single feature: average hotel price, standardised)
X = df_city_ml[['Hotel_Price']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Apply K-Means (3 Clusters: Low, Medium, High Cost)
# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 -> 'auto'), so leaving it implicit can yield different clusters (and a
# FutureWarning) depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_city_ml['Budget_Cluster'] = kmeans.fit_predict(X_scaled)

# Label the clusters (0, 1, 2 might be random, so we sort them by price to name them correctly)
# cluster_mapping lists cluster ids from cheapest to most expensive mean price.
# zip pairs them with the labels in that order and tolerates fewer than three
# populated clusters instead of raising IndexError.
cluster_mapping = df_city_ml.groupby('Budget_Cluster')['Hotel_Price'].mean().sort_values().index
label_map = dict(zip(cluster_mapping, ['Budget Friendly', 'Standard', 'Luxury']))
df_city_ml['Budget_Category'] = df_city_ml['Budget_Cluster'].map(label_map)

print("‚úÖ Clustering Complete. Cities categorized by budget.")

üìä Performing K-Means Clustering on Budget...
‚úÖ Clustering Complete. Cities categorized by budget.


In [19]:
# 1. Load Data (file names must match exactly and live in the working directory)
# NOTE: this cell re-reads the raw CSVs and rebinds df_city / df_hotel / df_food /
# df_places, replacing the frames prepared by the earlier cells.
try:
    df_city = pd.read_csv('City.csv')
    df_hotel = pd.read_csv('Hotel.csv')
    df_food = pd.read_csv('Food.csv')
    df_places = pd.read_csv('Places.csv')
    print("‚úÖ Raw files loaded.")
except FileNotFoundError as e:
    print(f"‚ùå ERROR: Could not find file. Make sure City.csv, Hotel.csv, etc. are in the folder. Details: {e}")
    # Stop here: continuing would either fail with NameError on a fresh kernel or
    # silently clean and export stale frames left over from earlier cells.
    raise

# 2. Clean Column Names (Strip spaces)
for df in [df_city, df_hotel, df_food, df_places]:
    df.columns = df.columns.str.strip()

# 3. Force Numeric Prices (Fixes "String" errors)
# Unparseable or missing values become NaN and are then replaced by a fixed
# default (2000 for hotels, 150 for food).
df_hotel['Hotel_Price'] = pd.to_numeric(df_hotel['Hotel_Price'], errors='coerce').fillna(2000)
df_food['Price'] = pd.to_numeric(df_food['Price'], errors='coerce').fillna(150)

# 4. Standardize Text (trim whitespace, Title Case)
df_city['City'] = df_city['City'].str.strip().str.title()
df_city['State'] = df_city['State'].str.strip().str.title()
df_hotel['City'] = df_hotel['City'].str.strip().str.title()
df_places['City'] = df_places['City'].str.strip().str.title()

# 5. Advanced ML: Generate Similarity Matrix
print("üß† Retraining ML Model...")
tfidf = TfidfVectorizer(stop_words='english')
# Missing descriptions become empty strings so the vectorizer receives only text.
df_city['City_desc'] = df_city['City_desc'].fillna('')
tfidf_matrix = tfidf.fit_transform(df_city['City_desc'])
# Row/column i of `similarity` corresponds to row i of df_city as saved below.
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 6. Save Clean Files (The Bridge)
print("üíæ Saving Clean Files...")
df_city.to_csv('clean_city.csv', index=False)
df_hotel.to_csv('clean_hotel.csv', index=False)
df_food.to_csv('clean_food.csv', index=False)
df_places.to_csv('clean_places.csv', index=False)

with open('similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)

print("üéâ FIX COMPLETE. Now run app.py")

‚úÖ Raw files loaded.
üß† Retraining ML Model...
üíæ Saving Clean Files...
üéâ FIX COMPLETE. Now run app.py


In [20]:
# --- RUN THIS TO ADD TRANSPORT DATA ---
import pandas as pd
try:
    # Read the raw transport table, normalise its headers and the two state
    # columns (trim + Title Case), then write the cleaned copy next to it.
    df_trans = pd.read_csv('Transport.csv')
    df_trans.columns = df_trans.columns.str.strip()
    df_trans['From_State'] = df_trans['From_State'].str.strip().str.title()
    df_trans['To_State'] = df_trans['To_State'].str.strip().str.title()
    df_trans.to_csv('clean_transport.csv', index=False)
    print("‚úÖ Transport Data Ready.")
except FileNotFoundError:
    # Only a missing file is treated as optional. Any other failure (missing
    # column, malformed CSV, write error) now surfaces instead of being
    # reported as "not found".
    print("‚ö†Ô∏è Transport.csv not found, proceeding without it.")

‚úÖ Transport Data Ready.
