In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

In [41]:
# Load the raw trek dataset. If the CSV is missing, raise instead of calling
# exit(): inside a notebook exit() asks the kernel to exit rather than
# reporting an ordinary error, whereas an exception stops "Run All" at this
# cell with a readable message.
dataset_path = "trek_dataset_with_current_weather.csv"
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: Dataset file '{dataset_path}' not found. Please provide the correct path."
    ) from err

In [42]:
# Discard link/image columns that carry no modelling signal; missing columns
# are tolerated so the cell can be re-run safely.
df = df.drop(columns=['link_AllTrails', 'image'], errors='ignore')

In [43]:
# Pull the first number (integer or decimal) out of the free-text 'Length'
# column and store it as a float; rows without a number become NaN.
length_text = df['Length'].astype(str)
length_number = length_text.str.extract(r'(\d+\.?\d*)')[0]
df['Length (in km)'] = pd.to_numeric(length_number, errors='coerce')

In [44]:

# Coerce Est_time to numeric first, then impute with the mean of the coerced
# values. Taking the mean of the raw column would fail (or be meaningless) in
# exactly the case coercion is needed, i.e. when it holds non-numeric strings.
est_time_numeric = pd.to_numeric(df['Est_time'], errors='coerce')
df['Est_time'] = est_time_numeric.fillna(est_time_numeric.mean())

# Review counts are stored as magnitudes; strip any negative sign.
df['number_of_reviews'] = df['number_of_reviews'].abs()

# Missing ratings are replaced by the column mean.
df['Average_rating'] = df['Average_rating'].fillna(df['Average_rating'].mean())

In [45]:
# Location/weather columns that must be numeric for modelling.
weather_cols = ['latitude', 'longitude', 'current_temperature', 'current_windspeed',
                'current_winddirection', 'current_weather_code']
for col in weather_cols:
    # Impute with the mean of the *coerced* values; the raw column may still
    # contain strings, for which .mean() would raise or be meaningless.
    numeric_values = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric_values.fillna(numeric_values.mean())

In [46]:
# Break the comma-separated 'Location' text into its three components and
# trim surrounding whitespace from each piece.
location_parts = df['Location'].str.split(',', expand=True)
df[['City', 'State', 'Country']] = location_parts
for part_name in ('City', 'State', 'Country'):
    df[part_name] = df[part_name].str.strip()
df = df.drop(columns='Location')

# Recommended trekking window per state; states not listed here fall back to
# 'All Year' below.
season_mapping = {
    'Himachal Pradesh': 'April - June, September - November',
    'Uttarakhand': 'March - June, September - November',
    'Maharashtra': 'October - February',
    'Karnataka': 'October - February',
    'Kerala': 'September - March',
    'Jammu and Kashmir': 'May - October',
    'West Bengal': 'October - March',
    'Tamil Nadu': 'November - February',
    'Goa': 'November - February'
}
best_season = df['State'].map(season_mapping)
df['Best_Season'] = best_season.fillna('All Year')

In [47]:
# Convert Tags to a list of normalized tags for MultiLabelBinarizer.
# - Missing Tags become an empty list instead of crashing the list-comprehension.
# - Splitting on ',' (and stripping each piece) also handles tags that are
#   separated by a comma without a following space.
# - Empty pieces are dropped so '' never becomes a tag class.
df['tags_list'] = (
    df['Tags']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.split(',')
    .apply(lambda parts: [tag.strip() for tag in parts if tag.strip()])
)

In [None]:
# Prepare the feature matrix X and the target y.

# Multi-hot encode the tag lists. The binarizer is fitted first and then used
# to transform, and is kept as a global because it is pickled and reused later.
mlb = MultiLabelBinarizer()
mlb.fit(df['tags_list'])
tags_encoded = mlb.transform(df['tags_list'])

# The encoders return plain arrays. Attach df's index explicitly: pd.concat
# with axis=1 aligns on index labels, so a default 0..n-1 index on the encoded
# frames would produce NaN-padded, misaligned rows if df's index is ever not a
# fresh RangeIndex (e.g. after filtering rows).
tags_df = pd.DataFrame(tags_encoded, columns=mlb.classes_, index=df.index)

# One-hot encode the categorical columns; unknown categories seen later at
# transform time are encoded as all zeros (handle_unknown='ignore').
categorical_cols = ['Difficulty', 'Best_Season', 'State']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = ohe.fit_transform(df[categorical_cols])
cat_df = pd.DataFrame(
    cat_encoded,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=df.index,
)

numerical_cols = ['Length (in km)', 'current_windspeed', 'number_of_reviews',
                  'Est_time', 'current_temperature', 'current_weather_code']

# Final design matrix: numeric columns, then one-hot categoricals, then tags.
X = pd.concat([df[numerical_cols], cat_df, tags_df], axis=1)
# Target: the trail name itself.
y = df['Trail_name']

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [50]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with class weights balanced by label frequency.
forest_params = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
}
trail_classifier = RandomForestClassifier(**forest_params)
trail_classifier.fit(X_train, y_train)


In [51]:
# Predict a trail label for every row of the held-out test features.
y_pred = trail_classifier.predict(X_test)

In [52]:
# Share of held-out rows whose predicted trail equals the true trail.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model trained. Accuracy: {test_accuracy:.4f}")


Model trained. Accuracy: 0.1714


In [54]:
# Remember the exact training column order so inference inputs can be aligned
# to it.
columns = list(X.columns)

In [55]:
import joblib

In [56]:
# Persist the fitted model, both encoders and the training column order, each
# under its own pickle file, then write the cleaned dataset next to them.
artifacts = {
    'model.pkl': trail_classifier,
    'ohe.pkl': ohe,
    'mlb.pkl': mlb,
    'columns.pkl': columns,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

df.to_csv("processed_trek_data.csv", index=False)
print("✅ Model, encoders, and processed data saved.")

✅ Model, encoders, and processed data saved.


In [57]:
def preprocess_user_input(user_input):
    """Turn a dict of user preferences into a one-row model input frame.

    Relies on the notebook globals ``df`` (for mean imputation), ``ohe`` and
    ``mlb`` (fitted encoders) and ``columns`` (training column order).

    Args:
        user_input: dict with keys 'Difficulty', 'Best_Season', 'State',
            'Length', 'windspeed', 'temperature' and 'tags_list'.
            'tags_list' may be a list of tags or a comma-separated string.

    Returns:
        tuple: ``(X_input, None)`` on success, where ``X_input`` is a one-row
        DataFrame whose columns match ``columns`` exactly, or
        ``(None, message)`` if the categorical encoding fails.
    """
    categorical_cols = ['Difficulty', 'Best_Season', 'State']
    numeric_cols = ['Length (in km)', 'current_windspeed', 'current_temperature',
                    'number_of_reviews', 'Est_time', 'current_weather_code']

    # Features the user does not supply are filled with the dataset mean.
    input_data = pd.DataFrame({
        'Difficulty': [user_input['Difficulty']],
        'Best_Season': [user_input['Best_Season']],
        'State': [user_input['State']],
        'Length (in km)': [user_input['Length']],
        'current_windspeed': [user_input['windspeed']],
        'current_temperature': [user_input['temperature']],
        'number_of_reviews': [df['number_of_reviews'].mean()],
        'Est_time': [df['Est_time'].mean()],
        'current_weather_code': [df['current_weather_code'].mean()]
    })

    try:
        encoded_cat = pd.DataFrame(
            ohe.transform(input_data[categorical_cols]),
            columns=ohe.get_feature_names_out(categorical_cols)
        )
    except Exception as e:
        return None, f"Encoding error: {str(e)}"

    # Normalize tags the same way the training tags were normalized
    # (lower-cased and stripped); otherwise e.g. "Hiking" would not match the
    # fitted class "hiking" and would be silently ignored by the binarizer.
    raw_tags = user_input['tags_list']
    if isinstance(raw_tags, str):
        raw_tags = raw_tags.split(',')
    normalized_tags = [str(tag).strip().lower() for tag in (raw_tags or [])]
    normalized_tags = [tag for tag in normalized_tags if tag]

    tags_encoded = pd.DataFrame(
        mlb.transform([normalized_tags]),
        columns=mlb.classes_
    )

    X_input = pd.concat([input_data[numeric_cols], encoded_cat, tags_encoded], axis=1)

    # Align to the training layout: add any missing column as 0, drop extras,
    # and put the columns in the order the model was trained on.
    X_input = X_input.reindex(columns=columns, fill_value=0)
    return X_input, None


In [58]:
def predict_trek(state, difficulty, length, temperature, windspeed, tags, season=None):
    """Recommend the trek that best matches the user's preferences.

    The fitted ``trail_classifier`` ranks trails by predicted probability. The
    20 most probable trails are filtered to the requested state and
    difficulty, and the remaining trail closest to the requested length and
    wind speed is returned.

    Args:
        state: State to search in; trails from other states are discarded.
        difficulty: Required difficulty; trails with another value are discarded.
        length: Desired trail length, compared against 'Length (in km)'.
        temperature: Temperature passed to the model as 'current_temperature'.
        windspeed: Wind speed passed to the model and used in the similarity score.
        tags: List of tags; ``['hiking']`` is used when empty or None.
        season: Optional season string. When omitted, it is looked up in
            ``season_mapping`` for ``state`` ('All Year' if the state is unmapped).

    Returns:
        dict: on success, the trail's details plus "confidence" (the model
        probability as a percentage rounded to 2 decimals) and a "warnings"
        list; otherwise ``{"error": message}``.
    """
    user_input = {
        'State': state,
        'Best_Season': season_mapping.get(state, 'All Year') if not season else season,
        'Difficulty': difficulty,
        'tags_list': tags if tags else ['hiking'],
        'Length': length,
        'windspeed': windspeed,
        'temperature': temperature
    }

    X_input, error = preprocess_user_input(user_input)
    if error:
        return {"error": error}

    try:
        # `trail_classifier` is the estimator fitted in the training cell.
        proba = trail_classifier.predict_proba(X_input)[0]
        trail_names = trail_classifier.classes_
        top_indices = np.argsort(proba)[-20:][::-1]  # Top 20 candidates, most probable first

        closest_match = None
        closest_score = float('inf')
        best_proba = 0
        warnings = []

        for idx in top_indices:
            trail = trail_names[idx]
            row = df[df['Trail_name'] == trail]

            if row.empty:
                continue
            row = row.iloc[0]

            # Hard filters: the recommendation must match state and difficulty.
            if row['State'] != state:
                continue
            if row['Difficulty'] != difficulty:
                continue

            # Similarity score (lower is better); wind speed counts half as
            # much as length. A NaN score never compares as smaller, so rows
            # with missing values are never selected.
            length_diff = abs(row['Length (in km)'] - length)
            wind_diff = abs(row['current_windspeed'] - windspeed)
            score = length_diff + 0.5 * wind_diff

            if score < closest_score:
                closest_score = score
                closest_match = row
                best_proba = proba[idx]

        if closest_match is not None:
            # Mismatch warnings. No difficulty warning is needed: candidates
            # with a different difficulty were already filtered out above.
            if abs(closest_match['Length (in km)'] - length) > 5:
                warnings.append("⚠️ Trail length differs by over 5 km from your input.")
            if abs(closest_match['current_windspeed'] - windspeed) > 10:
                warnings.append("⚠️ Wind speed differs significantly from your input.")

            return {
                "trail_name": closest_match['Trail_name'],
                "difficulty": closest_match['Difficulty'],
                "length_km": closest_match['Length (in km)'],
                "best_season": closest_match['Best_Season'],
                "state": closest_match['State'],
                "tags": closest_match['Tags'],
                "windspeed": closest_match['current_windspeed'],
                "temperature": closest_match['current_temperature'],
                "description": closest_match['description'],
                "confidence": round(best_proba * 100, 2),
                "warnings": warnings
            }

        return {"error": f"No suitable trek found in {state}. Try different inputs."}
    except Exception as e:
        return {"error": f"Prediction failed: {str(e)}"}
