In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Load the data
df = pd.read_csv("DATA/Travel.csv")

# 2. Rename the target column for convenience
df.rename(columns={"What is your mode of transportation?": "mode"}, inplace=True)

# 3. Drop rows with missing values (or handle them as needed)
df = df.dropna()

# 4. Separate the target and features
X = df.drop("mode", axis=1)
y = df["mode"]

# 5. Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# 6. Create preprocessing pipelines for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# 7. Build the model pipeline with a Random Forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))
])

# 8. Define hyperparameter grid for tuning
param_grid = {
    "clf__n_estimators": [50, 100, 200],
    "clf__max_depth": [None, 10, 20],
    "clf__min_samples_split": [2, 5, 10]
}

# 9. Set up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1)

# 10. Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 11. Fit the model using grid search
grid_search.fit(X_train, y_train)

# 12. Output the best parameters and performance metrics
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))
test_predictions = grid_search.predict(X_test)
print("Test set accuracy: {:.2f}".format(accuracy_score(y_test, test_predictions)))




Best parameters: {'clf__max_depth': None, 'clf__min_samples_split': 5, 'clf__n_estimators': 50}
Best cross-validation accuracy: 0.52
Test set accuracy: 0.50


In [2]:
print(df.columns)

Index(['Timestamp', 'Are you from Jaipur?', 'What is your age?',
       'What is your gender?',
       'What is your departure address (with pincode)?',
       'What is your destination address?  (with pincode)',
       'What is your mode of transportation?',
       'If Public Transport, Do you get seat availability easily?  ',
       'If personal vehicles do you find a parking space easily?',
       'How long do you wait for Public Transport?',
       'What time do you prefer to leave your house to work/school/university/hospital?',
       'How much time (minutes) do you take to travel ?',
       'What time do you prefer to leave your work/school/university/ hospital for home?',
       'How long time (minutes) do you travel?', 'Cost of petrol',
       'How many toll booth are there from your house to work/school/university/ hospital?',
       'Could you kindly specify the tolls?',
       'Could you kindly tell your normally driving mood during driving time?',
       'How many stops du