In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Load dataset
train = pd.read_csv(r"C:\Users\Ayush Pandita\OneDrive\Desktop\YR 3 SEM 6\AOML\ASS3\data2\train.csv")
test = pd.read_csv(r"C:\Users\Ayush Pandita\OneDrive\Desktop\YR 3 SEM 6\AOML\ASS3\data2\test.csv")

# Sample a smaller subset for memory efficiency
train_sample = train.sample(n=min(30000, len(train)), random_state=42)

# Target variable
y = train_sample['price']
X = train_sample.drop(columns=['id', 'price'])
test_ids = test['id']
test = test.drop(columns=['id'])

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Label Encoding for categorical variables (Ensuring consistency)
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    test[col] = le.transform(test[col].astype(str))  # Use same encoder for test data

# Handling missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test)

# Train-validation split without stratification
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Baseline Model (Random Forest)
rf_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=2)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict_proba(X_valid)

# Compute AUC score properly
print("Random Forest AUC:", roc_auc_score(pd.get_dummies(y_valid), rf_preds, multi_class='ovr'))

# Hyperparameter tuning (Randomized Search)
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=2),
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    n_jobs=2
)

rf_random.fit(X_train, y_train)
print("Best parameters (RF):", rf_random.best_params_)

# Train XGBoost with optimized parameters
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=2
)

xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict_proba(X_valid)

# Compute AUC score for XGBoost
print("XGBoost AUC:", roc_auc_score(pd.get_dummies(y_valid), xgb_preds, multi_class='ovr'))

# Predict on test set
final_preds = xgb_model.predict_proba(test_scaled)[:, 1]
submission = pd.DataFrame({"id": test_ids, "target": final_preds})
submission.to_csv("submission.csv", index=False)
print("Submission file saved!")


IndexError: index 3399 is out of bounds for axis 1 with size 3399