In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [70]:
# --- Read the raw training and test CSV files ---
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:/Users/sssso/Downloads/Hazard_train.csv',
        r'C:/Users/sssso/Downloads/Hazard_test_share.csv',
    )
)

In [73]:
# Tag every row with the frame it came from; the test frame additionally
# gets an all-NaN 'Hazard' column so both frames share the same columns.
train = train.assign(data='train')
test = test.assign(data='test', Hazard=np.nan)

In [75]:
 # Combine
# Row-wise stack; each frame keeps its own row labels (no renumbering).
all_data = pd.concat((train, test), axis='index', sort=False)


In [77]:
# Replace every object-typed column, other than the 'data' origin tag, with
# the integer codes of its pandas categorical representation.
cat_cols = all_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if col == 'data':
        continue
    all_data[col] = all_data[col].astype('category').cat.codes

In [79]:
# Number of rows carrying each origin tag.
origin_counts = all_data['data'].value_counts()
print(origin_counts)


data
train    40799
test     10200
Name: count, dtype: int64


In [81]:
# Separate the combined frame back into its train and test rows.
is_train = all_data['data'] == 'train'
is_test = all_data['data'] == 'test'
train_data = all_data[is_train]
test_data = all_data[is_test]

# Model inputs exclude the identifier, the target and the origin tag.
non_feature_cols = ['Id', 'Hazard', 'data']
x_train = train_data.drop(columns=non_feature_cols)
y_train = train_data['Hazard'].astype(float)
x_test = test_data.drop(columns=non_feature_cols)

In [83]:
# Report the dimensions of the feature matrices and of the target.
for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

x_train shape: (40799, 32), y_train shape: (40799,)
x_test shape: (10200, 32)


In [85]:
# --- Lasso: tune alpha by 5-fold cross-validated MAE ---
# Fail loudly on an empty training frame. Silently skipping the fit would
# leave `lasso_grid` undefined (or stale from an earlier run) for the cell
# that reports its results.
if x_train.shape[0] == 0:
    raise ValueError("x_train has no rows; cannot fit the Lasso grid search")

# 100 evenly spaced alpha candidates between 0.01 and 10 (inclusive).
lasso_params = {'alpha': np.linspace(0.01, 10, 100)}
lasso_model = Lasso()

# The scorer is the negated MAE, so the "best" score is the one closest to 0.
lasso_grid = GridSearchCV(
    lasso_model,
    param_grid=lasso_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2
)
lasso_grid.fit(x_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [87]:
# best_score_ holds the negated MAE, so flip its sign for reporting.
best_lasso_mae = -lasso_grid.best_score_
print("Best Lasso alpha:", lasso_grid.best_params_)
print("Best Lasso MAE on CV:", best_lasso_mae)

Best Lasso alpha: {'alpha': 0.01}
Best Lasso MAE on CV: 2.7855869467218715


In [91]:
 # --- RandomForest ---
# 100 depth-limited trees with a fixed seed, fitted on all training rows.
rf_model = RandomForestRegressor(
    random_state=42,
    max_depth=10,
    n_estimators=100,
)
rf_model.fit(X=x_train, y=y_train)

In [93]:
 #Optional: validation performance
# Hold out 20% of the training rows to estimate out-of-sample error.
X_tr, X_val, Y_tr, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Fit a separate, identically configured forest on the 80% split. Refitting
# `rf_model` here would discard its fit on the full training set, and the
# test predictions made later from `rf_model` would then come from a model
# trained on only 80% of the data.
rf_val_model = RandomForestRegressor(**rf_model.get_params())
rf_val_model.fit(X_tr, Y_tr)
val_preds = rf_val_model.predict(X_val)
print("Random Forest MAE on validation split:", mean_absolute_error(Y_val, val_preds))


Random Forest MAE on validation split: 2.7641291759767617


In [95]:
# --- Score the test rows with the fitted random forest ---
test_preds_rf = rf_model.predict(X=x_test)

In [97]:
# --- Write the submission: the test Ids next to their predicted Hazard ---
submission = test[['Id']].assign(Hazard=test_preds_rf)
submission.to_csv(r'C:/Users/sssso/Downloads/project4_submission.csv', index=False)
print("Saved: project4_submission.csv")

Saved: project4_submission.csv


In [99]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# --- Load data ---
bd_train, bd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'E:/IITK/P4 Data/Hazard_train.csv',
        r'E:/IITK/P4 Data/Hazard_test_share.csv',
    )
)

# --- Object-typed columns of the raw training frame are the categoricals ---
# (collected before the 'data' tag is added, so the tag is not among them)
cat_cols = bd_train.select_dtypes(include=['object']).columns

# --- Tag origin, give test a NaN target placeholder, stack row-wise ---
bd_train['data'] = 'train'
bd_test['data'] = 'test'
bd_test['Hazard'] = np.nan
all_data = pd.concat((bd_train, bd_test), sort=False)

# --- One-hot encode the frequent levels of each categorical column ---
# Levels seen fewer than MIN_LEVEL_COUNT times (train and test combined) get
# no indicator column. value_counts() sorts by descending frequency, so the
# [:-1] slice drops the LEAST frequent of the retained levels (not the most
# common one); that level gets no indicator column of its own either.
MIN_LEVEL_COUNT = 100
for col in cat_cols:
    level_counts = all_data[col].value_counts()
    kept_levels = level_counts[level_counts >= MIN_LEVEL_COUNT].index[:-1]
    for level in kept_levels:
        all_data[f"{col}_{level}"] = (all_data[col] == level).astype(int)
    # The raw string column is no longer needed once its indicators exist.
    del all_data[col]

# --- Prepare train and test sets ---
# Drop the identifier, target and origin tag once, then slice by origin.
feature_frame = all_data.drop(columns=['Id', 'Hazard', 'data'])
train_rows = all_data['data'] == 'train'
test_rows = all_data['data'] == 'test'
x_train = feature_frame[train_rows]
y_train = all_data['Hazard'][train_rows]
x_test = feature_frame[test_rows]

for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

# --- Tune the Lasso penalty with 10-fold CV, scored by negated MAE ---
params = {'alpha': np.linspace(0.01, 10, 100)}
model = Lasso(fit_intercept=True)

search_settings = dict(
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=10,
)
grid_search = GridSearchCV(model, **search_settings)
grid_search.fit(x_train, y_train)

# best_score_ is a negated MAE; abs() turns it back into a positive error.
print("Best Lasso alpha:", grid_search.best_params_)
print("Best Lasso MAE on CV:", abs(grid_search.best_score_))

# --- Predict with the refitted best estimator and save the submission ---
submissions = bd_test[['Id']].assign(Hazard=grid_search.predict(x_test))
submissions.to_csv(r'E:/IITK/P4 Data/benchmark_submission.csv', index=False)
print("Saved: benchmark_submission.csv")


x_train shape: (40799, 91), y_train shape: (40799,)
x_test shape: (10200, 91)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Lasso alpha: {'alpha': 0.01}
Best Lasso MAE on CV: 2.748947530559763
Saved: benchmark_submission.csv


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# --- Load data ---
bd_train, bd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'E:/IITK/P4 Data/Hazard_train.csv',
        r'E:/IITK/P4 Data/Hazard_test_share.csv',
    )
)

# --- Object-typed columns of the raw training frame are the categoricals ---
# (collected before the 'data' tag is added, so the tag is not among them)
cat_cols = bd_train.select_dtypes(include=['object']).columns

# --- Tag origin, give test a NaN target placeholder, stack row-wise ---
bd_train['data'] = 'train'
bd_test['data'] = 'test'
bd_test['Hazard'] = np.nan
all_data = pd.concat((bd_train, bd_test), sort=False)

# --- One-hot encode the frequent levels of each categorical column ---
# Levels seen fewer than MIN_LEVEL_COUNT times (train and test combined) get
# no indicator column. value_counts() sorts by descending frequency, so the
# [:-1] slice drops the LEAST frequent of the retained levels (not the most
# frequent one); that level gets no indicator column of its own either.
MIN_LEVEL_COUNT = 100
for col in cat_cols:
    level_counts = all_data[col].value_counts()
    kept_levels = level_counts[level_counts >= MIN_LEVEL_COUNT].index[:-1]
    for level in kept_levels:
        all_data[f"{col}_{level}"] = (all_data[col] == level).astype(int)
    # The raw string column is no longer needed once its indicators exist.
    del all_data[col]

# --- Split train/test ---
# Drop the identifier, target and origin tag once, then slice by origin.
feature_frame = all_data.drop(columns=['Id', 'Hazard', 'data'])
train_rows = all_data['data'] == 'train'
test_rows = all_data['data'] == 'test'
x_train = feature_frame[train_rows]
y_train = all_data['Hazard'][train_rows]
x_test = feature_frame[test_rows]

for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

# --- Tune the Lasso penalty with 10-fold CV, scored by negated MAE ---
params = {'alpha': np.linspace(0.01, 10, 100)}
model = Lasso(fit_intercept=True)

search_settings = dict(
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=10,
)
grid_search = GridSearchCV(model, **search_settings)
grid_search.fit(x_train, y_train)

# --- Report the winning alpha and its (positive) cross-validated MAE ---
best_mae = abs(grid_search.best_score_)
print("Best Lasso alpha:", grid_search.best_params_)
print("Best Lasso MAE on CV:", best_mae)

# Score as printed in the message below: 1 - MAE / 5.4.
project_score = 1 - (best_mae / 5.4)
print("Project Score = 1 - (MAE/5.4):", project_score)



x_train shape: (40799, 91), y_train shape: (40799,)
x_test shape: (10200, 91)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Lasso alpha: {'alpha': 0.01}
Best Lasso MAE on CV: 2.748947530559763
Project Score = 1 - (MAE/5.4): 0.49093564248893284


NameError: name 'submissions' is not defined

In [3]:
# --- Predict with the refitted best estimator and save the submission ---
submission = bd_test[['Id']].assign(Hazard=grid_search.predict(x_test))
submission.to_csv(r'E:/IITK/P4 Data/benchmark_submission.csv', index=False)
print("Saved: benchmark_submission.csv")

Saved: benchmark_submission.csv


In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# --- Load data ---
bd_train, bd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'E:/IITK/P4 Data/Hazard_train.csv',
        r'E:/IITK/P4 Data/Hazard_test_share.csv',
    )
)

# --- Object-typed columns of the raw training frame are the categoricals ---
# (collected before the 'data' tag is added, so the tag is not among them)
cat_cols = bd_train.select_dtypes(include=['object']).columns

# --- Tag origin, give test a NaN target placeholder, stack row-wise ---
bd_train['data'] = 'train'
bd_test['data'] = 'test'
bd_test['Hazard'] = np.nan
all_data = pd.concat((bd_train, bd_test), sort=False)

# --- One-hot encode the frequent levels of each categorical column ---
# Only levels with at least MIN_LEVEL_COUNT rows (train and test combined)
# are considered. value_counts() sorts by descending frequency, so [:-1]
# leaves out the least frequent of those retained levels.
MIN_LEVEL_COUNT = 100
for col in cat_cols:
    level_counts = all_data[col].value_counts()
    kept_levels = level_counts[level_counts >= MIN_LEVEL_COUNT].index[:-1]
    for level in kept_levels:
        all_data[f"{col}_{level}"] = (all_data[col] == level).astype(int)
    # The raw string column is no longer needed once its indicators exist.
    del all_data[col]

# --- Split train/test ---
# Drop the identifier, target and origin tag once, then slice by origin.
feature_frame = all_data.drop(columns=['Id', 'Hazard', 'data'])
train_rows = all_data['data'] == 'train'
test_rows = all_data['data'] == 'test'
x_train = feature_frame[train_rows]
y_train = all_data['Hazard'][train_rows]
x_test = feature_frame[test_rows]

for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

# --- Histogram gradient boosting with Poisson loss, scored by 5-fold CV ---
model = HistGradientBoostingRegressor(loss='poisson', max_iter=300, random_state=42)

# The scorer yields one negated MAE per fold; negate to get positive MAEs.
neg_mae_per_fold = cross_val_score(
    model,
    x_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
mae_scores = -neg_mae_per_fold

mean_mae = mae_scores.mean()
print(f"Mean CV MAE: {mean_mae}")

# Score as printed in the message below: 1 - MAE / 5.4.
project_score = 1 - (mean_mae / 5.4)
print(f"Project Score = 1 - (MAE/5.4): {project_score}")

# --- Fit on full training data ---
model.fit(x_train, y_train)

# --- Predict and save submission ---
submission = pd.DataFrame({
    'Id': bd_test['Id'],
    'Hazard': model.predict(x_test)
})

# Keep the path in one variable and derive the log message from it, so the
# reported file name cannot drift from the file actually written (the path
# previously read 'benchmark_submissio2.csv' while the message claimed
# 'benchmark_submission2.csv').
submission_path = r'E:/IITK/P4 Data/benchmark_submission2.csv'
submission.to_csv(submission_path, index=False)
print(f"Saved: {submission_path.rsplit('/', 1)[-1]}")

x_train shape: (40799, 91), y_train shape: (40799,)
x_test shape: (10200, 91)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.7s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.1s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.4s finished


Mean CV MAE: 2.7092278200963507
Project Score = 1 - (MAE/5.4): 0.4982911444266017
Saved: benchmark_submission2.csv


In [9]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# Pinned to the version this notebook's captured output shows being installed.
%pip install xgboost==3.0.2

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 281.8 kB/s eta 0:08:52
   ---------------------------------------- 0.2/150.0 MB 919.0 kB/s eta 0:02:44
   ---------------------------------------- 0.5/150.0 MB 2.2 MB/s eta 0:01:10
   ---------------------------------------- 1.2/150.0 MB 4.8 MB/s eta 0:00:32
    --------------------------------------- 2.1/150.0 MB 7.4 MB/s eta 0:00:20
    --------------------------------------- 3.0/150.0 MB 8.3 MB/s eta 0:00:18
   - -------------------------------------- 5.8/150.0 MB 14.3 MB/s eta 0:00:11
   -- ------------------------------------- 11.0/150.0 MB 46.7 MB/s eta 0:00:03
   ---- 

In [29]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# --- Load data ---
bd_train, bd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'E:/IITK/P4 Data/Hazard_train.csv',
        r'E:/IITK/P4 Data/Hazard_test_share.csv',
    )
)

# --- Object-typed columns of the raw training frame are the categoricals ---
# (collected before the 'data' tag is added, so the tag is not among them)
cat_cols = bd_train.select_dtypes(include=['object']).columns

# --- Tag origin, give test a NaN target placeholder, stack row-wise ---
bd_train['data'] = 'train'
bd_test['data'] = 'test'
bd_test['Hazard'] = np.nan
all_data = pd.concat((bd_train, bd_test), sort=False)

# --- One-hot encode the frequent levels of each categorical column ---
# Only levels with at least MIN_LEVEL_COUNT rows (train and test combined)
# are considered. value_counts() sorts by descending frequency, so [:-1]
# leaves out the least frequent of those retained levels.
MIN_LEVEL_COUNT = 100
for col in cat_cols:
    level_counts = all_data[col].value_counts()
    kept_levels = level_counts[level_counts >= MIN_LEVEL_COUNT].index[:-1]
    for level in kept_levels:
        all_data[f"{col}_{level}"] = (all_data[col] == level).astype(int)
    # The raw string column is no longer needed once its indicators exist.
    del all_data[col]

# --- Split ---
# Drop the identifier, target and origin tag once, then slice by origin.
feature_frame = all_data.drop(columns=['Id', 'Hazard', 'data'])
train_rows = all_data['data'] == 'train'
test_rows = all_data['data'] == 'test'
x_train = feature_frame[train_rows]
y_train = all_data['Hazard'][train_rows]
x_test = feature_frame[test_rows]

for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

# --- XGBoost regressor with a Poisson objective ---
# Many small boosting steps combined with row/column subsampling and
# explicit regularisation.
xgb_settings = dict(
    objective='count:poisson',
    n_estimators=8000,       # number of boosting rounds
    learning_rate=0.001,     # step size per round
    max_depth=12,            # maximum tree depth
    subsample=0.6,           # row sampling
    colsample_bytree=0.6,    # feature sampling
    reg_alpha=2.0,           # L1 penalty
    reg_lambda=3.0,          # L2 penalty
    min_child_weight=20,     # weight a child node needs before a split
    gamma=2.0,               # split requirement
    random_state=42,
    n_jobs=-1,
)
model = XGBRegressor(**xgb_settings)

# --- Cross-validation MAE ---
# The scorer yields one negated MAE per fold; negate to get positive MAEs.
# NOTE(review): both the model and cross_val_score request n_jobs=-1 --
# confirm the nested parallelism is intended.
neg_mae_per_fold = cross_val_score(
    model,
    x_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
mae_scores = -neg_mae_per_fold

mean_mae = mae_scores.mean()
print(f"Mean CV MAE: {mean_mae}")

# Score as printed in the message below: 1 - MAE / 5.4.
project_score = 1 - (mean_mae / 5.4)
print(f"Project Score = 1 - (MAE/5.4): {project_score}")

# --- Fit full model on every training row ---
model.fit(x_train, y_train)

# --- Build the submission frame: test Ids next to their predictions ---
# The frame is only built here; the CSV is written by the cell that follows.
submission = bd_test[['Id']].assign(Hazard=model.predict(x_test))

x_train shape: (40799, 91), y_train shape: (40799,)
x_test shape: (10200, 91)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  6.3min remaining:  9.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  6.3min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.4min finished


Mean CV MAE: 2.67234060041372
Project Score = 1 - (MAE/5.4): 0.5051221110344963


In [31]:
# Derive the log message from the path so the two cannot disagree (the path
# previously read 'benchmark_submissio3.csv' while the message claimed
# 'benchmark_submission3.csv').
submission_path = r'E:/IITK/P4 Data/benchmark_submission3.csv'
submission.to_csv(submission_path, index=False)
print(f"Saved: {submission_path.rsplit('/', 1)[-1]}")

Saved: benchmark_submission3.csv


In [27]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# --- Load data ---
bd_train, bd_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'E:/IITK/P4 Data/Hazard_train.csv',
        r'E:/IITK/P4 Data/Hazard_test_share.csv',
    )
)

# --- Object-typed columns of the raw training frame are the categoricals ---
# (collected before the 'data' tag is added, so the tag is not among them)
cat_cols = bd_train.select_dtypes(include=['object']).columns

# --- Tag origin, give test a NaN target placeholder, stack row-wise ---
bd_train['data'] = 'train'
bd_test['data'] = 'test'
bd_test['Hazard'] = np.nan
all_data = pd.concat((bd_train, bd_test), sort=False)

# --- Full one-hot encoding: one indicator per level of each categorical ---
all_data = pd.get_dummies(data=all_data, columns=cat_cols)

# --- Split back into train and test rows, then peel off non-features ---
non_feature_cols = ['Id', 'Hazard', 'data']
train_part = all_data[all_data['data'] == 'train']
test_part = all_data[all_data['data'] == 'test']
x_train = train_part.drop(columns=non_feature_cols)
y_train = train_part['Hazard']
x_test = test_part.drop(columns=non_feature_cols)

for shape_line in (
    f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
):
    print(shape_line)

# --- Hold out 20% of the training rows for validation ---
X_tr, X_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# --- XGBoost regressor with a Poisson objective ---
xgb_settings = dict(
    objective='count:poisson',
    n_estimators=5000,
    learning_rate=0.002,
    max_depth=10,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=1.0,
    reg_lambda=2.0,
    min_child_weight=10,
    gamma=1.0,
    random_state=42,
    n_jobs=-1,
)
model = XGBRegressor(**xgb_settings)

# --- Train every boosting round; the validation set is only monitored ---
# (its metric is logged every 100 rounds; no early stopping is configured)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=100)

# --- Validation MAE and the score derived from it ---
val_pred = model.predict(X_val)
val_mae = (y_val - val_pred).abs().mean()
print(f"Validation MAE: {val_mae}")
project_score = 1 - (val_mae / 5.4)
print(f"Validation Project Score = 1 - (MAE/5.4): {project_score}")

# --- Refit the same estimator on every training row ---
model.fit(x_train, y_train, verbose=False)

# --- Predict the test rows and write the submission ---
submission = bd_test[['Id']].assign(Hazard=model.predict(x_test))
submission.to_csv(r'E:/IITK/P4 Data/final_xgb_full_dummies.csv', index=False)
print("Saved: final_xgb_full_dummies.csv")


x_train shape: (40799, 111), y_train shape: (40799,)
x_test shape: (10200, 111)
[0]	validation_0-poisson-nloglik:3.01539
[100]	validation_0-poisson-nloglik:2.99084
[200]	validation_0-poisson-nloglik:2.96910
[300]	validation_0-poisson-nloglik:2.95029
[400]	validation_0-poisson-nloglik:2.93409
[500]	validation_0-poisson-nloglik:2.92005
[600]	validation_0-poisson-nloglik:2.90800
[700]	validation_0-poisson-nloglik:2.89705
[800]	validation_0-poisson-nloglik:2.88759
[900]	validation_0-poisson-nloglik:2.87968
[1000]	validation_0-poisson-nloglik:2.87266
[1100]	validation_0-poisson-nloglik:2.86637
[1200]	validation_0-poisson-nloglik:2.86069
[1300]	validation_0-poisson-nloglik:2.85560
[1400]	validation_0-poisson-nloglik:2.85093
[1500]	validation_0-poisson-nloglik:2.84692
[1600]	validation_0-poisson-nloglik:2.84335
[1700]	validation_0-poisson-nloglik:2.84018
[1800]	validation_0-poisson-nloglik:2.83730
[1900]	validation_0-poisson-nloglik:2.83463
[2000]	validation_0-poisson-nloglik:2.83238
[2100]	v