## Step 1: Data Loading & Initial Exploration


In [2]:
import pandas as pd
import numpy as np

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(train_df.shape)
print(test_df.shape)
train_df.head()

(4209, 378)
(4209, 377)


Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


### Step 2: Data Cleaning and Pre-processing

In [3]:
# Remove columns with zero variance (constant columns)
zero_var_cols = [col for col in train_df.columns if train_df[col].nunique() == 1]
print("Zero variance columns:", zero_var_cols)

train_df.drop(zero_var_cols, axis=1, inplace=True)
test_df.drop(zero_var_cols, axis=1, inplace=True)

Zero variance columns: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [4]:
# Null check
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Unique value check
for col in train_df.columns:
	print(f"{col}: {train_df[col].nunique()} unique values")


ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64
ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 365, dtype: int64
ID: 4209 unique values
y: 2545 unique values
X0: 47 unique values
X1: 27 unique values
X2: 44 unique values
X3: 7 unique values
X4: 4 unique values
X5: 29 unique values
X6: 12 unique values
X8: 25 unique values
X10: 2 unique values
X12: 2 unique values
X13: 2 unique values
X14: 2 unique values
X15: 2 unique values
X16: 2 unique values
X17: 2 unique values
X18: 2 unique values
X19: 2 unique values
X20: 2 unique values
X21: 2 unique values
X22: 2 unique values
X23: 2 unique values
X24: 2 unique values
X26: 2 unique values
X27: 2 unique values
X28: 2 unique values
X29: 2 unique values
X30: 2 unique values
X31: 2 unique values
X32: 2 unique values
X33: 2 unique values
X34: 2 unique values
X35: 2 unique values
X36: 2 unique

### Step 3: Feature and Target Split

In [5]:
# Impute 
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

In [6]:
from sklearn.preprocessing import LabelEncoder

cat_cols = train_df.select_dtypes(include=['object']).columns

for col in cat_cols:
	le = LabelEncoder()
	all_vals = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
	le.fit(all_vals)
	train_df[col] = le.transform(train_df[col].astype(str))
	test_df[col] = le.transform(test_df[col].astype(str))

In [7]:
X = train_df.drop(['ID', 'y'], axis=1)  # 'y' is the target, 'ID' is identifier
y = train_df['y']
X_test = test_df.drop(['ID'], axis=1)


In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# This function calculates VIF
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
    return vif_data

# It's a good practice to do this after label encoding and before scaling
# Assuming 'X' is your dataframe of features after label encoding

# Calculate VIF
vif_df = calculate_vif(X)
print(vif_df.sort_values(by='VIF', ascending=False))


  vif = 1. / (1. - r_squared_i)


    feature       VIF
363    X385       inf
360    X382       inf
359    X380       inf
358    X379       inf
357    X378       inf
..      ...       ...
274    X288  1.096771
38      X42  1.089945
180    X190  1.054119
312    X332  1.038551
4        X4  1.032148

[364 rows x 2 columns]


In [9]:
X = train_df.drop(['ID', 'y'], axis=1)  # 'y' is the target, 'ID' is identifier
y = train_df['y']
X_test = test_df.drop(['ID'], axis=1)

### Step 4: Dimensionality Reduction with PCA\n

In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=0.95, random_state=42)  # retain 95% variance
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"PCA reduced dimensions: {X_pca.shape[1]}")

PCA reduced dimensions: 148


### Step 5: Modeling and Hyperparameter Tuning\n

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

xgb = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=42, n_jobs=-1)

# Use negative MAE because scikit-learn scorers are maximizers
scores = cross_val_score(xgb, X_pca, y, cv=5, scoring='neg_mean_absolute_error')

print(f"Cross-Validation MAE: {-scores.mean():.4f}")

# Fit on the full training data for final prediction
xgb.fit(X_pca, y)

Cross-Validation MAE: 6.0322


#### Step 6: Hyperparameter Tuning & Final Model Training

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Define the parameter distribution to search
# These ranges are a good starting point
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4), # Range is start, (end-start)
    'colsample_bytree': uniform(0.6, 0.4)
}

# Initialize the XGBoost Regressor
xgb = XGBRegressor(random_state=42, n_jobs=-1)

# Set up RandomizedSearchCV
# n_iter controls how many different combinations to try
# cv is the number of cross-validation folds
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,  # You can increase this for a more thorough search
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the random search to the data (this will take some time)
random_search.fit(X_pca, y)

# Print the best parameters and the corresponding score
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best CV MAE: {-random_search.best_score_:.4f}")

# The best estimator is already fitted on the full training data
best_model = random_search.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found: {'colsample_bytree': np.float64(0.9332779646944658), 'learning_rate': np.float64(0.062009396052331626), 'max_depth': 3, 'n_estimators': 661, 'subsample': np.float64(0.8650089137415928)}
Best CV MAE: 5.9770


### Step 7. Generate Predictions and Submission File ---

In [13]:
# --- 8. Generate Predictions and Submission File ---
y_pred = best_model.predict(X_test_pca)

submission = pd.DataFrame({'ID': test_df['ID'], 'y': y_pred})
submission.to_csv('tuned_submission.csv', index=False)

print("Tuned submission file created successfully!")

Tuned submission file created successfully!
