In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---- ----------------------------------- 8.9/72.0 MB 46.3 MB/s eta 0:00:02
   ---------- ----------------------------- 18.1/72.0 MB 45.7 MB/s eta 0:00:02
   ------------- -------------------------- 24.6/72.0 MB 41.1 MB/s eta 0:00:02
   -------------- ------------------------- 26.0/72.0 MB 32.9 MB/s eta 0:00:02
   --------------- ------------------------ 27.8/72.0 MB 27.1 MB/s eta 0:00:02
   ---------------- ----------------------- 29.6/72.0 MB 23.8 MB/s eta 0:00:02
   ----------------- ---------------------- 31.5/72.0 MB 21.7 MB/s eta 0:00:02
   ------------------ --------------------- 33.6/72.0 MB 20.3 MB/s eta 0:00:02
   ------------------- -------------------- 35.7/72.0 MB 18.9 MB/s eta 0:00:02
   -------------------- ------------------- 37.5/72.0 MB 18.1 MB/s eta 0:

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb

In [4]:
# Read the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Show (rows, columns) for each frame as a quick sanity check.
print(train_df.shape, test_df.shape)

(4209, 378) (4209, 377)


In [5]:
# Per-column count of missing values, training frame first, then test frame.
# Each heading and its table are emitted by one print call, separated by a
# newline, which yields the same text as printing them one after the other.
print("Null values in Train:", train_df.isna().sum(), sep="\n")

print("\nNull values in Test:", test_df.isna().sum(), sep="\n")

Null values in Train:
ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 378, dtype: int64

Null values in Test:
ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 377, dtype: int64


In [6]:
# Number of distinct values in every column of each frame; columns with a
# single distinct value carry no information for a model.
print(f"\nUnique values in Train:\n{train_df.nunique()}")

print(f"\nUnique values in Test:\n{test_df.nunique()}")


Unique values in Train:
ID      4209
y       2545
X0        47
X1        27
X2        44
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 378, dtype: int64

Unique values in Test:
ID      4209
X0        49
X1        27
X2        45
X3         7
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 377, dtype: int64


In [7]:
# Columns that hold exactly one distinct value in the training frame.
# The decision is made on the training data only and then applied to both
# frames so that they keep the same set of feature columns.
is_constant = (train_df.nunique() == 1).to_numpy()
zero_var_cols = train_df.columns[is_constant].tolist()

print("Zero Variance Columns:", zero_var_cols)

# Remove the constant columns from both frames.
train_df = train_df.drop(zero_var_cols, axis=1)
test_df = test_df.drop(zero_var_cols, axis=1)

Zero Variance Columns: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [8]:
# Separate the regression target from the feature columns.
# NOTE(review): only "y" is removed, so the "ID" column stays in X (and in
# test_data) as a feature — confirm that this is intended.
y = train_df["y"]
X = train_df.drop(columns=["y"])

# Work on a copy so the loaded test frame itself is left untouched.
test_data = test_df.copy()

In [13]:
# Integer-encode every string-valued feature column.
# The encoder is fitted on the training values; categories that occur only in
# the test column are appended to its label table so that transforming the
# test column does not raise on labels the encoder has never seen.
for col in X.columns:
    if X[col].dtype == "object":
        le = LabelEncoder()
        le.fit(X[col])

        # Categories present in the test column but absent from training.
        # They are sorted before being appended: iterating a set of strings
        # yields an order that can differ between Python processes (string
        # hash randomisation), which would give these categories different
        # integer codes from one run of the notebook to the next.
        unseen = sorted(set(test_data[col]) - set(le.classes_), key=str)

        # Extend the label table; the appended labels get the codes that
        # follow the last training label.
        if unseen:
            le.classes_ = np.append(le.classes_, unseen)

        # Replace the string columns with their integer codes in both frames.
        X[col] = le.transform(X[col])
        test_data[col] = le.transform(test_data[col])

In [16]:
# Reduce the encoded feature matrix to 50 principal components.
#
# The string columns are integer-encoded by the encoding cell above, so that
# loop is not repeated here (it was a no-op once the columns were numeric).
# The check below fails fast, with a readable message, if this cell is run
# before the encoding cell.
non_numeric = X.select_dtypes(exclude="number").columns.tolist()
assert not non_numeric, f"Run the encoding cell first; non-numeric columns: {non_numeric}"

pca = PCA(n_components=50, random_state=42)

# Fit the projection on the training features only, then apply the same
# projection to the test features.
# NOTE(review): the projection is fitted on all of X before the
# train/validation split, and no scaling step is visible before this point —
# confirm both are intended.
X_pca = pca.fit_transform(X)
test_pca = pca.transform(test_data)

print("PCA shape:", X_pca.shape)

PCA shape: (4209, 50)


In [17]:
# Hold out 20% of the PCA-transformed rows for validation; the fixed seed
# keeps the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, random_state=42, test_size=0.2)


In [18]:
# Gradient-boosted tree regressor. Rows and columns are both subsampled at
# 70% per tree, and the seed fixes that sampling between runs.
model = xgb.XGBRegressor(
    **{
        "n_estimators": 600,
        "learning_rate": 0.05,
        "max_depth": 5,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "random_state": 42,
    }
)

# Train on the training portion of the split only.
model.fit(X_train, y_train)

In [19]:
# Score the fitted model on the held-out validation rows.
y_pred = model.predict(X_val)
print("R2 Score:", r2_score(y_true=y_val, y_pred=y_pred))

R2 Score: 0.4578953448896732


In [20]:
# Predict the target for the PCA-projected test rows and preview the first ten.
test_predictions = model.predict(test_pca)
print(test_predictions[0:10])

[ 98.00704   96.926346  92.4785   100.56516  109.20572   94.9303
 122.90464  102.31131  109.06364  102.43994 ]
