In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Read the training and test CSV files into DataFrames
train_data = pd.read_csv(filepath_or_buffer='./Downloads/open/train.csv')
test_data = pd.read_csv(filepath_or_buffer='./Downloads/open/test.csv')

In [3]:
# Pull out the 'Income' target, then strip the target and the 'ID'
# column from the training features; the test frame only loses 'ID'
y = train_data['Income']
X = train_data.drop(columns=['Income', 'ID'])
X_test = test_data.drop(columns='ID')

In [4]:
# Feature names grouped by dtype: int64/float64 columns are treated as
# numerical, object-dtype columns as categorical
numerical_cols = X.select_dtypes(include=('int64', 'float64')).columns
categorical_cols = X.select_dtypes(include='object').columns

In [5]:
class CustomFeatureGenerator(BaseEstimator, TransformerMixin):
    """Pass-through transformer reserved for hand-written feature engineering.

    As written it learns nothing in ``fit`` and hands its input back
    unchanged from ``transform``; the commented line in ``transform`` shows
    where a derived feature could be added.
    """

    def __init__(self):
        """Take no hyperparameters."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Return ``X`` itself (the same object, not a copy)."""
        # Placeholder for a custom feature, e.g.:
        # X['new_feature'] = X['existing_feature1'] / (X['existing_feature2'] + 1)
        # NOTE(review): the example indexes by column name, so it presumably
        # expects a DataFrame input - confirm before enabling it.
        return X

In [6]:
# Numerical branch, applied in order: median imputation, RobustScaler,
# degree-2 polynomial expansion without a bias column, then the custom
# feature hook
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False, degree=2)),
    ('custom_features', CustomFeatureGenerator()),
])

In [7]:
# Categorical branch, applied in order: fill missing values with the most
# frequent category, then one-hot encode (handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Create a pipeline that combines the preprocessor with the model.
#
# NOTE(review): no cell in this notebook defines `preprocessor` or `model`
# (the execution counts jump from In [7] to In [10]), so this cell raised
# NameError on a fresh kernel. Both are rebuilt here from the transformers
# and column lists defined above and from the estimators the notebook
# already imports. The original settings are unknown - confirm the
# hyperparameters before trusting the RMSE printed further down.

# Route each column group through its own preprocessing branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fixed seed so repeated runs give the same model
model = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

In [11]:
# Five shuffled folds with a fixed seed for the cross-validation split
cv = KFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

In [12]:
# Cross-validate the whole pipeline on the training data; the scorer name
# indicates the per-fold values are negated RMSE
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    cv=cv,
    scoring='neg_root_mean_squared_error',
)

In [13]:
# Flip the sign of the mean score so the reported RMSE is positive
mean_rmse = -scores.mean()
print("Average RMSE score:", mean_rmse)

Average RMSE score: 602.657776012031


In [14]:
# Fit the pipeline on all training rows, then predict for the test
# features; fit() hands back the pipeline itself, so the calls are chained
predictions = pipeline.fit(X, y).predict(X_test)

In [16]:
# Build the submission table (test IDs alongside the predicted Income)
# and write it to CSV without the index column
output = pd.DataFrame({'ID': test_data['ID']}).assign(Income=predictions)
output.to_csv('./Downloads/open/final_submission.csv', index=False)