In [12]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_absolute_error

# # Load dataset
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
# columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole Weight', 
#            'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Rings']
# data = pd.read_csv(url, header=None, names=columns)

# # Feature Engineering
# data['Age'] = data['Rings'] + 1.5
# data = data.drop(['Rings'], axis=1)

# # Split the data into features and target
# X = data.drop(['Age'], axis=1)
# y = data['Age']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Preprocessing: OneHotEncode categorical variables and Standardize numerical variables
# numeric_features = ['Length', 'Diameter', 'Height', 'Whole Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
# categorical_features = ['Sex']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_features),
#         ('cat', OneHotEncoder(), categorical_features)])

# # Model pipeline
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('regressor', XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42))])

# # Train the model using cross-validation
# cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
# print(f'Cross-validated MAE: {-np.mean(cv_scores):.4f}')

# # Fit the model on the full training data
# model.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = model.predict(X_test)

# # Evaluate the model
# mae = mean_absolute_error(y_test, y_pred)
# print(f'Test MAE: {mae:.4f}')


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.impute import SimpleImputer


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import *
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
# # Feature Engineering Function
# def feature_funk(df):
#     df['Surface Area'] = 2 * (df['Length'] * df['Diameter'] + df['Length'] * df['Height'] + df['Diameter'] * df['Height'])
#     df['Vol'] = df['Length'] * df['Diameter'] * df['Height']
#     df['Len_to_Dia_Ratio'] = df['Length'] / df['Diameter']
#     df['Hgt_to_Wt_Ratio'] = df['Height'] / df['Weight']
#     df['Sex'] = df['Sex'].map({'I': 1, 'F': 3, 'M': 2})
#     return df

# # Load Data and Apply Feature Engineering
# train_df = feature_funk(pd.read_csv('train.csv'))
# X = train_df.drop(columns=['id', 'Age'])
# y = train_df['Age']

# # Preprocessing Pipeline
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', Pipeline([
#             ('imputer', SimpleImputer(strategy='mean')),
#             ('scaler', RobustScaler())
#         ]), X.columns)
#     ])

# # Simple Model Pipeline with Huber Regressor
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', HuberRegressor(alpha=0.001, max_iter=100000))
# ])

# # Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2)

# # Cross-Validation
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')

# # Fit Model and Evaluate
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)

# # Evaluation Metrics
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)

# # Output Results
# print(f'CV MAE: {-np.mean(cv_scores):.4f}')
# print(f'Test MAE: {mae:.4f}')
# print(f'Test MSE: {mse:.4f}')
# print(f'Test R^2 Score: {r2:.4f}')


In [15]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from xgboost import XGBRegressor

# Feature Engineering Function
def feature_funk(df):
    """Return a copy of ``df`` with derived size features and a numeric ``Sex`` code.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Length', 'Diameter', 'Height', 'Weight'
        and 'Sex'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added or replaced:

        * 'Surface Area'     -- 2 * (L*D + L*H + D*H)
        * 'Vol'              -- L * D * H
        * 'Len_to_Dia_Ratio' -- Length / Diameter
        * 'Hgt_to_Wt_Ratio'  -- Height / Weight
        * 'Sex'              -- 'I' -> 1, 'M' -> 2, 'F' -> 3; any other
          label becomes NaN (behaviour of ``Series.map`` with a dict).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is left untouched and calling the
    # function twice on the same input gives the same result.
    out = df.copy()
    length, diameter, height = out['Length'], out['Diameter'], out['Height']

    out['Surface Area'] = 2 * (length * diameter + length * height + diameter * height)
    out['Vol'] = length * diameter * height

    # A zero denominator produces +/-inf. Convert it to NaN: a missing value
    # can be filled by an imputer, whereas inf is rejected by scikit-learn's
    # input validation.
    non_finite = [np.inf, -np.inf]
    out['Len_to_Dia_Ratio'] = (length / diameter).replace(non_finite, np.nan)
    out['Hgt_to_Wt_Ratio'] = (height / out['Weight']).replace(non_finite, np.nan)

    out['Sex'] = out['Sex'].map({'I': 1, 'F': 3, 'M': 2})
    return out

# Read the training CSV and run it through the feature-engineering step
train_df = pd.read_csv('train.csv').pipe(feature_funk)

# Target is 'Age'; every remaining column except the row id is a predictor
y = train_df['Age']
X = train_df.drop(columns=['id', 'Age'])

# Numeric preprocessing applied to all predictor columns:
# mean-impute -> degree-2 polynomial expansion (no bias column) -> standardise
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer(transformers=[('num', numeric_steps, X.columns)])

# Model Pipeline with Hyperparameter Tuning (randomized search)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42))
])

# Candidate values per hyperparameter. The full Cartesian product is
# 2 * 3 * 3 * 2 * 2 * 3 * 3 = 648 combinations, i.e. 3,240 fits with 5 folds,
# so a fixed-size random sample of the combinations is evaluated instead of
# all of them.
# `reg_alpha` / `reg_lambda` are the scikit-learn wrapper's parameter names
# for the L1 / L2 regularisation terms.
param_grid = {
    'model__n_estimators': [300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__reg_alpha': [0, 0.1, 1],
    'model__reg_lambda': [1, 1.5, 2]
}

# Number of sampled combinations; raise it for a more thorough (slower) search.
N_SEARCH_ITER = 40

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Cross-Validation: shuffled, seeded folds, handed to the search below so
# that they are actually the folds being scored.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The variable keeps the name `grid_search` so any later cell that refers to
# it continues to work.
grid_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=N_SEARCH_ITER,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42,  # reproducible choice of sampled combinations
    verbose=1,        # report the total number of fits up front
)
grid_search.fit(X_train, y_train)

# Best Parameters
print(f'Best Parameters: {grid_search.best_params_}')

# Evaluate the Model (the search refits the best pipeline on all of X_train)
y_pred = grid_search.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Output Results
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'), hence the sign flip.
print(f'Improved CV MAE: {-grid_search.best_score_:.4f}')
print(f'Test MAE: {mae:.4f}')
print(f'Test MSE: {mse:.4f}')
print(f'Test R^2 Score: {r2:.4f}')


KeyboardInterrupt: 