In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, ElasticNet, RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')


In [2]:

df = pd.read_csv('train.csv')
df.head()


Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,0.55,0.4125,0.1625,1.715145,0.609514,0.396893,0.56699,4.0
1,1,F,1.5125,1.2125,0.4,31.312023,13.395139,6.265239,8.930093,10.0
2,2,F,1.5125,1.175,0.4125,31.552993,14.670866,6.279414,9.922325,11.0
3,3,I,0.8,0.6,0.2,4.620969,3.019222,0.978058,1.417475,7.0
4,4,I,1.3875,1.0875,0.3625,24.323871,11.651644,5.712424,5.386405,8.0


In [3]:
df_test = pd.read_csv('test.csv')
df.head()


Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,0.55,0.4125,0.1625,1.715145,0.609514,0.396893,0.56699,4.0
1,1,F,1.5125,1.2125,0.4,31.312023,13.395139,6.265239,8.930093,10.0
2,2,F,1.5125,1.175,0.4125,31.552993,14.670866,6.279414,9.922325,11.0
3,3,I,0.8,0.6,0.2,4.620969,3.019222,0.978058,1.417475,7.0
4,4,I,1.3875,1.0875,0.3625,24.323871,11.651644,5.712424,5.386405,8.0


In [4]:

df = df[df['Sex'] != 'Diameter']

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']),
        ('cat', OneHotEncoder(), ['Sex'])
    ]
)


In [5]:

# Define the pipelines with StandardScaler added
linear_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])


huber_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('huber', HuberRegressor())
])

ransac_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('ransac', RANSACRegressor())
])


In [6]:


# Split the data
X = df.drop(columns=['Age'])
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('huber', huber_pipeline),
    ],
    final_estimator=HuberRegressor(max_iter=1000, epsilon=1.2, alpha=0.01),
    cv=5
)


In [7]:

# Fit the stacking model
stacking_regressor.fit(X_train, y_train)

# Predict
y_pred_test = stacking_regressor.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred_test)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 1.273107641933035


In [8]:
# to file csv
import numpy as np
y_pred_final = stacking_regressor.predict(df_test)
y_pred_final = stacking_regressor.predict(df_test)

y_pred_final = np.round(y_pred_final,1)

sub = pd.read_csv('sample_submission.csv')

sub['Age'] = y_pred_final

sub.to_csv('HAQNAZAR.csv', index=False)
sub

Unnamed: 0,id,Age
0,15000,6.3
1,15001,9.0
2,15002,5.5
3,15003,8.2
4,15004,6.4
...,...,...
9995,24995,8.7
9996,24996,7.9
9997,24997,10.3
9998,24998,9.1
