In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, RFECV
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor


In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,0.55,0.4125,0.1625,1.715145,0.609514,0.396893,0.56699,4.0
1,1,F,1.5125,1.2125,0.4,31.312023,13.395139,6.265239,8.930093,10.0
2,2,F,1.5125,1.175,0.4125,31.552993,14.670866,6.279414,9.922325,11.0
3,3,I,0.8,0.6,0.2,4.620969,3.019222,0.978058,1.417475,7.0
4,4,I,1.3875,1.0875,0.3625,24.323871,11.651644,5.712424,5.386405,8.0


In [3]:
df['Sex'].value_counts()

Sex
M           5387
I           5050
F           4562
Diameter       1
Name: count, dtype: int64

In [4]:
df_test = pd.read_csv('test.csv')
df_test

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,15000,I,0.8625,0.6500,0.2250,5.854172,2.721552,1.048931,1.743494
1,15001,F,1.2875,1.0000,0.3250,20.326591,9.412034,4.578444,5.244657
2,15002,I,0.7000,0.5250,0.1500,2.820775,1.091456,0.666213,0.850485
3,15003,F,1.2625,0.9625,0.3375,18.710670,9.908150,4.521745,4.677668
4,15004,I,0.9125,0.6625,0.2250,5.060386,2.197086,1.176504,1.700970
...,...,...,...,...,...,...,...,...,...
9995,24995,I,1.3000,1.0000,0.3125,18.710670,8.348928,3.713785,5.386405
9996,24996,I,1.2500,0.9375,0.2750,14.755915,6.520385,2.849125,4.195726
9997,24997,M,1.5125,1.2625,0.4375,40.837455,18.866592,10.531839,9.780577
9998,24998,M,1.4500,1.0625,0.3375,28.689694,14.231449,5.570677,7.087375


In [5]:
df['Age'].describe()

count    15000.000000
mean         9.929533
std          3.206668
min          1.000000
25%          8.000000
50%          9.000000
75%         11.000000
max         29.000000
Name: Age, dtype: float64

In [6]:
label_encoder = LabelEncoder()

df['Sex_num'] = label_encoder.fit_transform(df['Sex'])
df_test['Sex_num'] = label_encoder.fit_transform(df_test['Sex'])

In [7]:
col = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Sex_num']
X = df[col]
y = df['Age']

liner_model = LinearRegression()
liner_model.fit(X, y)

X_test = df_test[col]
y_pred_liner = liner_model.predict(X_test)
y_pred_liner.mean()

np.float64(9.935198086255618)

In [8]:
col = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
X = df[col]
y = df['Age']

# Model yaratish
linear_model = LinearRegression()

# Kross-validatsiyani bajarish (5-fold)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(linear_model, X, y, cv=cv, scoring='neg_mean_absolute_error')

# Natijalarni ko'rsatish
print(f"Kross-validatsiya MAE (5-fold): {np.abs(cv_scores)}")
print(f"O'rtacha MAE: {np.abs(cv_scores).mean()}")

# Modelni o'qitish va test ma'lumotlar to'plamida bashorat qilish
linear_model.fit(X, y)

X_test = df_test[col]
y_pred = linear_model.predict(X_test)
y_pred.mean()

Kross-validatsiya MAE (5-fold): [1.38984797 1.44938118 1.43860303 1.41284078 1.44495207]
O'rtacha MAE: 1.4271250075410908


np.float64(9.89853998588371)

## Ridge

In [9]:
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X, y)

y_pred_ridge = ridge_model.predict(X_test)
y_pred_ridge.mean()

np.float64(9.898577502796545)

## Lasso

In [10]:
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X, y)

y_pred_lasso = lasso_model.predict(X_test)
y_pred_lasso.mean()


np.float64(9.890903530440253)

## Huber

In [11]:
huber_model = HuberRegressor()
huber_model.fit(X, y)

y_pred_huber = huber_model.predict(X_test)
y_pred_huber.mean()

np.float64(9.576665517109022)

## Ransac

In [12]:
ransac_model = RANSACRegressor()

ransac_model.fit(X, y)

y_pred_ransac = ransac_model.predict(X_test)
y_pred_ransac.mean()

np.float64(9.660844409104048)

## Elastic

In [13]:
elastic_net_model = ElasticNet(alpha=1.0, l1_ratio=0.5)

elastic_net_model.fit(X, y)

y_pred_elastic_net = elastic_net_model.predict(X_test)
y_pred_elastic_net.mean()

np.float64(9.889452684333687)

## Ansambl

In [14]:
linear_pipeline = Pipeline([('liner_model', LinearRegression())])
ridge_pipeline = Pipeline([('ridge_model', Ridge(alpha=1.0))])
lasso_pipeline = Pipeline([('lasso_model', Lasso(alpha=0.1))])
huber_pipeline = Pipeline([('huber_model', HuberRegressor())])
ransac_pipeline = Pipeline([('ransacr_model', RANSACRegressor())])
elastic_pipeline = Pipeline([('elastic_model', ElasticNet())])

# Define the meta-model
meta_model = HuberRegressor()

# Create the stacking regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('ridge', ridge_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('ransac', ransac_pipeline),
        ('elastic', elastic_pipeline)
    ],
    final_estimator=meta_model,
    cv=5
)

# Train the stacking model
stacking_regressor.fit(X, y)

y_pred_stec = stacking_regressor.predict(X_test)
y_pred_stec.mean()
y_pred_stec

array([6.56723878, 8.5364959 , 5.48667222, ..., 9.80264395, 8.54309607,
       8.06313902])

## Huber * Polinomial

In [15]:
poly_degree = 2  # Polinomial daraja
polynomial_features = PolynomialFeatures(degree=poly_degree)
huber_regressor = HuberRegressor()

pipeline_huber = Pipeline([
    ("polynomial_features", polynomial_features),
    ("huber_regressor", huber_regressor)
])

# Modelni fit qilish
pipeline_huber.fit(X, y)

# Bashorat qilish
y_pred_huberp = pipeline_huber.predict(X_test)
y_pred_huberp.mean()

np.float64(9.557784211864561)

In [16]:
from sklearn.ensemble import StackingRegressor

# Define the pipelines
linear_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])

ridge_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', Ridge(alpha=1.0))
])

lasso_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', Lasso())
])

huber_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', HuberRegressor())
])

ransac_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', RANSACRegressor())
])

elastic_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('elastic_net', ElasticNet())
])

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('ridge', ridge_pipeline),
        ('elastic_net', elastic_pipeline)
    ],
    final_estimator=HuberRegressor(),  # Bu yerda final estimator berilishi kerak
    cv=5
)

# Fit the stacking model
stacking_regressor.fit(X, y)

# Predict
y_pred_stec_huber = stacking_regressor.predict(X_test).round()



In [17]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Define the pipelines with StandardScaler added
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', Ridge(alpha=1.0))
])

lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', Lasso())
])

huber_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', HuberRegressor())
])

ransac_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', RANSACRegressor())
])

elastic_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Add scaler
    ('poly', PolynomialFeatures(degree=2)),
    ('elastic_net', ElasticNet())
])


# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('ridge', ridge_pipeline),
        ('elastic_net', elastic_pipeline)
    ],
    final_estimator=HuberRegressor(),
    cv=5
)

# Assuming X_train, y_train, X_test, y_test are defined

# Fit the stacking model
stacking_regressor.fit(X, y)

# Predict
y_pred_stec_huber1 = stacking_regressor.predict(X_test).round()


In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, ElasticNet, RANSACRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load data
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df = df[df['Sex'] != 'Diameter']
# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']),
        ('cat', OneHotEncoder(), ['Sex'])
    ]
)

# Define the pipelines with StandardScaler added
linear_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])

ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('ridge', Ridge(alpha=1.0))
])

lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('lasso', Lasso())
])

huber_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('huber', HuberRegressor())
])

ransac_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('ransac', RANSACRegressor())
])

elastic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('elastic_net', ElasticNet())
])

# Split the data
X = df.drop(columns=['Age'])  # Assuming 'Age' is the target variable
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('ridge', ridge_pipeline),
        ('elastic_net', elastic_pipeline)
    ],
    final_estimator=HuberRegressor(),
    cv=10
)

# Fit the stacking model
stacking_regressor.fit(X_train, y_train)

# Predict
y_pred_test = stacking_regressor.predict(X_test).round()

# Evaluate
mae = mean_absolute_error(y_test, y_pred_test)
print(f'Mean Absolute Error: {mae}')

# Predict on new test data
y_pred_final = stacking_regressor.predict(df_test).round()

Mean Absolute Error: 1.246


In [19]:
subm = pd.read_csv('sample_submission.csv')

subm['Age'] = y_pred_final

subm

subm.to_csv('EAB_submit.csv')

df1 = pd.read_csv('EAB_submit.csv')
df1.head()

Unnamed: 0.1,Unnamed: 0,id,Age
0,0,15000,6.0
1,1,15001,9.0
2,2,15002,5.0
3,3,15003,8.0
4,4,15004,6.0


In [20]:
df1 = df1.drop(columns=['Unnamed: 0'])
df1.to_csv('EAB_submission.csv', index=False)

In [21]:
df1

Unnamed: 0,id,Age
0,15000,6.0
1,15001,9.0
2,15002,5.0
3,15003,8.0
4,15004,6.0
...,...,...
9995,24995,9.0
9996,24996,8.0
9997,24997,10.0
9998,24998,9.0
