In [50]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [34]:
# Read the labelled training set and the unlabelled test set from the
# Kaggle input directory (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mse-2-dataset-1/train.csv",
        "/kaggle/input/mse-2-dataset-1/test.csv",
    )
)

In [35]:
# Number of missing values in each column of the training frame.
train.isna().sum()

Area                 170
Perimeter            275
Major_Axis_Length    248
Minor_Axis_Length    233
Convex_Area          150
Equiv_Diameter       133
Eccentricity         109
Solidity             270
Extent               224
Roundness            252
Aspect_Ration         97
Compactness          286
Class                272
dtype: int64

In [36]:
# Number of missing values in each column of the test frame.
test.isna().sum()

id                   0
Area                 0
Perimeter            0
Major_Axis_Length    0
Minor_Axis_Length    0
Convex_Area          0
Equiv_Diameter       0
Eccentricity         0
Solidity             0
Extent               0
Roundness            0
Aspect_Ration        0
Compactness          0
dtype: int64

In [37]:
# Print the dtype and non-null count of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2024 entries, 0 to 2023
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area               1854 non-null   float64
 1   Perimeter          1749 non-null   float64
 2   Major_Axis_Length  1776 non-null   float64
 3   Minor_Axis_Length  1791 non-null   float64
 4   Convex_Area        1874 non-null   float64
 5   Equiv_Diameter     1891 non-null   float64
 6   Eccentricity       1915 non-null   float64
 7   Solidity           1754 non-null   float64
 8   Extent             1800 non-null   float64
 9   Roundness          1772 non-null   float64
 10  Aspect_Ration      1927 non-null   float64
 11  Compactness        1738 non-null   float64
 12  Class              1752 non-null   object 
dtypes: float64(12), object(1)
memory usage: 205.7+ KB


In [38]:
# Keep the row identifiers for the submission file, then remove them from
# the test frame so it has the same feature columns as the training data.
# NOTE: this rebinds `test`, so re-running the cell without reloading the
# data raises KeyError because 'id' is already gone.
test_id = test['id']
test = test.drop(columns=['id'])

In [39]:
# Target column first, then everything else as the feature matrix.
# Rows with a missing 'Class' are still included at this point.
y = train['Class']
X = train.drop(columns=['Class'])

In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. Rows with a missing label have not been removed yet, so
# both parts may still contain NaN targets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Column names routed to each preprocessing branch, chosen by dtype.
# According to the train.info() output every feature column is float64,
# so `categorical_features` is empty for this dataset.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [42]:
# Numeric branch: fill gaps with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

In [43]:
# Apply each branch to its own set of columns and concatenate the results.
preprocessing = ColumnTransformer([
    ('num', numerical_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [44]:
# Gradient-boosted trees: many shallow trees with a small learning rate,
# each fitted on a 70% row subsample; seeded for reproducibility.
model = GradientBoostingClassifier(
    # ensemble size and step size
    n_estimators=920,
    learning_rate=0.02,
    subsample=0.7,
    # shape of the individual trees
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    # reproducibility
    random_state=42,
)

In [45]:
# Chain preprocessing and the classifier so both are fitted together and
# the same transformations are replayed at prediction time.
pipeline = Pipeline([
    ('preprocessor', preprocessing),
    ('model', model),
])

In [47]:
# Remove training rows whose target is missing: the classifier cannot be
# fitted on NaN labels. The boolean mask is built from y_train and applied
# to both objects through `.loc`, so rows are selected by label alignment
# in the same explicit way as the hold-out split is filtered later on.
not_null_index = y_train.notnull()
X_train = X_train.loc[not_null_index]
y_train = y_train.loc[not_null_index]

In [48]:
# Fit the preprocessing steps and the classifier on the labelled training rows.
pipeline.fit(X_train, y_train)

In [51]:
# Keep only hold-out rows that have a label, otherwise they cannot be scored.
not_null_index = y_test.notna()
X_test, y_test = X_test.loc[not_null_index], y_test.loc[not_null_index]

In [52]:
# Predicted classes for the labelled hold-out rows.
y_pred = pipeline.predict(X_test)

In [54]:
# Fraction of hold-out rows classified correctly.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy happens
# to be symmetric, but keeping the documented order avoids silently wrong
# results if this line is adapted to an asymmetric metric later.
accu = accuracy_score(y_test, y_pred)

In [55]:
# Show the hold-out accuracy computed in the previous cell.
print(accu)

0.8795518207282913


In [56]:
# Predicted classes for the competition test set (the 'id' column was
# dropped from `test` earlier).
y_final = pipeline.predict(test)

In [57]:
# Pair every test id with its predicted class.
submission = pd.DataFrame({'id': test_id, 'Class': y_final})

In [58]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv("submission.csv", index=False)