In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer

In [2]:
train_data = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')
sam_data = pd.read_csv('/kaggle/input/playground-series-s5e2/sample_submission.csv')

In [3]:
train_data.head(3)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732


# Missing values

In [4]:
train_data.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [5]:
test_data.isna().sum()

id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

# Convert to numerical

In [6]:
Categorical_columns = train_data.select_dtypes(include=['object', 'category']).columns
def mapping_to_num(data, column):
    mapping = {category: idx for idx, category in enumerate(data[column].astype('category').cat.categories)}
    data[column] = data[column].map(mapping)
    return data, mapping
mappings_dict = {}
columns_to_map = Categorical_columns
for col in columns_to_map:
    train_data, mapping = mapping_to_num(train_data, col)
    mappings_dict[col] = mapping

In [7]:
for col in columns_to_map:
    test_data, mapping = mapping_to_num(test_data, col)
    mappings_dict[col] = mapping

In [8]:
train_data.head(3)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,1.0,1.0,1.0,7.0,1.0,0.0,2.0,0.0,11.611723,112.15875
1,1,1.0,0.0,2.0,10.0,1.0,1.0,1.0,3.0,27.078537,68.88056
2,2,4.0,1.0,2.0,2.0,1.0,0.0,1.0,5.0,16.64376,39.1732


In [9]:
train_data.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

# Dealing with missing data

In [10]:
imputer = KNNImputer(n_neighbors=5)
train_imputed = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)

In [11]:
train_imputed.head(5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0.0,1.0,1.0,1.0,7.0,1.0,0.0,2.0,0.0,11.611723,112.15875
1,1.0,1.0,0.0,2.0,10.0,1.0,1.0,1.0,3.0,27.078537,68.88056
2,2.0,4.0,1.0,2.0,2.0,1.0,0.0,1.0,5.0,16.64376,39.1732
3,3.0,2.0,2.0,2.0,8.0,1.0,0.0,1.0,3.0,12.93722,80.60793
4,4.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,17.749338,86.02312


In [12]:
train_imputed.isna().sum()

id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
dtype: int64

In [13]:
test_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

In [14]:
test_imputed.isna().sum()

id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
dtype: int64

In [15]:
X = train_imputed.drop('Price', axis=1)
y = train_imputed['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1565.8570850400617


In [17]:
prediction = model.predict(test_imputed)

In [18]:
sam_data['Price']= prediction
sam_data['id'] = test_data['id']
sam_data.to_csv('submission.csv',index=False)

In [19]:
sam_data.head()

Unnamed: 0,id,Price
0,300000,81.333069
1,300001,86.222404
2,300002,80.258693
3,300003,79.886188
4,300004,72.951912
