In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the labelled dataset from train.csv in the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first five rows of the full (unsplit) frame.
data.head(n=5)

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [5]:
# Numeric model inputs. PRODUCT_ID and PRODUCT_LENGTH are deliberately absent:
# both columns are dropped from the frames passed to fit/predict, so selecting
# them here would make the ColumnTransformer fail on missing columns, and
# PRODUCT_LENGTH is the regression target, so it must never be used as a feature.
numeric_features = ['PRODUCT_TYPE_ID']

# Fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [6]:
# Text columns that this pipeline treats as categorical values.
categorical_features = ['TITLE', 'BULLET_POINTS', 'DESCRIPTION']

# Replace missing entries with the most frequent value, then one-hot encode;
# handle_unknown='ignore' keeps transform from raising on unseen categories.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [7]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [8]:
# End-to-end estimator: preprocessing followed by a seeded random forest.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

In [9]:
# Hyper-parameter grid for the 'regressor' pipeline step
# (3 * 4 * 2 * 3 * 3 = 216 candidate combinations).
param_grid = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__max_depth': [10, 20, 30, None],
    # 1.0 (consider every feature at each split) replaces 'auto': for
    # regressors 'auto' meant exactly that, but the string was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it fails parameter validation.
    'regressor__max_features': [1.0, 'sqrt'],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search with 5-fold cross-validation, scored by negative MSE
# (so higher is better), using all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

In [None]:
 grid_search.fit(train_data.drop(['PRODUCT_ID', 'PRODUCT_LENGTH'], axis=1), train_data['PRODUCT_LENGTH'])



In [None]:
# Score the fitted grid search on the held-out split.
X_test = test_data.drop(['PRODUCT_ID', 'PRODUCT_LENGTH'], axis=1)
y_test = test_data['PRODUCT_LENGTH']
test_predictions = grid_search.predict(X_test)

mse = mean_squared_error(y_test, test_predictions)
r2 = r2_score(y_test, test_predictions)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

In [None]:
new_data = pd.read_csv("sample_submission.csv")

# `new_predictions` was previously referenced without ever being assigned,
# which raised NameError. Compute it with the fitted search, dropping the same
# identifier/target columns that are removed from the training features.
# NOTE(review): assumes this file carries the feature columns the pipeline
# expects (TITLE, BULLET_POINTS, DESCRIPTION, PRODUCT_TYPE_ID) - TODO confirm;
# if it only holds ids/targets, load the real test-feature file here instead.
new_features = new_data.drop(columns=['PRODUCT_ID', 'PRODUCT_LENGTH'], errors='ignore')
new_predictions = grid_search.predict(new_features)
new_predictions