In [83]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [67]:
# Load the training data from the working directory into `train`.
train = pd.read_csv("train.csv")

In [68]:
# Show the (rows, columns) shape, then preview the first rows of the frame.
display(train.shape)
train.head()

(54273, 13)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [69]:
# Summarise column dtypes and non-null counts to spot missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


In [70]:
# Report how many distinct values each text (object-dtype) column holds.
categorical_cols = list(train.select_dtypes("object").columns)
for col, n_distinct in train[categorical_cols].nunique().items():
    display(f"{col}: {n_distinct}")

'brand: 53'

'model: 1827'

'fuel_type: 7'

'engine: 1061'

'transmission: 46'

'ext_col: 260'

'int_col: 124'

'accident: 2'

'clean_title: 1'

- Since the column `clean_title` contains only a single value, we will drop it.

In [71]:
# `clean_title` holds a single value, so it carries no signal for the model.
# Rebinding (instead of dropping in place) with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
train = train.drop(columns=["clean_title"], errors="ignore")

- In each categorical column, we will club values with less than 1% representation into a single `others` category.
- Since the columns `model` and `engine` have very high cardinality, we will encode each value with its frequency count instead.
- In the column `fuel_type`, every value other than `Gasoline` is clubbed into a single `others` category.

In [72]:
# Simplify every text column of `train`:
#   * `model` / `engine`  -> replaced by how often the value occurs,
#   * `fuel_type`         -> "Gasoline" kept, everything else becomes "others",
#   * all other columns   -> values covering < 1% of rows become "others".
categorical_cols = train.select_dtypes("object").columns.tolist()
for col in categorical_cols:
    values = train[col]
    if col in ("model", "engine"):
        train[col] = values.map(values.value_counts().to_dict())
    elif col == "fuel_type":
        train[col] = values.where(values == "Gasoline", "others")
    else:
        share_pct = values.value_counts(normalize=True) * 100
        rare_values = share_pct[share_pct < 1].index.tolist()
        if rare_values:
            train[col] = values.mask(values.isin(rare_values), "others")

In [73]:
# Replace each remaining text column with integer category codes
# (codes follow the sorted order of the column's distinct values).
categorical_cols = list(train.select_dtypes("object").columns)
for col in categorical_cols:
    train[col] = train[col].astype("category").cat.codes

- Dropping the columns `ext_col` and `int_col` (personal preference: I do not see color as a substantial factor in pricing)

In [74]:
# Target is the sale price; the identifier, `model`, both colour columns and
# the target itself are excluded from the feature matrix.
y = train["price"]
X = train.drop(
    columns=["id", "model", "int_col", "ext_col", "price"],
)

# Hold out 10% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [75]:
# Two-stage estimator: standardise the features, then fit a linear regression
# whose coefficients are constrained to be non-negative (positive=True).
# NOTE(review): the non-negativity constraint rules out negative effects
# (e.g. a price that falls as `milage` rises) -- confirm this is intended.
pl = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model_", LinearRegression(positive=True)),
    ]
)

In [76]:
# Fit the scaler and the regression on the training split.
pl.fit(X_train, y_train)

In [77]:
# Predict prices for the held-out split.
y_pred = pl.predict(X_test)

In [78]:
# Root-mean-squared error of the predictions on the held-out split.
root_mean_squared_error(y_test, y_pred)

47996.15084205275

In [79]:
# Prepare the competition test file and score it with the fitted pipeline.
#
# Every encoding below is derived from the TRAINING file, not from the test
# file, so that a given brand / engine / transmission string is turned into
# the same number the model saw while fitting. Computing counts, rarity
# thresholds or category codes on the test data itself would silently shift
# the meaning of the features. The raw training file is re-read here because
# `train` has already been encoded in place by earlier cells.
train_raw = pd.read_csv("train.csv").drop(columns=["clean_title"])
test = pd.read_csv("test.csv").drop(columns=["clean_title"])

# Count-encode `model` and `engine` with their training frequencies; values
# never seen in training get a count of 0.
for col in ["model", "engine"]:
    train_counts = train_raw[col].value_counts()
    test[col] = test[col].map(train_counts).fillna(0).astype("int64")

# Remaining text columns, taken from the raw training frame so this cell does
# not depend on the notebook-global `categorical_cols`.
text_cols = [
    col
    for col in train_raw.select_dtypes("object").columns
    if col not in ("model", "engine")
]
for col in text_cols:
    if col == "fuel_type":
        # Same rule as for training: keep "Gasoline", club everything else.
        train_vals = train_raw[col].where(train_raw[col] == "Gasoline", "others")
        test_vals = test[col].where(test[col] == "Gasoline", "others")
    else:
        # Rarity (< 1% of rows) is judged on the training distribution.
        share = train_raw[col].value_counts(normalize=True) * 100
        rare = share[share < 1].index
        frequent = share[share >= 1].index
        train_vals = train_raw[col].mask(train_raw[col].isin(rare), "others")
        # Rare-in-training and unseen test values both fall into "others".
        test_vals = test[col].where(test[col].isin(frequent), "others")
    # Reuse the training category order so the integer codes line up.
    # If training had no "others" bucket for this column, unseen test values
    # receive the code -1.
    train_categories = pd.Categorical(train_vals).categories
    test[col] = pd.Categorical(test_vals, categories=train_categories).codes

X_pred = test.drop(columns=["id", "model", "int_col", "ext_col"])
y_prediction = pl.predict(X_pred)

In [80]:
# Pair every test id with its predicted price.
submission_ids = test["id"].tolist()
submission_prices = y_prediction.tolist()
submission = pd.DataFrame({"id": submission_ids, "price": submission_prices})

In [81]:
# Write the submission file without the DataFrame index column.
submission.to_csv("arijit_submission.csv", index=False)