In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix, accuracy_score, roc_curve, roc_auc_score,log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, LogisticRegression, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [7]:
train_df = pd.read_csv("train.csv",index_col = 0)

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         188533 non-null  object
 1   model         188533 non-null  object
 2   model_year    188533 non-null  int64 
 3   milage        188533 non-null  int64 
 4   fuel_type     183450 non-null  object
 5   engine        188533 non-null  object
 6   transmission  188533 non-null  object
 7   ext_col       188533 non-null  object
 8   int_col       188533 non-null  object
 9   accident      186081 non-null  object
 10  clean_title   167114 non-null  object
 11  price         188533 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 18.7+ MB


In [13]:
train_df.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [23]:
train_df['brand'].value_counts()


brand
Ford             23088
Mercedes-Benz    19172
BMW              17028
Chevrolet        16335
Audi             10887
Porsche          10612
Land              9525
Toyota            8850
Lexus             8643
Jeep              6474
Cadillac          4674
RAM               4249
Nissan            3930
Tesla             3738
INFINITI          3276
GMC               3215
Dodge             3133
Mazda             2719
Kia               2497
Lincoln           2423
Subaru            2381
Acura             2282
Honda             2101
Hyundai           2045
Volkswagen        1765
Jaguar            1319
Bentley           1155
MINI              1064
Genesis            969
Buick              940
Maserati           939
Lamborghini        809
Chrysler           727
Volvo              723
Alfa               682
Rivian             590
Rolls-Royce        561
Mitsubishi         551
Pontiac            538
Hummer             520
Ferrari            359
McLaren            243
Aston              238
Satur

In [27]:
train_df['fuel_type'].value_counts()


fuel_type
Gasoline          165940
Hybrid              6832
E85 Flex Fuel       5406
Diesel              3955
–                    781
Plug-In Hybrid       521
not supported         15
Name: count, dtype: int64

In [90]:
train_df['engine'].value_counts()


engine
355.0HP 5.3L 8 Cylinder Engine Gasoline Fuel           3462
240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel           2902
420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel           2841
2.0L I4 16V GDI DOHC Turbo                             2680
375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel          2451
                                                       ... 
151.0HP 1.5L 4 Cylinder Engine Gas/Electric Hybrid        1
184.0HP 2.4L 4 Cylinder Engine Flex Fuel Capability       1
78.0HP 1.2L 3 Cylinder Engine Gasoline Fuel               1
139.0HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas       1
313.0HP 2.0L 4 Cylinder Engine Plug-In Electric/Gas       1
Name: count, Length: 1117, dtype: int64

In [92]:
imputer = SimpleImputer(strategy = 'constant', fill_value = 'unknown')

In [94]:
ohe = OneHotEncoder(handle_unknown = 'infrequent_if_exist', sparse_output = False,
                   min_frequency = 10000).set_output(transform = 'pandas')
df = train_df[['engine','transmission']]
ohe_df = ohe.fit_transform(df)

In [95]:
ohe_df.columns

Index(['engine_infrequent_sklearn', 'transmission_6-Speed A/T',
       'transmission_6-Speed M/T', 'transmission_7-Speed A/T',
       'transmission_8-Speed A/T', 'transmission_A/T',
       'transmission_Automatic', 'transmission_Transmission w/Dual Shift Mode',
       'transmission_infrequent_sklearn'],
      dtype='object')

In [98]:
X_train = train_df.drop('price',axis=1)
y_train = train_df['price']

In [100]:
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

In [102]:
ct_ohe = make_column_transformer(('passthrough',make_column_selector(dtype_exclude = object) ),
                                  (ohe,make_column_selector(dtype_include = object) ),
                                  verbose_feature_names_out = False).set_output(transform = 'pandas')


In [104]:
X_train_ohe = ct_ohe.fit_transform(X_train)

In [105]:
X_train_ohe.isnull().sum().sum()

0

In [106]:
X_test = pd.read_csv("test.csv",index_col = 0)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125690 entries, 188533 to 314222
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         125690 non-null  object
 1   model         125690 non-null  object
 2   model_year    125690 non-null  int64 
 3   milage        125690 non-null  int64 
 4   fuel_type     122307 non-null  object
 5   engine        125690 non-null  object
 6   transmission  125690 non-null  object
 7   ext_col       125690 non-null  object
 8   int_col       125690 non-null  object
 9   accident      124058 non-null  object
 10  clean_title   111451 non-null  object
dtypes: int64(2), object(9)
memory usage: 11.5+ MB


In [110]:
X_test_ohe = ct_ohe.fit_transform(X_test)

In [131]:
el = ElasticNet()
kfold = KFold(n_splits = 5, shuffle = True,random_state = 24)
pipe = Pipeline([('CT',ct_ohe),('EL',el)])
params = {'EL__alpha': np.linspace(0.0001,5,3),
          'EL__l1_ratio':np.linspace(0.0001,1,3)}
gcv_el = GridSearchCV(pipe,param_grid = params,verbose = 3, cv = kfold) 
gcv_el.fit(X_train,y_train)
#lr.fit(X_train_ohe,y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END EL__alpha=0.0001, EL__l1_ratio=0.0001;, score=0.088 total time=   4.2s


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END EL__alpha=0.0001, EL__l1_ratio=0.0001;, score=0.076 total time=   4.4s


  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END EL__alpha=0.0001, EL__l1_ratio=0.0001;, score=0.085 total time=   4.3s


  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END EL__alpha=0.0001, EL__l1_ratio=0.0001;, score=0.106 total time=   4.3s


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END EL__alpha=0.0001, EL__l1_ratio=0.0001;, score=0.086 total time=   4.2s


  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END EL__alpha=0.0001, EL__l1_ratio=0.50005;, score=0.088 total time=   4.3s


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END EL__alpha=0.0001, EL__l1_ratio=0.50005;, score=0.076 total time=   4.2s


  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END EL__alpha=0.0001, EL__l1_ratio=0.50005;, score=0.085 total time=   4.2s


  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END EL__alpha=0.0001, EL__l1_ratio=0.50005;, score=0.106 total time=   4.2s


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END EL__alpha=0.0001, EL__l1_ratio=0.50005;, score=0.086 total time=   4.1s


  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END EL__alpha=0.0001, EL__l1_ratio=1.0;, score=0.088 total time=   5.6s


  model = cd_fast.enet_coordinate_descent(


[CV 2/5] END EL__alpha=0.0001, EL__l1_ratio=1.0;, score=0.076 total time=   5.4s


  model = cd_fast.enet_coordinate_descent(


[CV 3/5] END EL__alpha=0.0001, EL__l1_ratio=1.0;, score=0.085 total time=   5.5s


  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END EL__alpha=0.0001, EL__l1_ratio=1.0;, score=0.106 total time=   5.6s


  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END EL__alpha=0.0001, EL__l1_ratio=1.0;, score=0.086 total time=   5.7s
[CV 1/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.0001;, score=0.085 total time=   2.3s
[CV 2/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.0001;, score=0.073 total time=   2.3s
[CV 3/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.0001;, score=0.082 total time=   2.3s
[CV 4/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.0001;, score=0.101 total time=   2.2s
[CV 5/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.0001;, score=0.083 total time=   2.2s
[CV 1/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.50005;, score=0.085 total time=   2.2s
[CV 2/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.50005;, score=0.074 total time=   2.2s
[CV 3/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.50005;, score=0.082 total time=   2.2s
[CV 4/5] END EL__alpha=2.5000500000000003, EL__l1_ratio=0.50005;, score=0.102 total time=   2.1s
[CV 5/5] END EL__alpha=2.5000500000000003, EL__l1_r

  model = cd_fast.enet_coordinate_descent(


In [136]:
print("Best Param:",gcv_el.best_params_)
print("Best Score:",gcv_el.best_score_)

Best Param: {'EL__alpha': 0.0001, 'EL__l1_ratio': 0.0001}
Best Score: 0.08826317595530367


In [138]:
bm_el = gcv_el.best_estimator_
y_pred = bm_el.predict(X_test)

In [142]:
sample = pd.read_csv('sample_submission.csv')
sample['price'] = y_pred
sample.to_csv('enr_5_nov_1.csv',index=False)