In [1]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train_df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [4]:
train_df.shape

(54273, 13)

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


In [6]:
train_df.isnull().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [7]:
train_df.accident.value_counts()

accident
None reported                             39896
At least 1 accident or damage reported    14377
Name: count, dtype: int64

In [8]:
train_df.clean_title.value_counts()

clean_title
Yes    54273
Name: count, dtype: int64

In [9]:
train_df.fuel_type.value_counts()

fuel_type
Gasoline          49439
Hybrid             1766
E85 Flex Fuel      1479
Diesel             1109
–                   294
Plug-In Hybrid      182
not supported         4
Name: count, dtype: int64

In [10]:
train_df = train_df.drop(columns=['id', 'clean_title'])

In [11]:
train_df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850


In [12]:
train_df.engine

0            375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel
1        300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...
2             300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel
3        335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...
4            200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel
                               ...                        
54268         445.0HP 4.4L 8 Cylinder Engine Gasoline Fuel
54269    220.0HP 2.0L 4 Cylinder Engine Flex Fuel Capab...
54270        420.0HP 3.6L V6 Cylinder Engine Gasoline Fuel
54271                                 4.0L H6 24V GDI DOHC
54272    261.0HP 2.0L 4 Cylinder Engine Gasoline/Mild E...
Name: engine, Length: 54273, dtype: object

##### Feature Engineering

In [13]:
# Define functions to extract HP, liters, and cylinders
def extract_hp(engine_str):
    match = re.search(r'(\d+\.?\d*)HP', engine_str)
    return float(match.group(1)) if match else None

def extract_liters(engine_str):
    match = re.search(r'(\d+\.?\d*)L', engine_str)
    return float(match.group(1)) if match else None

def extract_cylinders(engine_str):
    match = re.search(r'(\d+) Cylinder', engine_str)
    return int(match.group(1)) if match else None

# Apply the functions to create new columns
train_df['engine_hp'] = train_df['engine'].apply(extract_hp)
train_df['engine_liters'] = train_df['engine'].apply(extract_liters)
train_df['engine_cylinders'] = train_df['engine'].apply(extract_cylinders)

# Drop the original engine column
train_df.drop('engine', axis=1, inplace=True)

train_df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,price,engine_hp,engine_liters,engine_cylinders
0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed A/T,Blue,Gray,None reported,11000,375.0,3.5,6.0
1,BMW,335 i,2007,80000,Gasoline,6-Speed M/T,Black,Black,None reported,8250,300.0,3.0,6.0
2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed A/T,Purple,Beige,None reported,15000,300.0,4.2,8.0
3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,335.0,3.0,6.0
4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,7850,200.0,3.8,6.0


In [14]:
train_df.isnull().sum()

brand                  0
model                  0
model_year             0
milage                 0
fuel_type              0
transmission           0
ext_col                0
int_col                0
accident               0
price                  0
engine_hp           4057
engine_liters        606
engine_cylinders    4175
dtype: int64

In [15]:
train_df['engine_hp'] = train_df['engine_hp'].fillna(0)
train_df['engine_liters'] = train_df['engine_liters'].fillna(0)
train_df['engine_cylinders'] = train_df['engine_cylinders'].fillna(0)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Define the categorical columns
categorical_columns = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident']

# Define the ColumnTransformer with OneHotEncoder
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False), categorical_columns)
    ],
    remainder='passthrough'  # Keep the other columns (model_year, milage, engine_hp, engine_liters, engine_cylinders) as they are
)

# Apply the ColumnTransformer to the dataset
train_df_encoded = column_transformer.fit_transform(train_df)

# Convert the result back to a DataFrame
train_df_encoded = pd.DataFrame(train_df_encoded, columns=column_transformer.get_feature_names_out())

train_df_encoded.head()

Unnamed: 0,cat__brand_Acura,cat__brand_Alfa,cat__brand_Aston,cat__brand_Audi,cat__brand_BMW,cat__brand_Bentley,cat__brand_Bugatti,cat__brand_Buick,cat__brand_Cadillac,cat__brand_Chevrolet,...,cat__int_col_Yellow,cat__int_col_–,cat__accident_At least 1 accident or damage reported,cat__accident_None reported,remainder__model_year,remainder__milage,remainder__price,remainder__engine_hp,remainder__engine_liters,remainder__engine_cylinders
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2018.0,74349.0,11000.0,375.0,3.5,6.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2007.0,80000.0,8250.0,300.0,3.0,6.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2009.0,91491.0,15000.0,300.0,4.2,8.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2022.0,2437.0,63500.0,335.0,3.0,6.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,2001.0,111000.0,7850.0,200.0,3.8,6.0


In [17]:
y = train_df_encoded['remainder__price']
X = train_df_encoded.drop(columns='remainder__price')

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

KeyboardInterrupt: 