In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Importing the dataset into a pandas datafr
car_info_df = pd.read_csv('cars.csv')
car_info_df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [3]:
# Remove Unwanted Columns - 'manufacturer_name', 'model_name', 'engine_fuel','engine_has_gas', 'engine_type', 'engine_capacity','has_warranty', 'is_exchangeable', 'state', 'location_region', drivetrain',  'number_of_photos','up_counter', 'feature_0', 'feature_1','feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7'
unwanted_columns = ['manufacturer_name', 'model_name', 'engine_fuel','engine_has_gas', 'engine_type', 'engine_capacity','has_warranty', 'is_exchangeable', 'state', 'location_region', 'drivetrain',  'number_of_photos','up_counter', 'feature_0', 'feature_1','feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', "feature_8", "feature_9"]

car_info_df_dropped = car_info_df.drop(labels=unwanted_columns, axis=1)
# car_info_df_dropped.head()

In [4]:
# Seperate X and y (features and label)  The last feature "duration_listed" is the label (y)
# Seperate X vs Y
car_info_df_label = car_info_df_dropped.iloc[:,[-1]]
car_info_df_features = car_info_df_dropped.drop(car_info_df_label.columns, axis = 1)

car_info_df_features.head()

Unnamed: 0,transmission,color,odometer_value,year_produced,body_type,price_usd
0,automatic,silver,190000,2010,universal,10900.0
1,automatic,blue,290000,2002,universal,5000.0
2,automatic,red,402000,2001,suv,2800.0
3,mechanical,blue,10000,1999,sedan,9999.0
4,automatic,black,280000,2001,universal,2134.11


In [5]:
# Do the ordinal Encoder for car type to reflect that some cars are bigger than others.  
# This is the order 'universal','hatchback', 'cabriolet','coupe','sedan','liftback', 'suv', 'minivan', 'van','pickup', 'minibus','limousine'
# make sure this is the entire set by using unique()
body_type_order = ['universal','hatchback', 'cabriolet','coupe','sedan','liftback', 'suv', 'minivan', 'van','pickup', 'minibus','limousine']
body_type_vals = car_info_df["body_type"].unique()
if sorted(body_type_order) == sorted(body_type_vals):
    print(sorted(body_type_order))
    print(sorted(body_type_vals))
    print("All values are contained in body type column")
else:
    print("Different number of attributes in data than ranked list")
    print(body_type_vals)

['cabriolet', 'coupe', 'hatchback', 'liftback', 'limousine', 'minibus', 'minivan', 'pickup', 'sedan', 'suv', 'universal', 'van']
['cabriolet', 'coupe', 'hatchback', 'liftback', 'limousine', 'minibus', 'minivan', 'pickup', 'sedan', 'suv', 'universal', 'van']
All values are contained in body type column


In [6]:
body_type_rankings = [[i, body_type] for body_type, i in enumerate(body_type_order)]
body_types_encoded = OrdinalEncoder()
body_types_encoded.fit(body_type_rankings)

body_types_encoded.categories_

[array(['cabriolet', 'coupe', 'hatchback', 'liftback', 'limousine',
        'minibus', 'minivan', 'pickup', 'sedan', 'suv', 'universal', 'van'],
       dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype=object)]

In [7]:
# create a seperate dataframe for the ordinal number - so you must strip it out and save the column
# make sure to save the OrdinalEncoder for future encoding due to inference
numpy_col_vals = car_info_df["body_type"].to_numpy().reshape(-1, 1)
transformed_body_types =  body_types_encoded.fit_transform(numpy_col_vals)

car_info_df_body_encoded = pd.DataFrame(transformed_body_types, columns=["body_type_transformed"])

car_info_df_body_encoded.head()

Unnamed: 0,body_type_transformed
0,10.0
1,10.0
2,9.0
3,8.0
4,10.0


In [8]:
# Do onehotencoder for Transmission only - again you need to make a new dataframe with just the encoding of the transmission
# save the OneHotEncoder to use for future encoding of transmission due to inference
# car_info_df_trans_encoded = car_info_df.copy(deep=True)
transmission_encoder = OneHotEncoder()
transmission_encoder.fit(car_info_df["transmission"].to_numpy().reshape(-1, 1))
transmission_encoder.categories_

[array(['automatic', 'mechanical'], dtype=object)]

In [9]:
encoded_trans_values = transmission_encoder.fit_transform(car_info_df[['transmission']]).toarray()

In [10]:
car_info_df_trans_encoded = pd.DataFrame(encoded_trans_values, columns=transmission_encoder.get_feature_names())
#car_info_df_trans_encoded.head()

In [11]:
# Do onehotencoder for Color
# Save the OneHotEncoder to use for future encoding of color for inference
color_encoder = OneHotEncoder()
color_encoder.fit(car_info_df["color"].to_numpy().reshape(-1, 1))
#color_encoder.categories_

OneHotEncoder()

In [12]:
encoded_color_values = color_encoder.fit_transform(car_info_df[['color']]).toarray()

In [13]:
car_info_df_color_encoded = pd.DataFrame(encoded_color_values, columns=color_encoder.get_feature_names())

In [14]:
# the all three together encodings into 1 data frame (need 2 steps with "concatenate")
# add the ordinal and transmission then add color
transformed_dfs = [car_info_df_body_encoded, car_info_df_trans_encoded, car_info_df_color_encoded]

transformed_col_df = pd.concat(transformed_dfs, axis=1)
transformed_col_df.head()

Unnamed: 0,body_type_transformed,x0_automatic,x0_mechanical,x0_black,x0_blue,x0_brown,x0_green,x0_grey,x0_orange,x0_other,x0_red,x0_silver,x0_violet,x0_white,x0_yellow
0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,10.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,8.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# then dd to original data set
total_df = pd.concat([car_info_df_features, transformed_col_df], axis=1)
total_df.head()

Unnamed: 0,transmission,color,odometer_value,year_produced,body_type,price_usd,body_type_transformed,x0_automatic,x0_mechanical,x0_black,...,x0_brown,x0_green,x0_grey,x0_orange,x0_other,x0_red,x0_silver,x0_violet,x0_white,x0_yellow
0,automatic,silver,190000,2010,universal,10900.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,automatic,blue,290000,2002,universal,5000.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,automatic,red,402000,2001,suv,2800.0,9.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,mechanical,blue,10000,1999,sedan,9999.0,8.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,automatic,black,280000,2001,universal,2134.11,10.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# delete the columns that are substituted by ordinal and onehot - delete the text columns for color, transmission, and car type 
preprocessed_features_df = total_df.drop(["body_type", "transmission", "color"], axis=1)
preprocessed_features_df.head()

Unnamed: 0,odometer_value,year_produced,price_usd,body_type_transformed,x0_automatic,x0_mechanical,x0_black,x0_blue,x0_brown,x0_green,x0_grey,x0_orange,x0_other,x0_red,x0_silver,x0_violet,x0_white,x0_yellow
0,190000,2010,10900.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,290000,2002,5000.0,10.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,402000,2001,2800.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,10000,1999,9999.0,8.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,280000,2001,2134.11,10.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Splitting the dataset into the Training set and Test set - use trian_test_split
# Split into train/test/validation datasets
test__perc = .20      # 20% for test split
feature_train, feature_test, label_train, label_test = train_test_split(preprocessed_features_df, car_info_df_label, test_size= test__perc)

# Feature Scaling - required due to different orders of magnitude across the features
# make sure to save the scaler for future use in inference

# get feature mean and std
feature_info = {}
feature_bt_xform_info = {}
feature_odometer_info = {}
feature_year_info = {}
feature_price_info = {}

feature_bt_xform_info["mean"] = feature_train["body_type_transformed"].mean()
feature_bt_xform_info["std"] = feature_train["body_type_transformed"].std()
feature_info["bt_xform"] = feature_bt_xform_info

feature_odometer_info["mean"] = feature_train["odometer_value"].mean()
feature_odometer_info["std"] = feature_train["odometer_value"].std()
feature_info["odometer"] = feature_odometer_info

feature_year_info["mean"] = feature_train["year_produced"].mean()
feature_year_info["std"] = feature_train["year_produced"].std()
feature_info["year"] = feature_year_info

feature_price_info["mean"] = feature_train["price_usd"].mean()
feature_price_info["std"] = feature_train["price_usd"].std()
feature_info["price"] = feature_price_info

# scale feature train inputs
feature_train.loc[:, "body_transformed_scaled"] = (feature_train["body_type_transformed"]-feature_bt_xform_info["mean"])/feature_bt_xform_info["std"] 
feature_train.loc[:, "odometer_value_scaled"] = (feature_train["odometer_value"]-feature_odometer_info["mean"])/feature_odometer_info["std"]
feature_train.loc[:, "year_produced_scaled"] = (feature_train["year_produced"]-feature_year_info["mean"])/feature_year_info["std"]
feature_train.loc[:, "price_usd_scaled"] = (feature_train["price_usd"]-feature_price_info["mean"])/feature_price_info["std"]

# scale feature test inputs with train mean/std from training set
feature_test.loc[:, "body_transformed_scaled"] = (feature_test["body_type_transformed"]-feature_bt_xform_info["mean"])/feature_bt_xform_info["std"] 
feature_test.loc[:, "odometer_value_scaled"] = (feature_test["odometer_value"]-feature_odometer_info["mean"])/feature_odometer_info["std"]
feature_test.loc[:, "year_produced_scaled"] = (feature_test["year_produced"]-feature_year_info["mean"])/feature_year_info["std"]
feature_test.loc[:, "price_usd_scaled"] = (feature_test["price_usd"]-feature_price_info["mean"])/feature_price_info["std"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [18]:
feature_train.head()

Unnamed: 0,odometer_value,year_produced,price_usd,body_type_transformed,x0_automatic,x0_mechanical,x0_black,x0_blue,x0_brown,x0_green,...,x0_other,x0_red,x0_silver,x0_violet,x0_white,x0_yellow,body_transformed_scaled,odometer_value_scaled,year_produced_scaled,price_usd_scaled
19432,320000,1997,4400.0,9.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.754509,0.51992,-0.742614,-0.349835
38263,294510,2002,3700.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.412942,0.333434,-0.122266,-0.457939
5,132449,2011,14700.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.096077,-0.852212,0.994359,1.240839
21870,340000,2001,5300.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.412942,0.666241,-0.246336,-0.210844
17968,85889,1985,386.19,10.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.096077,-1.192847,-2.231447,-0.969704


In [19]:
print(feature_train.columns)

Index(['odometer_value', 'year_produced', 'price_usd', 'body_type_transformed',
       'x0_automatic', 'x0_mechanical', 'x0_black', 'x0_blue', 'x0_brown',
       'x0_green', 'x0_grey', 'x0_orange', 'x0_other', 'x0_red', 'x0_silver',
       'x0_violet', 'x0_white', 'x0_yellow', 'body_transformed_scaled',
       'odometer_value_scaled', 'year_produced_scaled', 'price_usd_scaled'],
      dtype='object')


In [20]:
feature_train.drop(["odometer_value", "year_produced", "price_usd", "body_type_transformed"], axis=1)
print(feature_train.columns)

Index(['odometer_value', 'year_produced', 'price_usd', 'body_type_transformed',
       'x0_automatic', 'x0_mechanical', 'x0_black', 'x0_blue', 'x0_brown',
       'x0_green', 'x0_grey', 'x0_orange', 'x0_other', 'x0_red', 'x0_silver',
       'x0_violet', 'x0_white', 'x0_yellow', 'body_transformed_scaled',
       'odometer_value_scaled', 'year_produced_scaled', 'price_usd_scaled'],
      dtype='object')


In [21]:
feature_train = feature_train.drop(["odometer_value", "year_produced", "price_usd", "body_type_transformed"], axis=1)
print(feature_train.columns)
feature_test = feature_test.drop(["odometer_value", "year_produced", "price_usd", "body_type_transformed"], axis=1)

Index(['x0_automatic', 'x0_mechanical', 'x0_black', 'x0_blue', 'x0_brown',
       'x0_green', 'x0_grey', 'x0_orange', 'x0_other', 'x0_red', 'x0_silver',
       'x0_violet', 'x0_white', 'x0_yellow', 'body_transformed_scaled',
       'odometer_value_scaled', 'year_produced_scaled', 'price_usd_scaled'],
      dtype='object')


In [22]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(feature_train, label_train)

LinearRegression()

In [23]:
regressor.score(feature_test, label_test)

0.003354007740326792

In [24]:
prediction = regressor.predict(feature_test)

In [25]:
mse = mean_squared_error(label_test, prediction)

In [26]:
print(mse)

12535.956045014394
