In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import plotly_express as px

In [42]:
df = pd.read_csv("../carsdata/used_car_dataset.csv", dtype='unicode')


In [43]:
df = df[
    [
        "price_sek",
        "mileage_km",
        "horse_power",
        "engine_size_ccm",
        "top_speed_km_h",
        "co2_emission_g/km",
        "fuel_consumption_mixed_l_100km",
        "fuel_consumption_highway_l_100km",
    ]
]
df.describe().T


Unnamed: 0,count,unique,top,freq
price_sek,134243,8187,129900.0,1406
mileage_km,134243,22650,0.0,13723
horse_power,115058,415,191.0,12537
engine_size_ccm,72110,473,1968.0,7471
top_speed_km_h,73306,218,210.0,5076
co2_emission_g/km,19262,293,149.0,706
fuel_consumption_mixed_l_100km,70854,210,5.7,2703
fuel_consumption_highway_l_100km,66892,105,4.5,3143


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134243 entries, 0 to 134242
Data columns (total 8 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   price_sek                         134243 non-null  object
 1   mileage_km                        134243 non-null  object
 2   horse_power                       115058 non-null  object
 3   engine_size_ccm                   72110 non-null   object
 4   top_speed_km_h                    73306 non-null   object
 5   co2_emission_g/km                 19262 non-null   object
 6   fuel_consumption_mixed_l_100km    70854 non-null   object
 7   fuel_consumption_highway_l_100km  66892 non-null   object
dtypes: object(8)
memory usage: 8.2+ MB


In [45]:
df = df.dropna(axis=0, how='any',inplace=False)

In [46]:
df

Unnamed: 0,price_sek,mileage_km,horse_power,engine_size_ccm,top_speed_km_h,co2_emission_g/km,fuel_consumption_mixed_l_100km,fuel_consumption_highway_l_100km
132,239900,12000,126.0,999.0,190.0,138.0,5.8,4.3
146,84900,67000,157.0,1598.0,204.0,155.0,8.8,5.4
148,209900,169000,245.0,1969.0,210.0,157.0,8.8,5.5
149,98000,217000,141.0,1968.0,158.0,206.0,9.9,6.6
150,99900,155950,141.0,1956.0,205.0,119.0,5.6,3.9
...,...,...,...,...,...,...,...,...
132734,219000,91420,181.0,1969.0,220.0,117.0,4.5,4.1
133276,189800,152520,145.0,2494.0,170.0,193.0,8.7,6.5
133482,39900,256000,126.0,1798.0,200.0,177.0,10.3,5.8
133483,299900,147500,525.0,6208.0,300.0,295.0,18.7,9.1


In [47]:
df.shape

(18219, 8)

In [48]:
number_of_samples, number_of_features = df.shape[0], df.shape[1]-1
number_of_features, number_of_samples

(7, 18219)

In [49]:
df.columns

Index(['price_sek', 'mileage_km', 'horse_power', 'engine_size_ccm',
       'top_speed_km_h', 'co2_emission_g/km', 'fuel_consumption_mixed_l_100km',
       'fuel_consumption_highway_l_100km'],
      dtype='object')

In [50]:
X, y = df.drop('price_sek', axis=1), df['price_sek']

In [51]:
X.head()

Unnamed: 0,mileage_km,horse_power,engine_size_ccm,top_speed_km_h,co2_emission_g/km,fuel_consumption_mixed_l_100km,fuel_consumption_highway_l_100km
132,12000,126.0,999.0,190.0,138.0,5.8,4.3
146,67000,157.0,1598.0,204.0,155.0,8.8,5.4
148,169000,245.0,1969.0,210.0,157.0,8.8,5.5
149,217000,141.0,1968.0,158.0,206.0,9.9,6.6
150,155950,141.0,1956.0,205.0,119.0,5.6,3.9


In [52]:
y.head()

132    239900
146     84900
148    209900
149     98000
150     99900
Name: price_sek, dtype: object

## Train test split 

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [54]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13664, 7), (4555, 7), (13664,), (4555,))

In [55]:
scaler = MinMaxScaler()
scaler.fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

scaled_X_train.min(), scaled_X_train.max(), scaled_X_test.min(), scaled_X_test.max()

(0.0, 1.0, -0.1848341232227489, 1.2708333333333333)

## Using Linear regression

In [56]:
model_ols = LinearRegression()
model_ols.fit(scaled_X_train, y_train)
model_ols.predict(scaled_X_test[0].reshape(1,-1))

array([337540.55671689])

In [63]:
y_test[:1]

74273    369900
Name: price_sek, dtype: object