In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the raw dataset from the working directory; the row count is shown as
# the cell output.
df = pd.read_csv('data.csv')
len(df)

11914

In [None]:
# Preview the first rows before any cleaning is applied.
df.head()

In [None]:
# Normalize the header: lower-case names with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Columns stored with the generic `object` dtype are treated as text columns.
string_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# Apply the same lower-case / underscore normalization to every text value.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [None]:
# Preview again to check the normalized column names and string values.
df.head()

### Exploratory Data Analysis

In [None]:
# Histogram of the raw MSRP values over the whole dataset.
plt.figure(figsize=(6, 4))

sns.histplot(df.msrp, bins=40, color='black', alpha=1).set(
    ylabel='Frequency',
    xlabel='Price',
    title='Distribution of prices',
)

plt.show()

In [None]:
# Same histogram restricted to cars priced below 100000, so the bulk of the
# distribution is not squeezed by the most expensive cars.
plt.figure(figsize=(6, 4))

affordable_prices = df.msrp[df.msrp < 100000]
sns.histplot(affordable_prices, bins=40, color='black', alpha=1).set(
    ylabel='Frequency',
    xlabel='Price',
    title='Distribution of prices',
)

plt.show()

In [None]:
# log1p(x) = log(1 + x); the same transform is applied to the model target.
log_price = np.log1p(df.msrp)

plt.figure(figsize=(6, 4))

sns.histplot(log_price, bins=40, color='black', alpha=1).set(
    ylabel='Frequency',
    xlabel='Log(Price + 1)',
    title='Distribution of prices after log transformation',
)

plt.show()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Validation Framework

In [None]:
# Fix the global NumPy seed so the shuffle below (and therefore the split) is
# the same on every run.
np.random.seed(2)

n = len(df)

# 20% validation, 20% test; the training set takes whatever remains so the
# three sizes always add up to n despite the int() truncation.
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

# Shuffle row positions in place, then reorder the frame with them.
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

# Consecutive slices of the shuffled frame: train, then validation, then test.
# Each part is copied so it is an independent frame.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [None]:
# Pull the target column out of each split: pop() returns the `msrp` column
# and removes it from the frame, so the price cannot leak into the features.
y_train_orig = df_train.pop('msrp').values
y_val_orig = df_val.pop('msrp').values
y_test_orig = df_test.pop('msrp').values

# The model is trained on log1p(price); the *_orig arrays keep the raw prices.
y_train = np.log1p(y_train_orig)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

In [None]:
# Inspect a single training example (the row at position 10 of df_train).
df_train.iloc[10]

### Linear Regression

In [None]:
# Example feature vector with three values, used to walk through the formula.
# NOTE(review): presumably taken from the training row inspected above — confirm.
xi = [453, 11, 86]

In [None]:
# Hand-written bias term (w0) and one weight per entry of xi. These are read
# as globals by the linear_regression definitions below.
w0 = 7.17
w = [0.01, 0.04, 0.002]

In [None]:
def linear_regression(xi):
    """Predict for one feature vector: w0 + sum_j w[j] * xi[j].

    Reads the bias `w0` and the weight sequence `w` from the notebook's
    global namespace. `w` must have at least len(xi) entries.
    """
    prediction = w0

    # Accumulate one weighted feature at a time, in feature order.
    for j, feature in enumerate(xi):
        prediction = prediction + w[j] * feature

    return prediction

In [None]:
# Prediction for the example vector using the hand-written w0 and w.
linear_regression(xi)

In [None]:
# expm1(x) = exp(x) - 1 is the inverse of log1p, so this maps a value on the
# log1p scale back to the original scale. 12.312 equals
# 7.17 + 0.01*453 + 0.04*11 + 0.002*86, i.e. the hand-weighted example above.
np.expm1(12.312)

In [None]:
# Round-trip check: log1p applied to a raw-scale value returns the
# log1p-scale value (expm1 and log1p are inverses of each other).
np.log1p(222347.2221101062)

### Linear Regression Vector Form

In [None]:
def dot(xi, w):
    """Return the dot product of `xi` with the first len(xi) entries of `w`.

    The accumulator starts at 0.0, so an empty `xi` yields 0.0. Raises
    IndexError if `w` is shorter than `xi`.
    """
    total = 0.0

    for j, value in enumerate(xi):
        total = total + value * w[j]

    return total

In [None]:
def linear_regression(xi):
    """Predict for one feature vector as bias plus dot product.

    Uses the notebook-level `w0`, `w` and `dot`.
    """
    weighted_sum = dot(xi, w)
    return w0 + weighted_sum

In [None]:
# Put the bias first so the whole model is a single weight list; it pairs
# with a feature vector that starts with a constant 1.
w_new = [w0] + w

In [None]:
# Display the combined weight list (bias followed by the feature weights).
w_new

In [None]:
def linear_regression(xi):
    """Predict for one feature list using the bias-first weights `w_new`.

    A constant 1 is placed in front of the features so the bias stored at
    w_new[0] is handled by the same dot product. The caller's list is not
    modified, because list concatenation builds a new list.
    """
    return dot([1] + xi, w_new)

In [None]:
# Prediction for the example vector with the bias-first formulation.
linear_regression(xi)

In [None]:
# Three example rows; each starts with a constant 1 that pairs with the bias
# weight. x10 carries the same three feature values as the xi example.
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]

# Stack the rows into a (3, 4) matrix and display it.
X = np.array([x1, x2, x10])
X

In [None]:
# Matrix-vector product: one prediction per row of X in a single call.
X.dot(w_new)

### Train a Linear Regression Model

In [None]:
def train_linear_regression(X, y):
    """Placeholder with no body yet; always returns None.

    The working implementation is defined further down in the notebook.
    """
    return None

In [None]:
X = [
    [148, 24, 1385],
    [132, 25, 2031],
    [453, 11, 86],
    [158, 24, 185],
    [172, 25, 201],
    [413, 11, 86],
    [38, 54, 185],
    [142, 25, 431],
    [453, 31, 86],
] 

X = np.array(X)
X

In [None]:
# One 1.0 per row of X; this becomes the bias column.
ones = np.ones(X.shape[0])
ones

In [None]:
# Prepend the bias column to X. NOTE: X is rebound from itself, so running
# this cell a second time adds another column of ones.
X = np.column_stack([ones, X])

In [None]:
# Toy target values, one per row of X (nine in total).
y = [10000, 20000, 15000, 20050, 10000, 20000, 15000, 25000, 12000]

In [None]:
# Gram matrix X^T X, the first ingredient of the normal equation.
XTX = X.T.dot(X)

In [None]:
# Inverse of the Gram matrix; np.linalg.inv raises LinAlgError if XTX is
# singular.
XTX_inv = np.linalg.inv(XTX)

In [None]:
# Normal equation: w = (X^T X)^-1 X^T y. The first entry pairs with the
# column of ones, so it is the bias.
w_full = XTX_inv.dot(X.T).dot(y)

In [None]:
# Split the solution into bias and feature weights. NOTE: this rebinds the
# notebook-level w0 and w that were set by hand earlier.
w0 = w_full[0]
w = w_full[1:]

In [None]:
# Display the fitted bias and feature weights.
w0, w

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    Parameters
    ----------
    X : 2-D array of features WITHOUT a bias column; a column of ones is
        prepended here.
    y : target values, one per row of X.

    Returns
    -------
    (w0, w) : the bias term and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X (bias column included) is singular.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    # w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    return weights[0], weights[1:]

In [None]:
# X already carries the bias column added with np.column_stack above, and
# train_linear_regression prepends its own. Pass only the feature columns:
# a duplicated column of ones would make X^T X singular.
train_linear_regression(X[:, 1:], y)

### Car Price Baseline Model

In [None]:
# List the column dtypes of the training frame.
df_train.dtypes

In [None]:
# Columns used as features by the baseline model.
base = [
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
    'popularity',
]

In [None]:
# Feature matrix for the baseline columns. Missing values are not handled
# here; the next cell rebuilds X_train with them filled.
X_train = df_train[base].values

In [None]:
# Same matrix with every missing value replaced by 0, so the linear algebra
# in training does not propagate NaN.
X_train = df_train[base].fillna(0).values

In [None]:
# Fit the baseline model on the log1p target. NOTE: rebinds the
# notebook-level w0 and w.
w0, w = train_linear_regression(X_train, y_train)

In [None]:
# Predictions for the training rows, on the same log1p scale as y_train.
y_pred = w0 + X_train.dot(w)

In [None]:
# Overlay predicted and actual log1p prices for the training set. The labels
# and legend say which colour is which; axis labels and a title let the
# figure stand on its own.
sns.histplot(y_pred, label='prediction', color='red', alpha=0.5, bins=50)
sns.histplot(y_train, label='target', color='blue', alpha=0.5, bins=50)
plt.xlabel('Log(Price + 1)')
plt.ylabel('Frequency')
plt.title('Predictions vs actual distribution (train)')
plt.legend()
plt.show()

### RMSE

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between targets `y` and predictions `y_pred`.

    Both arguments must support elementwise subtraction and the result must
    expose `.mean()` (e.g. NumPy arrays of equal shape).
    """
    residual = y - y_pred
    return np.sqrt((residual ** 2).mean())

In [None]:
# Training error of the baseline, measured on the log1p(price) scale.
rmse(y_train, y_pred)

### Validating the Model

In [None]:
def prepare_X(df):
    """Build the baseline feature matrix from `df`.

    Selects the columns listed in the notebook-level `base`, replaces missing
    values with 0 and returns the result as a NumPy array.
    """
    return df[base].fillna(0).values

In [None]:
# Train: fit the weights on the training split only.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation: the same feature preparation, applied to rows the model has
# not been fitted on.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

# Root Mean Squared Error on the log1p(price) scale.
rmse(y_val, y_pred)

### Simple Feature Engineering

In [None]:
# Preview the "age" feature: years between 2017 and each car's model year.
# Shown as cell output only. Binding it to `df` would replace the
# notebook-wide DataFrame with a Series and break any re-run of earlier cells.
2017 - df_train.year

In [None]:
def prepare_X(df):
    """Build the feature matrix from the `base` columns plus the car's age.

    `age` is computed as 2017 minus the `year` column on a copy, so the
    caller's frame is left unchanged. Missing values become 0 and the result
    is returned as a NumPy array with `age` as the last column.
    """
    enriched = df.assign(age=2017 - df.year)
    columns = base + ['age']

    return enriched[columns].fillna(0).values

In [None]:
# Train: refit with the current prepare_X, which adds the age feature.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation: prepare the held-out rows the same way and predict.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

# Root Mean Squared Error on the log1p(price) scale; compare with the
# baseline value to see the effect of the extra feature.
rmse(y_val, y_pred)

In [None]:
# Overlay predicted and actual log1p prices for the validation set. The
# labels and legend say which colour is which; axis labels and a title let
# the figure stand on its own.
sns.histplot(y_pred, label='prediction', color='red', alpha=0.5, bins=50)
sns.histplot(y_val, label='target', color='blue', alpha=0.5, bins=50)
plt.xlabel('Log(Price + 1)')
plt.ylabel('Frequency')
plt.title('Predictions vs actual distribution (validation)')
plt.legend()
plt.show()

### Categorical Variables

In [None]:
# One 0/1 indicator column per door count (2, 3, 4), written directly into
# df_train. Rows whose number_of_doors matches none of the values get 0 in
# all three columns.
for v in [2, 3, 4]:
    df_train['num_doors_%s' % v] = (df_train.number_of_doors == v).astype('int')

In [None]:
def prepare_X(df):
    """Build the feature matrix: `base` columns, age, and door indicators.

    Works on a copy of `df`, so the caller's frame is not modified.

    Parameters
    ----------
    df : DataFrame holding the `base` columns plus `year` and
        `number_of_doors`.

    Returns
    -------
    2-D NumPy array with columns in this order: the `base` features, `age`
    (2017 minus `year`), then `num_doors_2`, `num_doors_3`, `num_doors_4`.
    Missing values are replaced with 0.
    """
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df.year
    features.append('age')

    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        # Derive the indicator from the frame passed in, not from the global
        # df_train. Otherwise any other split has no such columns, and
        # selecting `features` below raises KeyError.
        df[feature] = (df.number_of_doors == v).astype('int')
        features.append(feature)

    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values

    return X

In [None]:
# Train: refit with the current prepare_X, which also adds the door-count
# indicator features.
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation: prepare the held-out rows the same way and predict.
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)

# Root Mean Squared Error on the log1p(price) scale.
rmse(y_val, y_pred)