## Linear Regression to predict the price of a car

In [1]:
# load the dependencies
# import numpy and pandas
import numpy as np
import pandas as pd
# import plotly
import plotly.express as px
from plotly.subplots import make_subplots
# import sklearn and related modules
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, accuracy_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.plotting.backend = 'plotly'

In [2]:
# load the data
_DATASET_FILENAME = "UsedCarData.csv"
# car types keys and values
_CAR_TYPES = {
      'SUV / Crossover': 1,
      'Sedan': 2,
      'Coupe': 3,
      'Hatchback': 4,
      'Pickup Truck': 5,
      'Minivan': 6,
      'Convertible': 7,
      'Wagon': 8,
      'Van': 9
    }

In [None]:
# Function to plot graphs from the loaded data
# plot average price of cars based on the number of owners
def bar(df, x_value, y_value, title, labels={}):
  bar = df.plot(kind='bar', x=x_value, y=y_value,
                           title=title, barmode='group', labels=labels)

  bar.update_layout(template="plotly_dark")
  bar.show()


# plot a scatter plot
def scatter(df, x_value, y_value, title_value):
  # trendline_optons controls level of smoothing, default is 0.666
  scatter = px.scatter(df, x=x_value, y=y_value,
                       trendline='lowess', trendline_options=dict(frac=0.1), title=title_value)
  scatter.update_layout(template="plotly_dark")
  scatter.show()


# plot a heatmap representing correlation between the variables
def heat(df):
  heat = px.imshow(df.corr(), title="Correlation of Data")
  heat.update_layout(template="plotly_dark")
  heat.show()

In [None]:
# Define a function for linear regression
def regression(df):
  model = LinearRegression()
  scaler = StandardScaler()

  df = df.dropna(axis=0, how='any')

  # take only the relevant features for price prediction
  X = df[['year', 'mileage', 'owner_count', 'is_new', 'daysonmarket', 'horsepower']]
  y = df['price']

  scaler.fit(X)
  standardized_data = scaler.transform(X) # standardize the features
  X = standardized_data

  # train test split
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
  model.fit(X_train, y_train)

  # predicted values
  y_pred = model.predict(X_test)

  # Base formula: Adj r2 = 1-(1-r2)*(n-1)/(n-p-1) where n is sample size and p is num of independent variables
  print(f'adjusted r^2 score: {1-(1-r2_score(y_test, y_pred))*((len(df.mileage)-1)/(len(df.mileage)-6-1))}')

In [None]:
# main
def main():
  # Try-catch to see if the data-file is present in colab
  try:
    # dropna() gets rid of None values
    car_data = pd.read_csv(_DATASET_FILENAME).dropna(axis=1, how='all')
  except FileNotFoundError:
    print("File not found.")
    return


  car_data = car_data.replace({'body_type' : _CAR_TYPES})
  print(car_data.describe(exclude=None).drop(index='count'))

  # Compute the average price for each num of owner
  avg_prices = []
  for num_owners in range(6):
    avg_prices.append(car_data.loc[car_data.owner_count == (num_owners + 1), 'price'].mean())
  bar(car_data, [1,2,3,4,5,6], avg_prices, "Average Price to Num. Owners", labels={'x' : 'Number of owners','y' : ' Average Price'})

  scatter(car_data, car_data.mileage, car_data.price, "Price to Mileage")
  scatter(car_data, car_data.horsepower, [car_data.city_fuel_economy, car_data.highway_fuel_economy], "Fuel economy to Horsepower")
  scatter(car_data, car_data.horsepower, car_data.engine_displacement, "Displacement to Horsepower")

  # Compute the average price for each car type
  avg_prices.clear()
  for body_type in range(len(_CAR_TYPES.values())):
    avg_prices.append(car_data.loc[car_data.body_type == (body_type + 1), 'price'].mean())
  bar(car_data, _CAR_TYPES.keys(), avg_prices, "Price to Car Type", labels={'y':' Average Price', 'x':'Car Type'})

  # Compute the average rating for each car type
  avg_rating = []
  for body_type in range(len(_CAR_TYPES.values())):
    avg_rating.append(car_data.loc[car_data.body_type == (body_type + 1), 'seller_rating'].mean())
  bar(car_data, _CAR_TYPES.keys(), avg_rating, "Rating to Car Type", labels={'y':' Average Rating', 'x':'Car Type'})

  #heat(car_data)
  regression(car_data)


if __name__ == "__main__":
  main()


      body_type  city_fuel_economy  daysonmarket    dealer_zip  \
mean   2.074297          20.699594     82.293000   8954.493000   
std    1.812991           5.780224    131.988037   3464.822597   
min    1.000000          12.000000      0.000000    922.000000   
25%    1.000000          17.000000     14.000000   7036.000000   
50%    1.000000          20.000000     33.000000  10466.000000   
75%    2.000000          23.500000     76.000000  11706.000000   
max    9.000000          55.000000   1252.000000  11797.000000   

      engine_displacement  highway_fuel_economy  horsepower   latitude  \
mean          2994.467213             27.782138  249.865779  38.594610   
std           1202.028253              5.738654   88.031844   6.882543   
min           1200.000000             14.000000   78.000000  18.346700   
25%           2000.000000             24.000000  180.000000  40.733300   
50%           2500.000000             27.000000  246.000000  40.758900   
75%           3600.000000  

adjusted r^2 score: -0.03559786030301049
