v2 (add model, VIN features)

# Import modules

In [1]:
import io
import boto3
import configparser

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Raise pandas' display limits so wide and long frames are shown with less truncation.
for option_name, option_value in (
    ("display.max_columns", 50),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

# Load data from S3

In [3]:
def read_csv_from_s3(section_name, bucket_name, file_path):
    """Download a CSV object from S3 and load it into a DataFrame.

    AWS credentials are read from ``./config.ini``; the given section must
    define the options ``access_key`` and ``secret_key``.

    Args:
        section_name: Section of ``./config.ini`` that holds the credentials.
        bucket_name: Name of the S3 bucket to read from.
        file_path: Key of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed from the object's contents.

    Raises:
        FileNotFoundError: If ``./config.ini`` is missing or unreadable.
        configparser.Error: If the section or one of its options is missing.
    """
    config_path = './config.ini'
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed. Fail here with a clear message instead of a
    # misleading NoSectionError further down.
    if not parser.read(config_path):
        raise FileNotFoundError(
            f"Credentials file not found or unreadable: {config_path}")
    access_key = parser.get(section_name, "access_key")
    secret_key = parser.get(section_name, "secret_key")

    s3 = boto3.client('s3',
                      aws_access_key_id=access_key,
                      aws_secret_access_key=secret_key)
    print("Get object")
    obj = s3.get_object(Bucket=bucket_name, Key=file_path)

    print("Read csv")
    # The whole object body is read into memory before parsing.
    df = pd.read_csv(io.BytesIO(obj["Body"].read()))

    print("Completed")
    print("Data size:", df.shape)

    return df

In [4]:
# Config section holding the AWS keys, and the S3 location of the listings CSV.
section_name = "aws_boto_credentials"
bucket_name = "ev-depreciation-model"
file_path = "vehicles.csv"

# Fetch the CSV from S3 into a DataFrame.
df = read_csv_from_s3(
    section_name=section_name,
    bucket_name=bucket_name,
    file_path=file_path,
)

Get object
Read csv
Completed
Data size: (426880, 26)


# Preprocess

In [5]:
# Preprocess a copy so that the loaded frame `df` is not modified by the steps below.
df_pre = df.copy()

## 1. region
Merge region names that differ only by a trailing state suffix (the suffix is stripped):

* ('st louis, MO', 'st louis') -> st louis
* ('fort smith', 'fort smith, AR') -> fort smith
* ('kansas city', 'kansas city, MO') -> kansas city

In [6]:
def str_replace(df, col, list_str, to_str=""):
    """Replace literal substrings in a string column and report the effect.

    Prints the number of unique values in ``df[col]`` before and after the
    replacements.

    Args:
        df: DataFrame to modify. The column is overwritten in place on this
            object; the same object is returned for convenience.
        col: Name of the string column to clean.
        list_str: Iterable of substrings to replace, applied in order.
        to_str: Replacement text (default: empty string, i.e. removal).

    Returns:
        The same DataFrame ``df`` with ``df[col]`` updated.
    """
    print(f"(before): {df[col].nunique()}", end=" -> ")
    for s in list_str:
        # regex=False: treat `s` as plain text. The default of `regex` differs
        # between pandas versions, so without it characters such as "." or "("
        # could be interpreted as regular-expression syntax.
        df[col] = df[col].str.replace(s, to_str, regex=False)

    print(f"(after): {df[col].nunique()}")

    return df

In [7]:
# Strip the state suffixes so variants of the same region collapse into one value.
col = "region"
str_rplc = [", MO", ", AR"]

df_pre = str_replace(df_pre, col, list_str=str_rplc)

(before): 404 -> (after): 401


## 2. manufacturer

In [8]:
col = "manufacturer"

# Fold the "rover" label into "land rover" and report the unique-value counts.
n_unique_before = df_pre[col].nunique()
is_rover = df_pre[col] == "rover"
df_pre.loc[is_rover, col] = "land rover"
n_unique_after = df_pre[col].nunique()

print(f"(before): {n_unique_before}", end=" -> ")
print(f"(after): {n_unique_after}")

(before): 42 -> (after): 41


## 3. VIN

In [9]:
col = "VIN"

# Flag a VIN as checked (1) when it is present and not made up of digits only;
# missing or all-digit VINs get 0.
vin_is_missing = df_pre[col].isnull()
vin_is_all_digits = df_pre[col].apply(lambda vin: str(vin).isdigit())
# Unary ~ binds tighter than *, so the negated boolean mask is what gets
# multiplied by 1 to produce the 0/1 integer flag.
df_pre["checked_VIN"] = (~(vin_is_all_digits | vin_is_missing)) * 1
df_pre["checked_VIN"].value_counts()

1    265184
0    161696
Name: checked_VIN, dtype: int64

## 4. posting_date

In [10]:
col = "posting_date"

# Parse the timestamps once (converted to UTC) and derive both features from
# the same parsed series, instead of re-parsing the whole column per feature.
posting_ts = pd.to_datetime(df_pre[col], utc=True)
df_pre["posting_year"] = posting_ts.dt.year
df_pre["posting_day"] = posting_ts.dt.dayofyear

## 5. Remove outliers

In [11]:
# Display the column names currently in df_pre, including the features added above.
df_pre.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date', 'checked_VIN', 'posting_year', 'posting_day'],
      dtype='object')

In [12]:
# Display the `condition` column to inspect its values.
df_pre.condition

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
426875    good
426876    good
426877    good
426878    good
426879    good
Name: condition, Length: 426880, dtype: object

In [15]:
# Columns carried over into the training set, in output order.
cols_to_train = [
    # listing price and core vehicle attributes
    "price", "year", "manufacturer", "model",
    # usage and state of the vehicle
    "odometer", "condition", "title_status",
    # technical specification
    "cylinders", "fuel", "transmission", "drive",
    # body and appearance
    "size", "type", "paint_color",
    # location of the listing
    "region", "state",
    # VIN and the features derived during preprocessing
    "VIN", "checked_VIN", "posting_year", "posting_day",
]

In [16]:
# Keep listings whose price is in [500, 100000) and whose odometer reading is
# in [100, 300000). Comparisons against NaN evaluate to False, so rows with a
# missing price or odometer are dropped as well.
# The masks are built from `df_pre`, the frame they are later applied to,
# rather than from the raw `df`, so the filter cannot drift from the
# preprocessed data.
idx_prc = (df_pre["price"] >= 500) & (df_pre["price"] < 1e5)
idx_odo = (df_pre["odometer"] >= 100) & (df_pre["odometer"] < 3e5)

rows_to_train = idx_prc & idx_odo

In [17]:
# Select the rows that pass the filter and the training columns; .copy() makes
# df_train an independent frame rather than a selection of df_pre.
df_train = df_pre.loc[rows_to_train, cols_to_train].copy()
# Display (n_rows, n_columns) of the training frame.
df_train.shape

(374526, 20)

## Upload data to S3

In [18]:
def write_csv_to_s3(section_name, bucket_name, file_path, data):
    """Serialize a DataFrame to CSV and upload it as an S3 object.

    AWS credentials are read from ``./config.ini``; the given section must
    define the options ``access_key`` and ``secret_key``.

    Args:
        section_name: Section of ``./config.ini`` that holds the credentials.
        bucket_name: Name of the destination S3 bucket.
        file_path: Key under which the CSV object is stored.
        data: DataFrame to upload; written without its index.

    Returns:
        None.

    Raises:
        FileNotFoundError: If ``./config.ini`` is missing or unreadable.
        configparser.Error: If the section or one of its options is missing.
    """
    config_path = './config.ini'
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it parsed. Fail here with a clear message instead of a
    # misleading NoSectionError further down.
    if not parser.read(config_path):
        raise FileNotFoundError(
            f"Credentials file not found or unreadable: {config_path}")
    access_key = parser.get(section_name, "access_key")
    secret_key = parser.get(section_name, "secret_key")

    s3 = boto3.client('s3',
                      aws_access_key_id=access_key,
                      aws_secret_access_key=secret_key)

    print("Put object")
    # The full CSV text is built in memory and encoded to bytes before upload.
    s3.put_object(
        Body=data.to_csv(index=False).encode(),
        Bucket=bucket_name, Key=file_path)

    print("Completed")

In [19]:
# Config section holding the AWS keys, and the S3 destination of the training CSV.
section_name = "aws_boto_credentials"
bucket_name = "ev-depreciation-model"
file_path = "220701_used_car_v2.csv"

# Upload the filtered training frame.
write_csv_to_s3(
    section_name=section_name,
    bucket_name=bucket_name,
    file_path=file_path,
    data=df_train,
)

Put object
Completed
