# Import modules

In [1]:
import io
import boto3
import configparser

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Raise pandas' display limits (50 columns, 100 rows) before frames are shown.
pd.set_option(
    "display.max_columns", 50,
    "display.max_rows", 100,
)

# Load s3 file

In [3]:
def read_csv_from_s3(section_name, bucket_name, file_path):
    """Download a CSV object from S3 and parse it into a DataFrame.

    The AWS credentials are read from ``./config.ini``: the ``access_key``
    and ``secret_key`` options of the section named ``section_name``.
    Progress messages and the resulting frame shape are printed.

    Parameters
    ----------
    section_name : str
        Section of ``./config.ini`` that holds the credentials.
    bucket_name : str
        Name of the S3 bucket to read from.
    file_path : str
        Key of the CSV object inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV object.
    """
    config = configparser.ConfigParser()
    config.read('./config.ini')
    credentials = {
        "aws_access_key_id": config.get(section_name, "access_key"),
        "aws_secret_access_key": config.get(section_name, "secret_key"),
    }
    client = boto3.client('s3', **credentials)

    print("Get object")
    response = client.get_object(Bucket=bucket_name, Key=file_path)

    print("Read csv")
    # The whole object body is pulled into memory before parsing.
    raw_bytes = response["Body"].read()
    df = pd.read_csv(io.BytesIO(raw_bytes))

    print("Completed")
    print("Data size:", df.shape)

    return df

In [4]:
# config.ini section with the AWS keys, plus the bucket/key of the raw CSV.
section_name, bucket_name, file_path = (
    "aws_boto_credentials",
    "s3-test-ev",
    "raw/vehicles.csv",
)

df = read_csv_from_s3(
    section_name=section_name,
    bucket_name=bucket_name,
    file_path=file_path,
)

Get object
Read csv
Completed
Data size: (426880, 26)


# Preprocess

In [5]:
# Preprocess a deep copy so the frame loaded from S3 (`df`) stays untouched.
df_pre = df.copy(deep=True)

## 1. region

Group duplicate region names by stripping the trailing state suffix (", MO", ", AR"):  
* ('st louis, MO', 'st louis') -> 'st louis'
* ('fort smith', 'fort smith, AR') -> 'fort smith'
* ('kansas city', 'kansas city, MO') -> 'kansas city'

In [6]:
def str_replace(df, col, list_str, to_str=""):
    """Replace literal substrings in a string column and report the effect.

    Each entry of ``list_str`` is replaced by ``to_str`` in ``df[col]``, one
    after another. The number of unique values in the column is printed
    before and after, on a single line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten on this object (no copy
        is made), and the same object is returned.
    col : str
        Name of the string column to clean.
    list_str : iterable of str
        Substrings to replace. They are matched literally, never as regular
        expressions.
    to_str : str, default ""
        Replacement text; the default removes the substrings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` updated.
    """
    print(f"(before): {df[col].nunique()}", end=" -> ")
    for s in list_str:
        # regex=False pins literal matching: pandas < 2.0 defaulted to regex
        # matching, which would misread substrings containing ".", "*", "(", ...
        df[col] = df[col].str.replace(s, to_str, regex=False)

    print(f"(after): {df[col].nunique()}")

    return df

In [7]:
# Strip these state suffixes from the region names.
col = "region"
str_rplc = [", MO", ", AR"]

df_pre = str_replace(df_pre, col, list_str=str_rplc)

(before): 404 -> (after): 401


## 2. manufacturer

* merge "rover" into the existing "land rover" label
* TODO: check manufacturer "ram"

In [8]:
col = "manufacturer"

# Relabel "rover" rows as "land rover", printing the unique-label count
# before and after the change.
print(f"(before): {df_pre[col].nunique()}", end=" -> ")
is_rover = df_pre[col] == "rover"
df_pre.loc[is_rover, col] = "land rover"
print(f"(after): {df_pre[col].nunique()}")

(before): 42 -> (after): 41


## 3. VIN

In [9]:
col = "VIN"

# checked_VIN is 1 when the VIN is present and not made up solely of digits,
# and 0 when it is missing or purely numeric.
vin_all_digits = df_pre[col].apply(lambda vin: str(vin).isdigit())
vin_missing = df_pre[col].isna()
df_pre["checked_VIN"] = (~vin_all_digits & ~vin_missing) * 1

In [10]:
# Show how many rows fall under each value of the VIN flag, most frequent first.
df_pre["checked_VIN"].value_counts(sort=True, ascending=False)

1    265184
0    161696
Name: checked_VIN, dtype: int64

## 4. posting_date

In [11]:
col = "posting_date"

# Parse the timestamps once and reuse the result for both features; the
# original parsed the same column twice.
# utc=True converts every timestamp to UTC, so the year and day-of-year below
# are UTC calendar values.
posting_ts = pd.to_datetime(df_pre[col], utc=True)
df_pre["posting_year"] = posting_ts.dt.year
df_pre["posting_day"] = posting_ts.dt.dayofyear

## 5. Remove outliers

In [12]:
# Columns carried into the training set, in output order.
cols_to_train = [
    # columns taken from the loaded data
    "price",
    "region",
    "year",
    "manufacturer",
    "condition",
    "cylinders",
    "fuel",
    "odometer",
    "title_status",
    "transmission",
    "drive",
    "size",
    "type",
    "paint_color",
    "state",
    # columns derived during preprocessing
    "checked_VIN",
    "posting_year",
    "posting_day",
]

In [13]:
# Row filters for the training set: 500 <= price < 100,000 and
# 100 <= odometer < 300,000. Comparisons against NaN are False, so rows with
# a missing price or odometer are excluded as well.
# The masks are built from df_pre (the frame they are applied to), not the
# raw df, so they stay aligned even if preprocessing drops or reorders rows.
idx_prc = (df_pre.price >= 500) & (df_pre.price < 1e5)
idx_odo = (df_pre.odometer >= 100) & (df_pre.odometer < 3e5)

rows_to_train = idx_prc & idx_odo

In [14]:
# Keep only the filtered rows and the training columns; the deep copy makes
# df_train independent of df_pre.
df_train = df_pre.loc[rows_to_train, cols_to_train].copy(deep=True)
df_train.shape

(374526, 18)

In [15]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,price,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,checked_VIN,posting_year,posting_day
27,33590,auburn,2014.0,gmc,good,8 cylinders,gas,57923.0,clean,other,,,pickup,white,al,1,2021.0,124.0
28,22590,auburn,2010.0,chevrolet,good,8 cylinders,gas,71229.0,clean,other,,,pickup,blue,al,1,2021.0,124.0
29,39590,auburn,2020.0,chevrolet,good,8 cylinders,gas,19160.0,clean,other,,,pickup,red,al,1,2021.0,124.0
30,30990,auburn,2017.0,toyota,good,8 cylinders,gas,41124.0,clean,other,,,pickup,red,al,1,2021.0,124.0
31,15000,auburn,2013.0,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al,0,2021.0,123.0


# Upload data to s3

In [20]:
def write_csv_to_s3(section_name, bucket_name, file_path, data):
    """Serialize a DataFrame to CSV and upload it as an S3 object.

    The AWS credentials are read from ``./config.ini``: the ``access_key``
    and ``secret_key`` options of the section named ``section_name``.
    The CSV is written without the index column. Progress messages are
    printed; nothing is returned.

    Parameters
    ----------
    section_name : str
        Section of ``./config.ini`` that holds the credentials.
    bucket_name : str
        Name of the destination S3 bucket.
    file_path : str
        Key under which the CSV object is stored.
    data : pandas.DataFrame
        Frame to upload.
    """
    config = configparser.ConfigParser()
    config.read('./config.ini')
    credentials = {
        "aws_access_key_id": config.get(section_name, "access_key"),
        "aws_secret_access_key": config.get(section_name, "secret_key"),
    }
    client = boto3.client('s3', **credentials)

    print("Put object")
    # The full CSV text is built in memory and encoded to bytes before upload.
    payload = data.to_csv(index=False).encode()
    client.put_object(Bucket=bucket_name, Key=file_path, Body=payload)

    print("Completed")

In [21]:
# config.ini section with the AWS keys, plus the bucket/key for the training CSV.
section_name, bucket_name, file_path = (
    "aws_boto_credentials",
    "s3-test-ev",
    "train/220616_used_car_v1.csv",
)

write_csv_to_s3(
    section_name=section_name,
    bucket_name=bucket_name,
    file_path=file_path,
    data=df_train,
)

Put object
Completed
