In [90]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf

In [91]:
df = pd.read_csv("train.csv")
df = df.drop(["ID", "Levy", "Model", "Manufacturer", "Category"], axis=1)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(axis=0)

In [92]:
label_encoder = LabelEncoder()
minmax_scaler = MinMaxScaler()

In [93]:
df

Unnamed: 0,Price,Prod. year,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,13328,2010,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,16621,2011,No,Petrol,3,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,8467,2006,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,3607,2011,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,11726,2014,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,1999,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,15681,2011,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,26108,2010,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,5331,2007,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4


In [94]:
def process_airbags_cylinders(x):
    if (x>=0 and x<=4):
        return 0
    elif (x>4 and x<=8):
        return 1
    elif (x>8 and x<=12):
        return 2
    elif (x>12 and x<=16):
        return 3
df["Airbags"] = df["Airbags"].apply(process_airbags_cylinders)
df["Cylinders"] = df["Cylinders"].apply(process_airbags_cylinders)

In [95]:
df["Color"] = label_encoder.fit_transform(df["Color"])

In [96]:
df["Wheel"] = label_encoder.fit_transform(df["Wheel"])

In [97]:
df["Doors"] = label_encoder.fit_transform(df["Doors"])

In [98]:
df["Drive wheels"] = label_encoder.fit_transform(df["Drive wheels"])

In [99]:
df["Gear box type"] = label_encoder.fit_transform(df["Gear box type"])

In [100]:
df["Fuel type"] = label_encoder.fit_transform(df["Fuel type"])

In [101]:
df["Leather interior"] = label_encoder.fit_transform(df["Leather interior"])

In [102]:
def process_milage(x):
    x = float("".join([i for i in x if i.isdigit() or i == "."]))
    x = np.log10(x)
    return x

df["Mileage"] = df["Mileage"].apply(process_milage)

In [103]:
def process_turbo(x):
    if "Turbo" in x:
        return 1
    else:
        return 0
df["Turbo"] = df["Engine volume"].apply(process_turbo)

In [104]:
def process_engine_volume(x):
    x = float("".join([i for i in x if i.isdigit() or i == "."]))
    if x <= 3:
        return 0
    else:
        return 1
df["Engine volume"] = df["Engine volume"].apply(process_engine_volume)

In [105]:
def process_prodyear(x):
    if x <= 1950:
        return 0
    elif 1951 < x <= 1975:
        return 1
    elif 1976 < x <= 2000:
        return 2
    elif 2001 < x <= 2010:        
        return 3
    elif 2011 < x <= 2015:
        return 4
    else:
        return 5

df["Prod. year"] = df["Prod. year"].apply(process_prodyear)

In [106]:
def process_price(x):
    return np.log10(x)

df["Price"] = df["Price"].apply(process_price)

In [107]:
# for col in ["Model", "Manufacturer", "Category"]:
#     x = label_encoder.fit_transform(df[col])
#     x = minmax_scaler.fit_transform(x.reshape(-1, 1))
#     df[col] = x

In [108]:
df

Unnamed: 0,Price,Prod. year,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Turbo
0,4.124765,3,1,2,1,5.269525,1,0,0,1,0,12,2,0
1,4.220657,5,0,5,0,5.283301,1,2,0,1,0,1,1,0
2,3.927730,3,0,5,0,5.301030,0,3,1,1,1,1,0,0
3,3.557146,5,1,2,0,5.227799,0,0,0,1,0,14,0,0
4,4.069150,4,1,5,0,4.963320,0,0,1,1,0,12,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,3.927730,2,1,0,0,5.477121,0,1,2,0,0,12,1,1
19233,4.195374,5,1,5,0,5.208441,0,2,1,1,0,11,1,0
19234,4.416774,3,1,1,0,5.065822,0,0,1,1,0,7,0,0
19235,3.726809,3,1,1,0,4.709762,0,0,1,1,0,1,0,0


In [109]:
# for col in df.columns:
#     if col in ["Price", "Mileage", "Model", "Manufacturer", "Category"]:
#         pass
#     else:
#         df[col] = minmax_scaler.fit_transform(np.array(df[col]).reshape(-1, 1))

In [110]:
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(axis=0)

In [111]:
x = np.array(df.drop("Price", axis=1))
y = np.array(df["Price"]).reshape(-1, 1)

In [112]:
def create_batches_x(array, batch_size):
    q, mod = divmod(len(array), batch_size)
    array = array[:-mod]
    array = array.reshape((q, batch_size, 13))
    return array

def create_batches_y(array, batch_size):
    q, mod = divmod(len(array), batch_size)
    array = array[:-mod]
    array = array.reshape((q, batch_size, 1))
    return array
x, y = create_batches_x(x, 32), create_batches_y(y, 32)

In [113]:
x.shape, y.shape

((578, 32, 13), (578, 32, 1))

In [114]:
np.save("x.npy", x)
np.save("y.npy", x)

In [115]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# x_train, x_test, y_train, y_test = torch.Tensor(x_train), torch.Tensor(x_test), torch.Tensor(y_train), torch.Tensor(y_test)

In [116]:
x_train.shape, y_train.shape

((462, 32, 13), (462, 32, 1))

In [117]:
count = np.isinf(x_train).sum()
print(count)

0


In [118]:
for col in df.columns:
    x = np.array(df[col])
    print(np.isinf(x).sum(), col)

0 Price
0 Prod. year
0 Leather interior
0 Fuel type
0 Engine volume
0 Mileage
0 Cylinders
0 Gear box type
0 Drive wheels
0 Doors
0 Wheel
0 Color
0 Airbags
0 Turbo
