# Posttest 4 Data-Preprocessing

## Import Library

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Data Splitting

In [43]:
# Read the CSV into the frame used by the splitting and scaling sections.
df_split = pd.read_csv(filepath_or_buffer="datasets_rusak.csv")

In [44]:
# Features: every column except the last one; target: the last column.
X, y = df_split.iloc[:, :-1], df_split.iloc[:, -1]

In [45]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [46]:
# Print a header followed by the shape of each train/test partition.
print("Data Split", "===========================", sep="\n")
for caption, part in (
    ("Dimensi X_train:", X_train),
    ("Dimensi X_test:", X_test),
    ("Dimensi y_train:", y_train),
    ("Dimensi y_test:", y_test),
):
    print(caption, part.shape)

Data Split
Dimensi X_train: (1627, 10)
Dimensi X_test: (698, 10)
Dimensi y_train: (1627,)
Dimensi y_test: (698,)


## Data Transforming

### Normalisasi

#### Import modul Normalisasi

In [47]:
from sklearn.preprocessing import MinMaxScaler

In [48]:
# Work on an independent copy so df_split itself stays untouched.
df_to_normal = df_split.copy(deep=True)
df_to_normal.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
0,0,80.0,nissan,march,2003,80000,1240,at,2wd,rhd,gasoline
1,1,110.0,nissan,march,2010,53000,1200,at,2wd,rhd,gasoline
2,2,165.0,nissan,lafesta,2005,47690,2000,at,2wd,rhd,gasoline
3,3,,toyota,avensis,2008,130661,1990,at,2wd,rhd,gasoline
4,4,,daihatsu,mira,2006,66300,660,at,2wd,rhd,gasoline


In [49]:
# Min-max scaler with its default target range spelled out.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [50]:
# Fit the scaler on the 'mileage' column and rescale it in one step; the
# list selector keeps the input two-dimensional.
X_norm = min_max_scaler.fit_transform(df_to_normal.loc[:, ['mileage']])

In [51]:
# Wrap the scaled array in a DataFrame (default integer row/column labels).
df_normal = pd.DataFrame(data=X_norm)

In [52]:
# Preview the first five normalised values.
df_normal.head(n=5)

Unnamed: 0,0
0,0.098985
1,0.064721
2,0.057982
3,0.163275
4,0.081599


### Standardisasi

#### Import modul Standardisasi

In [53]:
from sklearn.preprocessing import StandardScaler

In [54]:
# Independent copy of the loaded frame for the standardisation demo.
df_to_standar = df_split.copy(deep=True)
df_to_standar.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
0,0,80.0,nissan,march,2003,80000,1240,at,2wd,rhd,gasoline
1,1,110.0,nissan,march,2010,53000,1200,at,2wd,rhd,gasoline
2,2,165.0,nissan,lafesta,2005,47690,2000,at,2wd,rhd,gasoline
3,3,,toyota,avensis,2008,130661,1990,at,2wd,rhd,gasoline
4,4,,daihatsu,mira,2006,66300,660,at,2wd,rhd,gasoline


In [55]:
# Standard scaler with its default centring and scaling flags spelled out.
standar_scaler = StandardScaler(with_mean=True, with_std=True)

In [56]:
print("Sebelum di standarisasi")
# Restrict std() to numeric columns: the frame also holds text columns,
# which older pandas dropped with a FutureWarning and newer pandas rejects
# with a TypeError unless numeric_only is set.
print("Standar Deviasi :"+str(df_to_standar.std(numeric_only=True)))

Sebelum di standarisasi
Standar Deviasi :Unnamed: 0           676.220523
price                288.250772
year                   3.708029
mileage            52502.081591
engine_capacity      550.716359
dtype: float64


  print("Standar Deviasi :"+str(df_to_standar.std()))


In [57]:
# Fit the scaler on the four numeric columns and transform them in one step.
# Missing 'price' values come back as NaN, as the printed result below shows.
X_standar = standar_scaler.fit_transform(
    df_to_standar.loc[:, ['price', 'year', 'mileage', 'engine_capacity']]
)

In [58]:
# Wrap the standardised array in a DataFrame (default integer labels).
df_standar = pd.DataFrame(data=X_standar)

In [59]:
# Show a preview of the standardised data, then each column's standard
# deviation on its own lines.
print("Setelah distandarisasi", df_standar.head(), sep="\n")
print("Standar Deviasi :", "\n" + str(df_standar.std()))

Setelah distandarisasi
          0         1         2         3
0 -3.093558 -0.806677 -0.378459 -0.480315
1 -2.989460  1.081525 -0.892835 -0.552964
2 -2.798612 -0.267191 -0.993996  0.900002
3       NaN  0.542039  0.586681  0.881840
4       NaN  0.002552 -0.639457 -1.533716
Standar Deviasi : 
0    1.000216
1    1.000215
2    1.000215
3    1.000215
dtype: float64


## Data Cleaning

In [60]:
# Reload the CSV for the cleaning section and count missing values per column.
df_rusak = pd.read_csv(filepath_or_buffer='datasets_rusak.csv')
df_rusak.isnull().sum()

Unnamed: 0         0
price              6
mark               0
model              0
year               0
mileage            0
engine_capacity    0
transmission       0
drive              0
hand_drive         0
fuel               0
dtype: int64

In [61]:
# Count rows that repeat an earlier row exactly (first occurrence not counted).
df_rusak.duplicated(keep='first').sum()

7

### Menyubstitusi NULL dengan nilai Modus

In [62]:
from sklearn.impute import SimpleImputer

In [63]:
# Imputer that replaces NaN entries with the most frequent value of a column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [64]:
# Fit the imputer on 'price' and write the filled values back into the column.
df_rusak['price'] = imputer.fit_transform(df_rusak.loc[:, ['price']])

In [65]:
# Re-count missing values per column after imputation.
df_rusak.isnull().sum()

Unnamed: 0         0
price              0
mark               0
model              0
year               0
mileage            0
engine_capacity    0
transmission       0
drive              0
hand_drive         0
fuel               0
dtype: int64

### Menghapus Duplikat

In [66]:
# List the rows flagged as duplicates of an earlier row.
df_rusak.loc[df_rusak.duplicated()]

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
51,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
52,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
53,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
54,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
55,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
56,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline
57,52,350.0,daihatsu,mira,2012,51000,660,at,2wd,rhd,gasoline


In [67]:
# Remove repeated rows in place, keeping the first occurrence of each.
# The index is not reset, so the dropped rows leave gaps in the index labels.
df_rusak.drop_duplicates(keep='first', inplace=True)

In [68]:
# Check the result: select any rows still flagged as duplicates.
df_rusak.loc[df_rusak.duplicated()]

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel


## Mengganti tipe data

In [69]:
# Inspect the column dtypes before converting 'mileage'.
df_rusak.dtypes

Unnamed: 0           int64
price              float64
mark                object
model               object
year                 int64
mileage              int64
engine_capacity      int64
transmission        object
drive               object
hand_drive          object
fuel                object
dtype: object

In [70]:
# Convert 'mileage' from integer to 64-bit floating point.
df_rusak['mileage'] = df_rusak['mileage'].astype('float64')

In [71]:
# Inspect the column dtypes again to confirm the 'mileage' conversion.
df_rusak.dtypes

Unnamed: 0           int64
price              float64
mark                object
model               object
year                 int64
mileage            float64
engine_capacity      int64
transmission        object
drive               object
hand_drive          object
fuel                object
dtype: object

## One Hot Encoding

In [72]:
from sklearn.preprocessing import OneHotEncoder

In [73]:
# Ask for a dense array from the encoder. scikit-learn renamed the `sparse`
# argument to `sparse_output` (deprecated in 1.2, removed in 1.4), so try the
# new name first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

In [74]:
# Learn the categories of 'transmission' and one-hot encode the column.
encoded = encoder.fit_transform(df_rusak.loc[:, ['transmission']])

In [75]:
# Reuse df_rusak's index: duplicates were dropped earlier without resetting
# the index, so a default 0..n-1 index would pair rows with the wrong
# encoding (and leave NaN for some) when the frames are joined on index.
df_onehot = pd.DataFrame(encoded, index=df_rusak.index)

In [76]:
# Append the one-hot columns to df_rusak, matching rows by index label.
df_rusak = df_rusak.join(df_onehot, how='left')

In [77]:
# Preview the first five rows including the new one-hot columns.
df_rusak.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel,0,1,2
0,0,80.0,nissan,march,2003,80000.0,1240,at,2wd,rhd,gasoline,1.0,0.0,0.0
1,1,110.0,nissan,march,2010,53000.0,1200,at,2wd,rhd,gasoline,1.0,0.0,0.0
2,2,165.0,nissan,lafesta,2005,47690.0,2000,at,2wd,rhd,gasoline,1.0,0.0,0.0
3,3,850.0,toyota,avensis,2008,130661.0,1990,at,2wd,rhd,gasoline,1.0,0.0,0.0
4,4,850.0,daihatsu,mira,2006,66300.0,660,at,2wd,rhd,gasoline,1.0,0.0,0.0
