### Practice Exercise (Data Preprocessing)

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [16]:
#loading the dataset

df = pd.read_csv('car_insurance.csv')
print(f'Shape of the Dataset:\n {df.shape}\n')
print(f'Dataset:\n {df.head(1)}\n')

print(f'Missing Values:\n {df.isnull().sum()}\n')

#feature and target variable
X = df.iloc[:,1:-1]
X = X.drop(['MaritalStatus','Education'],axis=1).values

y = df[['ClaimAmount']].values

#handling the missing values
int_col = [0,3,4,7,8,9,10]
str_col = [1,2,5,6,11]

impute_mean = SimpleImputer(missing_values=np.nan,strategy='mean')
X[:,int_col] = impute_mean.fit_transform(X[:,int_col])

y = impute_mean.fit_transform(y)

impute_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:,str_col] = impute_frequent.fit_transform(X[:,str_col])


Shape of the Dataset:
 (25, 16)

Dataset:
   PolicyID   Age Gender  ... DrivingExperience Region ClaimAmount
0     P001  35.0   Male  ...              10.0  Urban      2500.0

[1 rows x 16 columns]

Missing Values:
 PolicyID             0
Age                  4
Gender               3
MaritalStatus        0
Education            0
Occupation           0
Income               2
VehicleAge           3
VehicleType          0
FuelType             5
EngineSize           1
Mileage              1
PreviousClaims       1
DrivingExperience    2
Region               0
ClaimAmount          3
dtype: int64



In [None]:
#Encoding the categorial col
ct = ColumnTransformer(transformers=[('Encode',OneHotEncoder(),str_col)],remainder='passthrough')

X = np.array(ct.fit_transform(X))


In [18]:
#splitting the training and test set
X_train, X_test ,y_train,  y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [19]:
#Feature Scaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [20]:
#printing the final training and test sets
print(f'Scaled X_train set:\n {X_train} \n')
print(f'Scaled X_test set:\n {X_test}\n')
print(f'y_train set:\n {y_train}')

Scaled X_train set:
 [[-0.73379939  0.73379939 -0.22941573 -0.22941573 -0.22941573 -0.22941573
  -0.22941573 -0.22941573  0.         -0.22941573 -0.22941573 -0.33333333
  -0.22941573  0.         -0.33333333  0.          0.         -0.22941573
  -0.22941573  4.35889894 -0.22941573 -0.22941573 -0.22941573 -0.22941573
  -0.81649658 -0.57735027  1.52752523 -0.22941573  1.36277029 -1.36277029
  -0.65465367  1.52752523 -0.81649658  0.50139312 -0.51823102  0.08429617
  -0.30195986  0.00997132  0.04019339  0.67252914]
 [-0.73379939  0.73379939 -0.22941573 -0.22941573 -0.22941573 -0.22941573
  -0.22941573 -0.22941573  0.         -0.22941573 -0.22941573 -0.33333333
  -0.22941573  0.         -0.33333333  0.          0.         -0.22941573
  -0.22941573 -0.22941573 -0.22941573 -0.22941573  4.35889894 -0.22941573
   1.22474487 -0.57735027 -0.65465367 -0.22941573 -0.73379939  0.73379939
  -0.65465367 -0.65465367  1.22474487  0.15273366 -1.07169832 -0.53331931
  -0.99215381 -0.62819314 -0.76367448 -0