#  Data Preprocessing in Python

## Importing the Libraries

In [59]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##  Importing the dataset

In [60]:
# Read the CSV file into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer="Data.csv")

## Transforming to Matrix

* Binomial (binary) category -> only two possible outcomes -> suited to binary logistic regression.
* Independent variable -> feature

In [61]:
# iloc selects by integer position: iloc[row_selector, column_selector].
# ":" keeps every row; ":-1" keeps every column except the last one,
# while "-1" on its own picks only the last column.
# The slice is a pandas object; .values converts it into a NumPy array.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [62]:
# Display the feature matrix built from every column except the last.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Taking care of missing data

### Identifying missing data

In [76]:
# Count the missing entries per column: isna() flags them, sum() totals each column.
missing_data = data.isna().sum()
print(f"missing data: \n{missing_data}")

missing data: 
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [64]:
# SimpleImputer is a scikit-learn class; this instance replaces NaN entries
# with the mean of their column.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Only columns 1 and 2 of X are imputed. fit_transform learns the column means
# and fills the gaps in a single call; the fitted imputer stays available afterwards.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])


In [65]:
# Display X again to inspect the result of the imputation step.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Transforming categorical features


### Encoding the independent variable

In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [67]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array. With the
# default threshold it may return a SciPy sparse matrix for sparser data, which
# np.array() would not turn into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

In [68]:
# Fit the column transformer on X and replace X with the transformed result
# as a NumPy array.
# NOTE: this cell overwrites X, so re-running it on its own applies the
# transformer to the already-transformed matrix; re-run the notebook from the
# top instead.
X = np.array(ct.fit_transform(X))

In [69]:
# Display X after the one-hot encoding step.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the dependent variable

A dichotomous variable (only yes/no) is easy to transform: it can be encoded as 0 and 1, because with just two classes the labels carry no ordinal meaning.

In [70]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct labels in y, then replace each label with its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)