#  Data Preprocessing in Python

## Importing the Libraries

In [96]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

##  Importing the dataset

In [97]:
# Load the raw dataset from the CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="Data.csv")

## Transforming to Matrix

* Binomial category -> only two possible outputs -> binary logistic regression.
* Independent variable -> feature

In [98]:
# .iloc selects by integer position: iloc[row_selector, column_selector].
# A bare ":" keeps every row; ":-1" keeps every column except the last one,
# and "-1" picks only the last column.
# .values converts the pandas selection into a NumPy array.
y = data.iloc[:, -1].values   # target vector: the last column
X = data.iloc[:, :-1].values  # feature matrix: all remaining columns

In [99]:
# Display the raw feature matrix (it still contains NaN entries at this point).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Taking care of missing data

### Identifying missing data

In [100]:
# Count the missing (NaN) entries per column: isnull() yields a boolean frame
# and summing down the rows (axis=0) gives one count per column.
missing_data = data.isnull().sum(axis=0)
print(f"missing data: \n{missing_data}")

missing data: 
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [101]:
# SimpleImputer is a scikit-learn class; an instance learns one replacement
# value per column and then fills the missing entries with it.
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

# Columns 1 and 2 (Age and Salary) are the numeric ones; column 0 is Country.
numeric_block = X[:, 1:3]
imputer.fit(numeric_block)
X[:, 1:3] = imputer.transform(numeric_block)
# NOTE(review): the imputer is fitted on every row before the train/test split
# performed further down, so test-set rows contribute to the imputed means.


In [102]:
# Display the feature matrix after imputation to confirm the NaN entries are filled.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Transforming categorical features


### Encoding the independent variable

In [103]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [104]:
# One-hot encode column 0 (Country) and pass the remaining columns through
# unchanged; the encoded columns are placed before the passthrough ones.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), [0]),
    ],
    remainder="passthrough",
)

In [105]:
# Fit the encoder and apply it, then make sure the result is a NumPy array.
# NOTE(review): this overwrites X in place, so re-running the cell would try to
# encode column 0 of the already-encoded matrix; re-run from the top instead.
encoded_features = ct.fit_transform(X)
X = np.array(encoded_features)

In [106]:
# Display the feature matrix after one-hot encoding: three dummy columns come
# first, followed by the two numeric columns.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the dependent variable

A dichotomous variable (only yes/no) is easy to transform, because encoding its two values as 0 and 1 does not introduce any ordinal relationship.

In [107]:
from sklearn.preprocessing import LabelEncoder

# Replace the target's class labels with integer codes.
le = LabelEncoder()
encoded_target = le.fit_transform(y)
y = encoded_target

## Splitting the dataset into the training set and test set

In [108]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Feature Scaling

There are two kinds of feature scaling:
1. Standardization
   - $x' = \dfrac{x - \mathrm{mean}(col)}{\mathrm{sd}(col)}$
2. Normalization
   - $x' = \dfrac{x - \min(col)}{\max(col) - \min(col)}$

In [109]:
from sklearn.preprocessing import StandardScaler

# Standardization: centre each column on its mean and divide by its standard
# deviation (both options are the scaler's defaults, spelled out for clarity).
sc = StandardScaler(with_mean=True, with_std=True)

In [110]:
# Columns 0-2 are the one-hot dummies and are left as they are; only the
# numeric columns from index 3 onward are scaled. The scaler's statistics are
# learned from the training rows only.
train_numeric_scaled = sc.fit_transform(x_train[:, 3:])
x_train[:, 3:] = train_numeric_scaled

In [118]:
# Apply the scaler fitted on the training rows to the test rows (transform
# only, no refit), again touching only the numeric columns from index 3 onward.
# NOTE(review): this writes into x_test in place, so running the cell a second
# time scales already-scaled values; restart the kernel and run all cells once.
test_numeric_scaled = sc.transform(x_test[:, 3:])
x_test[:, 3:] = test_numeric_scaled

# Creating X and y for an unusual type of dataset


### Importing the necessary libraries

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

### Loading the dataset

In [113]:
# Load the second dataset; this rebinds `data`, replacing the earlier frame.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

### Identifying the categorical data

For independent variables, we always need to apply one-hot encoding to categorical features, even if they are dichotomous.

In [114]:
# Names of the columns that hold categories and therefore get one-hot encoded.
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
]

### Encoding the categorical features

In [115]:
# One-hot encode the categorical columns (selected by name) and pass every
# other column through unchanged, appended after the encoded ones.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), categorical_features),
    ],
    remainder="passthrough",
)

# Fit on the whole DataFrame and get back the transformed matrix.
dataset_tf = ct.fit_transform(data)


### Creating a matrix of the features

In [116]:
# Build the feature matrix by removing the target column ("Survived") by name
# rather than assuming it is the last column of the transformed matrix.
# ColumnTransformer emits the encoder's output first and appends the
# passthrough (remainder) columns on the right in their original order, so the
# target's position is computed from the DataFrame's column names. If
# "Survived" is not the last non-encoded column, slicing with [:, :-1] would
# drop a real feature and leave the target inside X.
passthrough_cols = [col for col in data.columns if col not in categorical_features]
target_idx = (
    dataset_tf.shape[1] - len(passthrough_cols) + passthrough_cols.index("Survived")
)
feature_idx = [i for i in range(dataset_tf.shape[1]) if i != target_idx]
X = dataset_tf[:, feature_idx]

### Creating a dependent variable vector with the encoding of the variable


In [117]:
# Encode the target straight from the named DataFrame column, so the labels do
# not depend on where "Survived" ends up inside dataset_tf.
le = LabelEncoder()
survived_labels = data["Survived"]
y = le.fit_transform(survived_labels)