# Data Transformations

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Reusable preprocessing building blocks, written with their default settings
# spelled out: mean imputation, then zero-mean / unit-variance scaling.
imputer = SimpleImputer(strategy="mean")
standardizer = StandardScaler(with_mean=True, with_std=True)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the classes by oversampling the minority class with SMOTE.
# `sampling_strategy=1.0` asks for a 1:1 minority/majority ratio after
# resampling (a float is only valid for binary targets). It replaces the
# removed `ratio` argument, and `fit_resample` replaces the removed `fit_sample`.
# NOTE(review): assumes x_train / y_train come from an earlier train/test split;
# resample the training split only, never the test data.
sm = SMOTE(random_state=12, sampling_strategy=1.0)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

In [None]:
# sklearn pipelines: separate preprocessing per column type, then a classifier
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric columns: impute missing values (default strategy), then min-max scale.
num_pipe = make_pipeline(SimpleImputer(), MinMaxScaler())

# Categorical columns: impute with the most frequent value, then one-hot encode.
# `strategy` must be passed by keyword: SimpleImputer's first parameter is
# `missing_values`, so SimpleImputer("most_frequent") never selected the strategy.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

# Route each list of columns to its own pipeline.
# NOTE(review): "num_feats" / "cat_feats" look like placeholders — substitute the
# real numeric and categorical column names.
processor = make_column_transformer(
    (num_pipe, ["num_feats"]),
    (cat_pipe, ["cat_feats"]),
)

# Full model: preprocessing followed by logistic regression.
log_pipe = make_pipeline(processor, LogisticRegression())

In [None]:
# Show estimators and pipelines as an HTML diagram in notebook output
from sklearn import set_config

set_config(display="diagram")

# ...define pipeline...

# A bare name on the last line of the cell triggers the display.
# NOTE(review): `pipeline` must be defined where the placeholder above stands.
pipeline

## Categorical Encoding

In [None]:
# Fit the encoder on `df` only, then apply the already-fitted encoder to
# `df_test` so both share the same output columns.
# NOTE(review): presumably `df` is the training frame — confirm.
ohe = OneHotEncoder().fit(df)
X_test = ohe.transform(df_test)

### Ordinal encoding for ordered categories (e.g. worst < bad < good)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# `cats` is a list of the names of all categorical columns that need to be
# ordinally encoded.
# NOTE(review): with no explicit `categories=...`, the integer codes follow the
# sorted order of the values, which may not match the intended ranking — confirm.
oe = OrdinalEncoder()
cats_encoded = oe.fit_transform(diamonds[cats])
diamonds.loc[:, cats] = cats_encoded

## Date Transformations

In [None]:
# TODO: date-transformation example not written yet. The original cell was an
# incomplete assignment (a SyntaxError); this placeholder keeps the cell runnable.
dte = None

## String columns

In [None]:
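# Encoders for free-text / high-cardinality string columns
# (both are available in the skrub package, formerly dirty_cat):
#
# - GapEncoder     -> gets latent topic categories from strings
# - MinHashEncoder -> fast, stateless (for distributed computing), not interpretable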

In [None]:
# Drop duplicates
df = df.drop_duplicates()

# Drop column
df = df.drop(columns="column_name")

# Remove non-alphabetic text from column.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, so the character class would never match.
df['text_col'] = df['text_col'].str.replace(r'[^a-zA-Z]', '', regex=True)

# Replace nulls with empty strings
df = df.fillna('')

# Drop rows whose "name" is null (list form of `subset` works on all pandas versions).
# NOTE(review): after the fillna('') above no nulls remain, so this is a no-op
# as ordered; move it before fillna if rows with a missing name must be dropped.
df = df.dropna(subset=["name"])

# Renumber the rows 0..n-1 after the drops, discarding the old index
df = df.reset_index(drop=True)

### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize column "x" in place. The scaler needs 2-D input, hence the
# double brackets that select a one-column frame rather than a Series.
ss = StandardScaler()
x_scaled = ss.fit_transform(df[["x"]])
df["x"] = x_scaled

# Histogram of the standardized values
df["x"].hist()

### Log-like power transform (PowerTransformer, Yeo-Johnson by default)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Apply a power transform to "price" in place to reduce skew.
# NOTE(review): with default arguments PowerTransformer uses Yeo-Johnson and
# standardizes its output, so this is not a plain log — confirm that is intended.
pt = PowerTransformer()
price_transformed = pt.fit_transform(df[["price"]])
df["price"] = price_transformed

# Histogram of the transformed values
df["price"].hist()

### Data Imputations

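- KNNImputer - classic KNN applied to missing data. Each missing value is imputed by averaging that feature over the sample's n nearest neighbors (for categorical features the neighbors' mode is the analogous choice, but scikit-learn's KNNImputer itself only averages numeric values).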

- IterativeImputer - accepts any model as an estimator and models missing values as a function of the existing features. In other words, for each feature the rows where it is observed become the training data, and the rows where it is missing become the set to predict.

In [None]:
# IterativeImputer is still experimental: the enabling import below must run
# before IterativeImputer can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import IterativeImputer, KNNImputer

# --- Option 1: KNN imputation ---
# Copy the data so the original frame keeps its missing values
knn_imputed = diabetes.copy(deep=True)
# Init the transformer
imp = KNNImputer(n_neighbors=3)
# Fit/transform; assigning through .loc keeps the frame's index and column names
knn_imputed.loc[:, :] = imp.fit_transform(knn_imputed)

#  or ...

# --- Option 2: iterative (model-based) imputation ---
# Work on a separate copy, mirroring the KNN example (this frame was never
# defined before).
ii_imputed = diabetes.copy(deep=True)
# Init: each feature with missing values is modelled from the other features
imp = IterativeImputer(
    estimator=ExtraTreesRegressor(),
    max_iter=10,
)

# Transform
ii_imputed.loc[:, :] = imp.fit_transform(ii_imputed)

#### Checking the effectiveness of a data imputation technique by plotting KDE plots

The closer an imputed curve is to the original one, the more closely the imputed values follow the real distribution.

In [None]:
# Plot the original distribution (computed from the observed values only)
ax = sns.kdeplot(diabetes["SkinThickness"], label="Original Distribution")

# NOTE(review): `n_neighbors` is expected to be an iterable of candidate k
# values defined elsewhere, e.g. [2, 3, 5, 7] — confirm.
for k in n_neighbors:
    # n_neighbors must be passed by keyword; KNNImputer(k) does not set it.
    knn_imp = KNNImputer(n_neighbors=k)
    # Impute a fresh copy on every pass. Writing back into `diabetes` would
    # fill all gaps on the first iteration, leaving nothing to impute for later
    # k values and destroying the original data.
    imputed = diabetes.copy(deep=True)
    imputed.loc[:, :] = knn_imp.fit_transform(imputed)
    # Plot again once imputed
    sns.kdeplot(imputed["SkinThickness"], label=f"Imputed Dist with k={k}", ax=ax)

# The labels are only shown once a legend is drawn.
ax.legend()


## Quick dimensionality Reduction

In [None]:
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Three interchangeable ways to project the feature matrix X down to 2 dimensions.
pca = PCA(n_components=2)
tsne = TSNE(n_components=2)
manifold = umap.UMAP(n_components=2)

# Keep every embedding under its own name; assigning all three to the same
# variable discarded the PCA and t-SNE results as soon as they were computed.
X_pca = pca.fit_transform(X)
X_tsne = tsne.fit_transform(X)
X_umap = manifold.fit_transform(X)

# Backward-compatible alias: as before, X_transformed ends up holding the
# UMAP embedding.
X_transformed = X_umap

# Modelling

## Trees

In [None]:
from sklearn import datasets, tree
from sklearn.tree import DecisionTreeClassifier


# Load the iris data that the plot below takes its feature and class names
# from; `iris` was referenced but never defined, and the imported `datasets`
# module was otherwise unused.
iris = datasets.load_iris()
X, y = iris.data, iris.target

clf = DecisionTreeClassifier()
_ = clf.fit(X, y)

# Printing prediction path (text rendering of the fitted tree's rules)
text = tree.export_text(clf)
print(text)

# Visualizing tree
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in this cell —
# confirm it is imported earlier in the notebook.
fig = plt.figure(figsize=(15, 15))
_ = tree.plot_tree(
    clf,
    feature_names=iris.feature_names,
    # Plain list rather than a NumPy array, which some scikit-learn versions reject.
    class_names=list(iris.target_names),
    filled=True,
)

# Feature Importance

# Explainability

# Evaluation

In [None]:
from sklearn import metrics


# Look up the scorer object behind each scoring string. The original cell was
# a pasted REPL transcript whose output lines would have been executed as code.
# Reprs recorded in the original transcript (may differ between versions):
#   "roc_auc"   -> make_scorer(roc_auc_score, needs_threshold=True)
#   "precision" -> make_scorer(precision_score, average=binary)
#   "r2"        -> make_scorer(r2_score)
for scoring_name in ("roc_auc", "precision", "r2"):
    print(scoring_name, "->", metrics.get_scorer(scoring_name))