In [72]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [76]:
from app.src.pandas_transform.pandas_col_transform import PandasColumnTransformer
from app.src.pandas_transform.impute_by_group import ImputeNumericalByGroup, ImputeCategoricalByGroup

In [2]:
%doctest_mode

Exception reporting mode: Plain
Doctest mode is: ON


Introductory example from here: [https://scikit-learn.org/stable/getting_started.html](https://scikit-learn.org/stable/getting_started.html)

In [3]:
clf = RandomForestClassifier(random_state=0)
X = [[ 1,  2,  3],  # 2 samples, 3 features
     [11, 12, 13]]
y = [0, 1]  # classes of each sample
clf.fit(X, y)
RandomForestClassifier(random_state=0)

The fit method generally accepts 2 inputs:
- The samples matrix (or design matrix) `X`. The size of `X` is typically (`n_samples`, `n_features`), which means that samples are represented as rows and features are represented as columns.
- The target values `y` which are real numbers for regression tasks, or integers for classification (or any other discrete set of values). `y` is usually 1d array where the `i`th entry corresponds to the target of the `i`th sample (row) of `X`.

In [4]:
X

[[1, 2, 3], [11, 12, 13]]

In [5]:
y

[0, 1]

In [6]:
clf.predict(X)  # predict classes of the training data

array([0, 1])

In [7]:
clf.predict([[4, 5, 6], [14, 15, 16]])  # predict classes of new data

array([0, 1])

In [8]:
X1 = [[0, 15], [1, -10]]

# scale data according to computed scaling values
StandardScaler().fit(X1).transform(X1)

array([[-1.,  1.],
       [ 1., -1.]])

In [9]:

# create a pipeline object
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# load the iris dataset and split it into train and test sets
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit the whole pipeline
pipe.fit(X_train, y_train)

# Don't seem to need the following - legacy?
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('logisticregression', LogisticRegression())])

# we can now use it like any other estimator
accuracy_score(pipe.predict(X_test), y_test)

0.9736842105263158

In [11]:
X, y = load_iris(return_X_y=True)

clf = LogisticRegression(random_state=0, max_iter=1000).fit(X, y)
clf.predict(X[:2, :])
clf.predict_proba(X[:2, :])
clf.score(X, y)

0.9733333333333334

In [14]:
len(y)

150

Try for Titanic dataset

In [17]:
import pandas as pd

In [32]:
def load_data():
    data_url = "https://raw.githubusercontent.com/Mjboothaus/titanic/main/data"
    titanic_training = pd.read_csv(f"{data_url}/train.csv")
    titanic_test= pd.read_csv(f"{data_url}/test.csv")
    return titanic_training, titanic_test

In [49]:
titanic_training, titanic_test = load_data()

In [50]:
drop_fields = ["PassengerId", "Name", "Ticket", "Fare", "Cabin", "Embarked"]

# Should also drop "Embarked" as it should have nothing to do with passenger survival or otherwise
# Probably also "Cabin" -- assume there is a relationship with "Pclass"

In [51]:
titanic_training.drop(drop_fields, axis=1, inplace=True)

In [52]:
titanic_training.head()

   Survived  Pclass     Sex   Age  SibSp  Parch
0         0       3    male  22.0      1      0
1         1       1  female  38.0      1      0
2         1       3  female  26.0      0      0
3         1       1  female  35.0      1      0
4         0       3    male  35.0      0      0

In [53]:
X = titanic_training[["Pclass", "Sex", "Age", "SibSp", "Parch"]]

In [54]:
y = titanic_training[["Survived"]]

In [88]:
X.isna().sum()

Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
dtype: int64

In [94]:
pipe = make_pipeline([
    ImputeNumericalByGroup(target_col="Age", groupby_col=["Pclass"], return_df=True),
    OneHotEncoder()]).fit(X)  #,
    # LogisticRegression(random_state=0, max_iter=1000)).fit(X, y)

    #, ImputeCategoricalByGroup(target_col="Embarked", groupby_col=["Pclass"], return_df=True, copy=False)


TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '[ImputeNumericalByGroup(target_col='Age', copy=True, groupby_col=['Pclass'], key_error_on_unseen=True, imputation_values=None, return_df=True), OneHotEncoder()]' (type <class 'list'>) doesn't

In [93]:
type(pipe)

<class 'sklearn.pipeline.Pipeline'>

In [86]:
X.head()

   Pclass  Sex   Age  SibSp  Parch
0       3    0  22.0      1      0
1       1    0  38.0      1      0
2       3    0  26.0      0      0
3       1    0  35.0      1      0
4       3    0  35.0      0      0

In [69]:
# X["Sex"] = X["Sex"].apply(lambda x: 1 if x == "male" else 0)

In [44]:
y.head()

   Survived
0         0
1         1
2         1
3         1
4         0

In [63]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X, y)
clf.predict(X)
clf.score(X, y)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values