# Pipeline

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


class Pipeline:
    """Abstract template for an impute -> scale -> split -> classify pipeline.

    Every stage raises ``NotImplementedError`` here; concrete pipelines
    override the stages they support and drive them from ``process``.
    """

    def __init__(self):
        pass

    def _impute(self):
        """Return the feature matrix with missing values handled."""
        raise NotImplementedError()

    def _scale(self, x=None):
        """Return a scaled version of the feature matrix ``x``.

        ``x`` is optional only so that the historical zero-argument call
        keeps raising ``NotImplementedError`` rather than ``TypeError``.
        """
        raise NotImplementedError()

    def _classify(self):
        """Prepare the classifier used by ``process``."""
        raise NotImplementedError()

    def _split_data(self, x=None):
        """Return the features ``x`` and labels arranged for evaluation.

        ``x`` is optional for the same backward-compatibility reason as in
        ``_scale``.
        """
        raise NotImplementedError()

    def process(self):
        """Run the whole pipeline end to end."""
        raise NotImplementedError()


class FIREMAN_Pipeline(Pipeline):
    """Impute, scale and classify a dataset, then print an evaluation.

    Each stage is chosen by a string option:

    * ``imputer``: ``"Simple"`` (``SimpleImputer`` with default settings) or
      ``""`` (use the data unchanged). ``"GAIN"`` is a recognised name but is
      not implemented yet.
    * ``scaler``: ``"RandomScaler"`` (despite the name this applies
      ``StandardScaler``) or ``""`` (no scaling).
    * ``classifier``: ``"RandomForest"`` (``RandomForestClassifier`` with
      default settings).
    * ``scorer``: ``"report"`` (hold out 10% of the rows and print a
      ``classification_report``) or ``"cv_score"`` (print 10-fold
      cross-validated weighted F1 scores).

    Any other option value raises ``NotImplementedError``.

    NOTE(review): imputation and scaling are fitted on the full dataset
    before the train/test split or cross-validation, so held-out rows
    influence the preprocessing statistics.
    """

    def __init__(
        self,
        dataset_x,
        dataset_y,
        imputer="Simple",
        scaler="RandomScaler",
        classifier="RandomForest",
        scorer="report",
    ):
        super().__init__()
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y
        self.imputer = imputer
        self.scaler = scaler
        self.classifier = classifier
        self.scorer = scorer

    def _impute(self):
        """Return ``dataset_x`` with missing values handled per ``imputer``.

        Raises:
            NotImplementedError: for ``"GAIN"`` (not implemented yet) and for
                any unknown imputer name.
        """
        if self.imputer == "Simple":
            return SimpleImputer().fit_transform(self.dataset_x)

        if self.imputer == "":
            return self.dataset_x

        # "GAIN" used to fall through to an unbound local variable; until it
        # is implemented it is reported like any other unsupported option.
        raise NotImplementedError(f"imputer {self.imputer!r} is not supported")

    def _scale(self, x):
        """Return ``x`` scaled per ``scaler``.

        Raises:
            NotImplementedError: for an unknown scaler name.
        """
        if self.scaler == "RandomScaler":
            return StandardScaler().fit_transform(x)

        if self.scaler == "":
            return x

        raise NotImplementedError(f"scaler {self.scaler!r} is not supported")

    def _split_data(self, x):
        """Arrange features ``x`` and ``dataset_y`` for the chosen scorer.

        Returns:
            ``(x, y)`` for ``"cv_score"``; ``(x_train, x_test, y_train,
            y_test)`` from a 90/10 ``train_test_split`` for ``"report"``.

        Raises:
            NotImplementedError: for an unknown scorer name (previously this
                returned ``None`` silently).
        """
        if self.scorer == "cv_score":
            return x, self.dataset_y

        if self.scorer == "report":
            return train_test_split(x, self.dataset_y, test_size=0.1)

        raise NotImplementedError(f"scorer {self.scorer!r} is not supported")

    def _classify(self):
        """Make sure ``self.classifier`` holds an estimator object.

        The option string ``"RandomForest"`` is replaced by a new
        ``RandomForestClassifier``. An object that already exposes ``fit``
        (for example the estimator stored by an earlier ``process`` call) is
        kept, so ``process`` can be called more than once.

        Raises:
            NotImplementedError: for any other classifier value.
        """
        if isinstance(self.classifier, str):
            if self.classifier != "RandomForest":
                raise NotImplementedError(
                    f"classifier {self.classifier!r} is not supported"
                )
            self.classifier = RandomForestClassifier()
        elif not hasattr(self.classifier, "fit"):
            raise NotImplementedError(
                f"classifier {self.classifier!r} is not supported"
            )

    def process(self):
        """Run the pipeline and print the evaluation selected by ``scorer``.

        Returns:
            None. The report or the array of scores is printed.

        Raises:
            NotImplementedError: if any configured option is unsupported.
        """
        # Fail fast on configuration errors before touching the data.
        if self.scorer not in ("report", "cv_score"):
            raise NotImplementedError(f"scorer {self.scorer!r} is not supported")
        self._classify()

        x_scaled = self._scale(self._impute())

        if self.scorer == "report":
            x_train, x_test, y_train, y_test = self._split_data(x_scaled)
            self.classifier.fit(x_train, y_train)
            y_predicted = self.classifier.predict(x_test)
            print(classification_report(y_test, y_predicted))
        else:
            x, y = self._split_data(x_scaled)
            print(
                cross_val_score(self.classifier, x, y, cv=10, scoring="f1_weighted")
            )

In [31]:
# Load the TEP dataset and build a copy with values knocked out at random.
# `pd`, `np` and `binary_sampler` are not imported in this section; they are
# expected to come from earlier cells.
# NOTE(review): the file name ends in ".csv.csv" - confirm it matches the
# file on disk.
tep_dataset = pd.read_csv(
    "Tennessee_Event-Driven/datasets/tep_extended_dataset_simrun1.csv.csv",
    index_col=False,
)

# Features are every column except the label and the two bookkeeping
# columns; the label is the "faultNumber" column.
dataset_X = tep_dataset.drop(columns=["faultNumber", "simulationRun", "sample"]).values
dataset_Y = tep_dataset["faultNumber"].values

# no = number of rows, dim = number of feature columns.
no, dim = dataset_X.shape
# presumably the fraction of entries to turn into missing values - confirm
# against binary_sampler's definition.
p = 0.1
# Introduce missing data: every entry where the mask is 0 becomes NaN.
# assumes binary_sampler(prob, rows, cols) returns a (no, dim) 0/1 array
# with 1 at probability `prob` - TODO confirm.
mask = binary_sampler(1 - p, no, dim)
# Work on a copy so dataset_X keeps the complete data.
dataset_X_missing = dataset_X.copy()
dataset_X_missing[mask == 0] = np.nan

In [32]:
# Build a pipeline on the data with missing values, using the default
# options: imputer="Simple", scaler="RandomScaler",
# classifier="RandomForest", scorer="report".
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y)

In [33]:
# Impute, scale, hold out 10% of the rows, fit the classifier and print the
# classification report for the held-out rows.
tep_pipeline.process()

              precision    recall  f1-score   support

           0       0.32      0.60      0.42       161
           1       1.00      0.89      0.94       151
           2       1.00      0.89      0.94       124
           3       0.33      0.46      0.38       142
           4       0.84      0.75      0.79       141
           5       0.72      0.76      0.74       147
           6       1.00      0.88      0.93       130
           7       1.00      0.86      0.92       149
           8       0.99      0.93      0.96       145
           9       0.36      0.45      0.40       154
          10       0.77      0.69      0.73       147
          11       0.80      0.61      0.69       150
          12       0.91      0.82      0.86       141
          13       1.00      0.83      0.91       156
          14       0.97      0.79      0.87       166
          15       0.30      0.44      0.35       142
          16       0.76      0.65      0.70       160
          17       0.82    

### Example: an unsupported option raises `NotImplementedError`

In [35]:
# "Scaler" is not one of the scaler names that FIREMAN_Pipeline._scale
# recognises ("RandomScaler" or ""), so process() raises NotImplementedError.
tep_pipeline = FIREMAN_Pipeline(dataset_X_missing, dataset_Y, scaler="Scaler")
tep_pipeline.process()

NotImplementedError: 