In [None]:
from sklearn.random_projection import GaussianRandomProjection  # type: ignore
from sklearn.preprocessing import PolynomialFeatures  # type: ignore
from sklearn.ensemble import RandomForestClassifier  # type: ignore
from sklearn.feature_selection import RFECV  # type: ignore
from sklearn.decomposition import PCA  # type: ignore
from evaluator import ModelEvaluator
import sys, warnings

sys.path.append("../")
from datasets.setCreator import SetCreator
from datasets.setModifier import SetModifier

In [None]:
# Silence every warning for the rest of the kernel session so library
# warnings do not flood the cell outputs below.
warnings.simplefilter("ignore")

# Project helpers, instantiated once and shared by every later cell.
setModifier, setCreator = SetModifier(), SetCreator()

# Two collections of datasets. Later cells iterate over them and read each
# entry's "train" value (and, for dataset2, its "desc" value).
dataset1, dataset2 = setCreator.getSetList1(), setCreator.getSetList2()

PCA

In [None]:
# Fit a PCA on the training data of every entry in dataset1 and print the
# fitted components together with the share of variance each one explains.
for currData in dataset1:
    # No n_components is given, so PCA keeps every component it can compute;
    # the count printed below is therefore not a selection of "best" ones.
    pca = PCA()

    # Only the fitted attributes are used below, so fit() is enough: the
    # projected data that fit_transform() would return was being discarded.
    pca.fit(currData["train"])

    print(f"[SUCCESS] identified {len(pca.components_)} relevant components")
    print(pca.components_)
    print(pca.explained_variance_ratio_)

Gaussian Projection

In [None]:
# Project the training data of every entry in dataset2 onto 25 random
# Gaussian components and print the projected array.
for currData in dataset2:
    # Missing values are replaced with 0 and the filled frame is written back
    # into the dataset entry, so any later cell that reads currData["train"]
    # from dataset2 sees the filled data as well.
    train_filled = currData["train"].fillna(0)
    currData["train"] = train_filled

    # Fixed seed keeps the random projection matrix identical between runs.
    gaussian_rnd_proj = GaussianRandomProjection(n_components=25, random_state=0)
    X_reduced = gaussian_rnd_proj.fit_transform(train_filled)

    print(X_reduced)

### From the book

```
for currData in dataset1:
    trainningData = currData["train"]
    trainY = trainningData["ergot_present_in_q4"]
    trainX = setModifier.rmErgotPredictors(trainningData)

    poly = dict()
    X_poly = dict()

    for n in [2, 3, 4, 5]:
        poly[n] = PolynomialFeatures(n)
        X_poly[n] = poly[n].fit_transform(trainX)

        model = RandomForestClassifier(n_estimators=100,max_depth=5, n_jobs=4, random_state=2)
        rfecv = RFECV(estimator=model, n_jobs=1) # apply feature elimination/cross-validation to model
        best_feat = rfecv.fit(X_poly[2], trainY)
        X_support = X_poly[2][:, best_feat.support_] # X_support now automatically holds the best subset

        print(X_support.shape)  # tells you the best dimensions to use
        print(X_support)
```

Run recursive feature elimination with cross-validation (RFECV) exactly once on each dataset.

In [None]:
# Single RFECV pass per dataset: fit a random forest wrapped in recursive
# feature elimination with cross-validation, then list the columns it kept.
for currData in dataset1:
    trainningData = currData["train"]
    trainY = trainningData["ergot_present_in_q4"]  # target column
    # NOTE(review): presumably drops the ergot-related columns so the target
    # does not leak into the features -- confirm against SetModifier.
    trainX = setModifier.rmErgotPredictors(trainningData)

    try:
        model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
        # apply feature elimination/cross-validation to model
        rfecv = RFECV(estimator=model, n_jobs=1)
        best_feat = rfecv.fit(trainX, trainY)

        # support_ is a boolean mask over the columns of trainX; selecting
        # with it leaves only the features RFECV decided to keep.
        X_support = trainX.loc[:, best_feat.support_]

        print("[SUCCESS]")
        print(X_support.shape)  # tells you the best dimensions to use

        for col in X_support.columns.tolist():
            print(col)
    except Exception as e:
        # Stay best-effort (move on to the next dataset) but report the
        # failure instead of swallowing it silently.
        print(f"[FAILURE] feature elimination failed: {type(e).__name__}: {e}")

Run RFECV repeatedly on each dataset, feeding the selected features back in, to see when it stops removing features.

In [None]:
# Iterative RFECV per dataset: keep re-running feature elimination on the
# surviving columns until a pass no longer removes any feature, then list
# the columns that remain.
for currData in dataset2:
    trainningData = currData["train"]
    trainY = trainningData["ergot_present_in_q4"]  # target column
    # NOTE(review): presumably drops the ergot-related columns so the target
    # does not leak into the features -- confirm against SetModifier.
    trainX = setModifier.rmErgotFeatures(trainningData)

    # Controls the loop: stays True while the latest pass still shrank the
    # feature set, and flips to False once a pass keeps every column.
    reducable = True
    try:
        while reducable:
            model = RandomForestClassifier(
                n_estimators=100, max_depth=5, random_state=0
            )
            # apply feature elimination/cross-validation to model
            rfecv = RFECV(estimator=model, n_jobs=1)
            best_feat = rfecv.fit(trainX, trainY)

            # support_ is a boolean mask over the columns of trainX; selecting
            # with it leaves only the features RFECV decided to keep.
            X_support = trainX.loc[:, best_feat.support_]

            # Compare column counts explicitly. Comparing the whole shape
            # tuples only worked because the row counts happen to be equal.
            if X_support.shape[1] < trainX.shape[1]:
                # Continue from the reduced frame itself rather than
                # re-selecting by label, which would duplicate columns if
                # any labels repeat.
                trainX = X_support
            else:
                reducable = False

        print(f'[SUCCESS] reduced data in dataset: {currData["desc"]}')
        print(X_support.shape)  # tells you the best dimensions to use

        for col in X_support.columns.tolist():
            print(col)

        print()
    except Exception as e:
        # Stay best-effort (move on to the next dataset) but report the
        # failure instead of swallowing it silently.
        print(f"[FAILURE] feature reduction failed: {type(e).__name__}: {e}")