# AutoML Experiments on Clean and Noisy Data

## AutoKeras

### Text Classification on IMDB dataset

#### Clean Data

In [62]:
import numpy as np
import pandas as pd

In [63]:
import autokeras as ak

In [64]:
# Load the clean IMDB reviews and binarise the label column:
# 'positive' -> 1, every other value -> 0.
imdb_df = pd.read_csv("../datasets/clean/imdb.csv")
imdb_df['sentiment'] = imdb_df['sentiment'].eq('positive').astype('int64')

In [65]:
# Order-based hold-out split. `.loc` slices by label and includes both
# endpoints, so (assuming the default integer index from read_csv) rows
# 0..25000 form the training set and rows 25001 onward the test set.
x_train, y_train = (np.array(imdb_df.loc[:25000, col]) for col in ('review', 'sentiment'))
x_test, y_test = (np.array(imdb_df.loc[25001:, col]) for col in ('review', 'sentiment'))

In [66]:
# Initialize the text classifier.
# max_trials=3 caps the architecture search at three candidate models;
# overwrite=True starts a fresh search instead of resuming one already on disk.
# seed=42 fixes the tuner's random seed so repeated runs are comparable
# (same value as the random_state used for TPOT in this notebook).
clf = ak.TextClassifier(overwrite=True, max_trials=3, seed=42)

In [67]:
# Run the model search on the training split, training for 2 epochs.
clf.fit(x=x_train, y=y_train, epochs=2)
# Label the held-out reviews with the best model the search produced.
predicted_y = clf.predict(x=x_test)

Trial 3 Complete [00h 00m 01s]

Best val_loss So Far: 0.3015161454677582
Total elapsed time: 00h 01m 12s




Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets




In [68]:
# Report the best model's metrics on the held-out split
# (presumably [loss, accuracy] -- confirm against the compiled metrics).
print(clf.evaluate(x=x_test, y=y_test))

[0.2813020646572113, 0.8854354023933411]


#### Noisy Data

In [69]:
# Load the noisy IMDB reviews and binarise the label column:
# 'positive' -> 1, every other value -> 0.
imdb_df = pd.read_csv("../datasets/noisy/imdb_noisy.csv")
imdb_df['sentiment'] = imdb_df['sentiment'].eq('positive').astype('int64')

In [70]:
# Order-based hold-out split. `.loc` slices by label and includes both
# endpoints, so (assuming the default integer index from read_csv) rows
# 0..25000 form the training set and rows 25001 onward the test set.
x_train, y_train = (np.array(imdb_df.loc[:25000, col]) for col in ('review', 'sentiment'))
x_test, y_test = (np.array(imdb_df.loc[25001:, col]) for col in ('review', 'sentiment'))

In [71]:
# Initialize the text classifier.
# max_trials=3 caps the architecture search at three candidate models;
# overwrite=True starts a fresh search instead of resuming one already on disk.
# seed=42 fixes the tuner's random seed so repeated runs are comparable
# (same value as the random_state used for TPOT in this notebook).
clf = ak.TextClassifier(overwrite=True, max_trials=3, seed=42)

In [72]:
# Run the model search on the training split, training for 2 epochs.
clf.fit(x=x_train, y=y_train, epochs=2)
# Label the held-out reviews with the best model the search produced.
predicted_y = clf.predict(x=x_test)

Trial 3 Complete [00h 00m 00s]

Best val_loss So Far: 0.42938950657844543
Total elapsed time: 00h 01m 11s




Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets




In [73]:
# Report the best model's metrics on the held-out split
# (presumably [loss, accuracy] -- confirm against the compiled metrics).
print(clf.evaluate(x=x_test, y=y_test))

[0.3968028724193573, 0.8225529193878174]


## TPOT

### Classification on the Iris dataset

#### Clean Data

In [7]:
from tpot import TPOTClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from tpot.export_utils import set_param_recursive

In [9]:
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
iris_df = pd.read_csv("../datasets/clean/iris.csv")
# Features are every column except 'target'; 25% of the rows are held out
# for testing, with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.drop(columns='target'),
    iris_df['target'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Pipeline search: 5 generations with a population of 50, seeded with
# random_state=42; verbosity=2 prints per-generation progress.
tpot = TPOTClassifier(
    random_state=42,
    verbosity=2,
    population_size=50,
    generations=5,
)
tpot.fit(X_train, y_train)
# Score of the best pipeline found, measured on the held-out split.
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 1.0

Generation 2 - Current best internal CV score: 1.0

Generation 3 - Current best internal CV score: 1.0

Generation 4 - Current best internal CV score: 1.0

Generation 5 - Current best internal CV score: 1.0

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=3, p=1, weights=distance)
1.0


In [11]:
# Write the best pipeline found by TPOT out as standalone Python source.
tpot.export(output_file_name='best_model_pipeline.py')

In [14]:
# Rebuild a TPOT-exported pipeline by hand, refit it, and predict on the test split.
# NOTE(review): this pipeline has a Normalizer step and quotes a CV score of ~0.983,
# while the TPOT log earlier in this notebook reports a bare KNeighborsClassifier
# with CV score 1.0 -- this looks pasted from a different run; confirm which
# pipeline is intended before comparing scores.
# Average CV score on the training set was: 0.9826086956521738
exported_pipeline = make_pipeline(
    Normalizer(norm="l2"),
    KNeighborsClassifier(n_neighbors=3, p=1, weights="distance")
)
# Fix random state for all the steps in exported pipeline
# (presumably a no-op here: neither step is expected to expose random_state -- TODO confirm)
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

# Refit on the training split, then predict labels for the held-out rows.
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)


In [15]:
# Score of the refitted pipeline on the held-out split.
exported_pipeline.score(X=X_test, y=y_test)

0.9736842105263158