In [1]:
import numpy as np
import pandas as pd
import wandb
import pprint
import helper
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Preparing dataset
# Download the dataset artifact inside a context-managed run so the run is
# always finished, even if use_artifact() or download() raises.
with wandb.init(name='load_tweet_dataset_1',
                project='epfl_ml_project2',
                tags=['load_dataset'],
                job_type='for_testing',
                anonymous='allow') as run:
    artifact = run.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
    # Local directory holding the downloaded artifact files; read by the loading cell below.
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
# Authenticate this session with Weights & Biases.
# NOTE(review): this runs after the wandb.init() call in the previous cell — confirm whether it should precede the first W&B call.
wandb.login()

True

In [4]:
# Read each split of the downloaded artifact (JSON Lines files) into its own DataFrame
train_df, val_df, test_df = (
    pd.read_json(artifact_dir + split_file, lines=True)
    for split_file in ('/train.json', '/val.json', '/test.json')
)

In [5]:
# Grid sweep definition: the target metric is validation accuracy (to be
# maximized) and the grid spans feature representation, number of selected
# features, and the value passed as LogisticRegression's C.
metric = {'name': 'val_accuracy', 'goal': 'maximize'}

parameters_dict = {
    'dataset': {'values': ['bow', 'tfidf']},
    'num_features': {'values': [100, 500, 1000, 5000, 10000, 20000, 50000, 74552]},
    'regularizer_strength': {'values': [0.01, 0.1, 1, 10, 100]},
}

sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

In [6]:
# Show the assembled sweep configuration as a visual sanity check before it is registered with W&B
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'dataset': {'values': ['bow', 'tfidf']},
                'num_features': {'values': [100,
                                            500,
                                            1000,
                                            5000,
                                            10000,
                                            20000,
                                            50000,
                                            74552]},
                'regularizer_strength': {'values': [0.01, 0.1, 1, 10, 100]}}}


In [7]:
# Register the sweep with W&B; the returned id is used later by the agent and by the best-run lookups
sweep_id = wandb.sweep(sweep_config, project='epfl_ml_project2')

Create sweep with ID: ebsgyfi0
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/ebsgyfi0


In [8]:
# Preprocess the splits. `datasets` is a dictionary keyed by 'bow' and 'tfidf'; each value holds the
# train, validation, and test data for that representation and is unpacked downstream into six items
# (features for the three splits followed by labels for the three splits).
datasets = helper.prepare_datasets(train_df, val_df, test_df)

In [9]:
# Fetch the best Random Forest hyperparameters. The result is read below for the keys
# 'dataset', 'n_estimators', 'criterion', 'max_depth' and 'min_samples_leaf'.
# NOTE(review): presumably looked up from an earlier W&B sweep (the cell output shows runs
# being sorted by val_accuracy) — confirm in helper.
best_RF_params = helper.get_optimal_RF_params()

[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_accuracy


In [10]:
# Fit a random forest with the best hyperparameters found earlier, on the
# dataset variant those hyperparameters were selected for.
X_train, _, X_test, y_train, _, y_test = datasets[best_RF_params['dataset']]
clf = RandomForestClassifier(
    **{
        name: best_RF_params[name]
        for name in ('n_estimators', 'criterion', 'max_depth', 'min_samples_leaf')
    },
    n_jobs=-1,
    random_state=599131614,
)
clf.fit(X_train, y_train)

In [11]:
# Rank feature columns from most to least important according to the fitted forest
importances = clf.feature_importances_
print(importances.shape)
indices = np.flip(np.argsort(importances))

(74552,)


In [12]:
def train(config=None):
    """
    Train and evaluate one logistic-regression configuration of the W&B sweep.

    Meant to be handed to `wandb.agent`. The hyperparameters of the current run
    are read from `wandb.config` (`dataset`, `num_features`,
    `regularizer_strength`). The boilerplate differs a little for each model,
    so it has to be adapted manually per model.

    Params:
    config: optional default configuration forwarded to `wandb.init`

    Logs `train_accuracy` and `val_accuracy`, plus `n_iter` and `hit_max_iter`
    so that runs in which the solver used up all `max_iter` iterations (and
    scikit-learn emitted a convergence warning) can be identified afterwards.
    """
    with wandb.init(config=config):
        config = wandb.config
        X_train, X_val, _, y_train, y_val, _ = helper.get_features(*datasets[config.dataset], config.num_features, indices)
        # Note: scikit-learn's C is the INVERSE of the regularization strength,
        # so larger `regularizer_strength` values mean weaker regularization.
        clf = LogisticRegression(C=config.regularizer_strength, random_state=599131614, max_iter=1000)
        clf.fit(X_train, y_train)
        # n_iter_ is an array (one entry per fitted problem); take the worst case.
        n_iter = int(np.max(clf.n_iter_))
        wandb.log({
            'train_accuracy': accuracy_score(y_train, clf.predict(X_train)),
            'val_accuracy': accuracy_score(y_val, clf.predict(X_val)),
            'n_iter': n_iter,
            'hit_max_iter': n_iter >= clf.max_iter,
        })

In [13]:
# Launch a sweep agent that executes `train` for the configurations of the sweep registered as `sweep_id`
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: 3kxo9gif with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72286
val_accuracy,0.71293


[34m[1mwandb[0m: Agent Starting Run: fqxqmpz8 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72508
val_accuracy,0.71415


[34m[1mwandb[0m: Agent Starting Run: 0mp2aqkt with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72514
val_accuracy,0.7144


[34m[1mwandb[0m: Agent Starting Run: u3edeqzr with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 10


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72539
val_accuracy,0.7146


[34m[1mwandb[0m: Agent Starting Run: 8ucnrkdn with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72538
val_accuracy,0.71452


[34m[1mwandb[0m: Agent Starting Run: iiq6m3sn with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 0.01


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.76326
val_accuracy,0.7544


[34m[1mwandb[0m: Agent Starting Run: h6n8jcv3 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 0.1


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77277
val_accuracy,0.76472


[34m[1mwandb[0m: Agent Starting Run: qdeo47hy with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77396
val_accuracy,0.7657


[34m[1mwandb[0m: Agent Starting Run: dzlmtiru with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77428
val_accuracy,0.76562


[34m[1mwandb[0m: Agent Starting Run: bcklkjg9 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77433
val_accuracy,0.76567


[34m[1mwandb[0m: Agent Starting Run: 1111fngv with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77272
val_accuracy,0.76395


[34m[1mwandb[0m: Agent Starting Run: z0574bzb with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78878
val_accuracy,0.77925


[34m[1mwandb[0m: Agent Starting Run: 4va8nyow with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79204
val_accuracy,0.78058


[34m[1mwandb[0m: Agent Starting Run: ttuan3ts with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 10


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79252
val_accuracy,0.7811


[34m[1mwandb[0m: Agent Starting Run: sf7q96f6 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79253
val_accuracy,0.78117


[34m[1mwandb[0m: Agent Starting Run: bxuf1evt with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78197
val_accuracy,0.77075


[34m[1mwandb[0m: Agent Starting Run: catxwer8 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.8129
val_accuracy,0.79517


[34m[1mwandb[0m: Agent Starting Run: c2k1poui with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82567
val_accuracy,0.79778


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: sit327q2 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82744
val_accuracy,0.7963


[34m[1mwandb[0m: Agent Starting Run: iqzkq6do with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82781
val_accuracy,0.7956


[34m[1mwandb[0m: Agent Starting Run: u4o4baw3 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78285
val_accuracy,0.77112


[34m[1mwandb[0m: Agent Starting Run: i82gkfor with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81893
val_accuracy,0.79632


[34m[1mwandb[0m: Agent Starting Run: 18u4733l with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.8421
val_accuracy,0.7991


[34m[1mwandb[0m: Agent Starting Run: uny014la with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.84972
val_accuracy,0.7932


[34m[1mwandb[0m: Agent Starting Run: xn8wco40 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.84954
val_accuracy,0.79087


[34m[1mwandb[0m: Agent Starting Run: tesa9u3n with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78356
val_accuracy,0.7713


[34m[1mwandb[0m: Agent Starting Run: 1i54znqc with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82268
val_accuracy,0.79722


[34m[1mwandb[0m: Agent Starting Run: 4oqfm2q6 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.85802
val_accuracy,0.79985


[34m[1mwandb[0m: Agent Starting Run: 0vjp3thi with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.8817
val_accuracy,0.78958


[34m[1mwandb[0m: Agent Starting Run: cqn01o0h with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.88375
val_accuracy,0.78207


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9j70ky3y with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78418
val_accuracy,0.7714


[34m[1mwandb[0m: Agent Starting Run: 956n7ulq with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82668
val_accuracy,0.7981


[34m[1mwandb[0m: Agent Starting Run: am73ptwj with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.87527
val_accuracy,0.80092


[34m[1mwandb[0m: Agent Starting Run: qcz4ga9q with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.91381
val_accuracy,0.78995


[34m[1mwandb[0m: Agent Starting Run: a6j3vwd8 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.9202
val_accuracy,0.7756


[34m[1mwandb[0m: Agent Starting Run: ubgj3ag9 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78427
val_accuracy,0.77145


[34m[1mwandb[0m: Agent Starting Run: szi014ye with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82699
val_accuracy,0.79815


[34m[1mwandb[0m: Agent Starting Run: v8qwyr1e with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.87672
val_accuracy,0.80155


[34m[1mwandb[0m: Agent Starting Run: 31mp9la6 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.91528
val_accuracy,0.79012


[34m[1mwandb[0m: Agent Starting Run: 595mjr8q with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.92253
val_accuracy,0.77135


[34m[1mwandb[0m: Agent Starting Run: 163j7ecz with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.68925
val_accuracy,0.6859


[34m[1mwandb[0m: Agent Starting Run: s9mlyq0x with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71058
val_accuracy,0.70608


[34m[1mwandb[0m: Agent Starting Run: 0e1ux70c with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71735
val_accuracy,0.71132


[34m[1mwandb[0m: Agent Starting Run: akx9x6ar with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71903
val_accuracy,0.7121


[34m[1mwandb[0m: Agent Starting Run: 6phniaj4 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 100
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7193
val_accuracy,0.7121


[34m[1mwandb[0m: Agent Starting Run: tp861j0y with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71783
val_accuracy,0.71467


[34m[1mwandb[0m: Agent Starting Run: rn1x7h0h with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.75889
val_accuracy,0.75428


[34m[1mwandb[0m: Agent Starting Run: iwye462n with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77128
val_accuracy,0.76405


[34m[1mwandb[0m: Agent Starting Run: l2ap2ni0 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77292
val_accuracy,0.7659


[34m[1mwandb[0m: Agent Starting Run: a328wt8e with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 500
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77292
val_accuracy,0.76535


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0z6aqwo9 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72347
val_accuracy,0.71937


[34m[1mwandb[0m: Agent Starting Run: 1hcapcg3 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77143
val_accuracy,0.76387


[34m[1mwandb[0m: Agent Starting Run: ba6xcz3y with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79045
val_accuracy,0.7799


[34m[1mwandb[0m: Agent Starting Run: iv10yj4n with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79262
val_accuracy,0.78005


[34m[1mwandb[0m: Agent Starting Run: icda2s1c with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 1000
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79272
val_accuracy,0.77997


[34m[1mwandb[0m: Agent Starting Run: 4cdkonue with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7281
val_accuracy,0.72298


[34m[1mwandb[0m: Agent Starting Run: zyjzoeek with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78652
val_accuracy,0.77492


[34m[1mwandb[0m: Agent Starting Run: 8pg58vlt with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81977
val_accuracy,0.79815


[34m[1mwandb[0m: Agent Starting Run: cv8wzv68 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82693
val_accuracy,0.79775


[34m[1mwandb[0m: Agent Starting Run: 6v65lbwj with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 5000
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82765
val_accuracy,0.79615


[34m[1mwandb[0m: Agent Starting Run: hspr0z6p with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72869
val_accuracy,0.72313


[34m[1mwandb[0m: Agent Starting Run: m9qt07ks with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78928
val_accuracy,0.77635


[34m[1mwandb[0m: Agent Starting Run: tv69il34 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82978
val_accuracy,0.80022


[34m[1mwandb[0m: Agent Starting Run: hrokc9gz with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.84718
val_accuracy,0.79822


[34m[1mwandb[0m: Agent Starting Run: ghhfday7 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 10000
[34m[1mwandb[0m: 	regularizer_strength: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.85004
val_accuracy,0.79255


[34m[1mwandb[0m: Agent Starting Run: iuq5zj71 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72908
val_accuracy,0.72315


[34m[1mwandb[0m: Agent Starting Run: 60cu9cb6 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79111
val_accuracy,0.77653


[34m[1mwandb[0m: Agent Starting Run: oeg0yzu1 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.83824
val_accuracy,0.80188


[34m[1mwandb[0m: Agent Starting Run: y8dl1jx3 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.87196
val_accuracy,0.79783


[34m[1mwandb[0m: Agent Starting Run: o8d7188j with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 20000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.88305
val_accuracy,0.78687


[34m[1mwandb[0m: Agent Starting Run: 64726uce with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72943
val_accuracy,0.72313


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8076sq46 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79294
val_accuracy,0.77672


[34m[1mwandb[0m: Agent Starting Run: v64sgnq1 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.84667
val_accuracy,0.80273


[34m[1mwandb[0m: Agent Starting Run: gksh1nra with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 10


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.89779
val_accuracy,0.79895


[34m[1mwandb[0m: Agent Starting Run: zxttxhzh with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 50000
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.91781
val_accuracy,0.78253


[34m[1mwandb[0m: Agent Starting Run: ar5sbtlw with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 0.01


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72954
val_accuracy,0.72328


[34m[1mwandb[0m: Agent Starting Run: 84aoi0no with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 0.1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79312
val_accuracy,0.77692


[34m[1mwandb[0m: Agent Starting Run: y7scdzei with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.84728
val_accuracy,0.80315


[34m[1mwandb[0m: Agent Starting Run: 1von7rzc with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.89959
val_accuracy,0.79945


[34m[1mwandb[0m: Agent Starting Run: 38aocwj2 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	num_features: 74552
[34m[1mwandb[0m: 	regularizer_strength: 100


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.92053
val_accuracy,0.78143


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [16]:
def train_test(config):
    """
    Train a logistic-regression model with the given hyperparameters and print
    its train/validation/test accuracy and its test F1-score.

    The boilerplate differs a little for each model, so it has to be adapted
    manually per model.

    Params:
    config: a dictionary containing optimal hyperparameters; the keys read are
            'dataset', 'num_features' and 'regularizer_strength'
    """
    X_train, X_val, X_test, y_train, y_val, y_test = helper.get_features(
        *datasets[config['dataset']],
        config['num_features'],
        indices
    )
    # Note: scikit-learn's C is the INVERSE of the regularization strength.
    clf = LogisticRegression(C=config['regularizer_strength'], random_state=599131614, max_iter=1000)
    clf.fit(X_train, y_train)
    print(f'train accuracy: {accuracy_score(y_train, clf.predict(X_train))}')
    print(f'val accuracy: {accuracy_score(y_val, clf.predict(X_val))}')
    # Predict on the test split once; both test metrics reuse the same predictions.
    test_predictions = clf.predict(X_test)
    print(f'test accuracy: {accuracy_score(y_test, test_predictions)}')
    print(f'test f1-score: {f1_score(y_test, test_predictions)}')

In [17]:
# Look up the best sweep configuration restricted to the TF-IDF dataset,
# then retrain it and report train/val/test metrics.
conditions = {'dataset': 'tfidf'}
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)

print(best_params)
train_test(best_params)

{'regularizer_strength': 1, 'dataset': 'tfidf', 'num_features': 74552}
train accuracy: 0.847275
val accuracy: 0.80315
test accuracy: 0.806825
test f1-score: 0.8086665841277703


In [18]:
# Look up the best sweep configuration restricted to the BoW dataset,
# then retrain it and report train/val/test metrics.
conditions = {'dataset': 'bow'}
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)

print(best_params)
train_test(best_params)

{'dataset': 'bow', 'num_features': 74552, 'regularizer_strength': 1}
train accuracy: 0.8767166666666667
val accuracy: 0.80155
test accuracy: 0.80655
test f1-score: 0.8116083166966939
