In [1]:
import numpy as np
import pandas as pd
import wandb
import pprint
import helper
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Preparing dataset: fetch the tweet dataset artifact from W&B into a local directory.
# The run is opened as a context manager so that it is always finished, even when
# the artifact lookup or the download raises.
with wandb.init(name='load_tweet_dataset_1',
                project='epfl_ml_project2',
                tags=['load_dataset'],
                job_type='for_testing',
                anonymous='allow') as run:
    artifact = run.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
    # Local directory of the downloaded artifact; the JSON splits are read from it later
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
# Log in to Weights & Biases; the value returned by the call is shown as the cell output
wandb.login()

True

In [4]:
# Read each JSON-lines split of the downloaded artifact into its own pandas dataframe
train_df, val_df, test_df = (
    pd.read_json(artifact_dir + split_file, lines=True)
    for split_file in ('/train.json', '/val.json', '/test.json')
)

In [5]:
# Initializing hyperparameter grid

# Quantity the sweep tracks: validation accuracy, higher is better
metric = {'name': 'val_accuracy', 'goal': 'maximize'}

# Axes of the grid. 74552 matches the length of the Random Forest
# feature-importance vector printed later in this notebook.
parameters_dict = {
    'dataset': {'values': ['bow', 'tfidf']},
    'num_features': {'values': [100, 500, 1000, 5000, 10000, 20000, 50000, 74552]},
    'model': {'values': ['Multinomial NB', 'Complement NB']},
}

# Grid search over the combinations of the values above
sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

# Maps the 'model' value of a run's config to the estimator class to instantiate
models = {
    'Multinomial NB': MultinomialNB,
    'Complement NB': ComplementNB,
}

In [6]:
# Show the assembled sweep configuration before it is registered with W&B
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'dataset': {'values': ['bow', 'tfidf']},
                'model': {'values': ['Multinomial NB', 'Complement NB']},
                'num_features': {'values': [100,
                                            500,
                                            1000,
                                            5000,
                                            10000,
                                            20000,
                                            50000,
                                            74552]}}}


In [7]:
# Create the sweep in the 'epfl_ml_project2' project; the returned id is passed to the agent
# and to the best-run lookups further down
sweep_id = wandb.sweep(sweep_config, project='epfl_ml_project2')

Create sweep with ID: 273xocwo
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/273xocwo


In [8]:
# Preprocessing dataset. `datasets` is a dictionary with keys 'bow' and 'tfidf';
# each value contains that representation's train, validation, and test datasets.
# Later cells unpack a value into six items (X train/val/test, then y train/val/test).
datasets = helper.prepare_datasets(train_df, val_df, test_df)

In [9]:
# Getting optimal Random Forest's hyperparameters.
# The result is indexed below by 'dataset', 'n_estimators', 'criterion',
# 'max_depth' and 'min_samples_leaf'.
# NOTE(review): judging by the wandb log line this cell emits, the helper looks up
# past W&B runs ranked by val_accuracy — confirm in helper.get_optimal_RF_params.
best_RF_params = helper.get_optimal_RF_params()

[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_accuracy


In [10]:
# Training optimal random forest on the dataset variant named in its best hyperparameters
X_train, _, X_test, y_train, _, y_test = datasets[best_RF_params['dataset']]
# Forward only the tuned hyperparameters; parallelism and the seed are fixed here
rf_kwargs = {
    name: best_RF_params[name]
    for name in ('n_estimators', 'criterion', 'max_depth', 'min_samples_leaf')
}
clf = RandomForestClassifier(**rf_kwargs, n_jobs=-1, random_state=599131614)
clf.fit(X_train, y_train)

In [11]:
# Generating feature importances from optimal random forest
importances = clf.feature_importances_
print(importances.shape)
# argsort is ascending, so reverse it to list the most important feature index first
indices = importances.argsort()[::-1]

(74552,)


In [12]:
def train(config=None):
    """
    Run a single sweep trial: fit the configured model and log its accuracies to wandb.

    The boilerplate differs a little between models, so it has to be adapted by hand for each one.

    Params:
    config: forwarded to `wandb.init`; the values actually used are read back from `wandb.config`
            (`dataset`, `num_features`, `model`)
    """
    with wandb.init(config=config):
        run_cfg = wandb.config
        # Only the train and validation splits are used during the sweep
        features = helper.get_features(*datasets[run_cfg.dataset], run_cfg.num_features, indices)
        X_tr, X_va, _, y_tr, y_va, _ = features
        model = models[run_cfg.model]()
        model.fit(X_tr, y_tr)
        train_acc = accuracy_score(y_tr, model.predict(X_tr))
        val_acc = accuracy_score(y_va, model.predict(X_va))
        wandb.log({'train_accuracy': train_acc, 'val_accuracy': val_acc})

In [13]:
# Start a sweep agent that calls `train` for each configuration handed out by the sweep
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: nvml8dsi with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.70016
val_accuracy,0.68945


[34m[1mwandb[0m: Agent Starting Run: nr1x2sq4 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 500


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.74103
val_accuracy,0.7328


[34m[1mwandb[0m: Agent Starting Run: iycq56ai with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 1000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.74958
val_accuracy,0.73882


[34m[1mwandb[0m: Agent Starting Run: n4701489 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 5000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.75656
val_accuracy,0.74055


[34m[1mwandb[0m: Agent Starting Run: 02w8jdjo with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 10000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.76089
val_accuracy,0.73923


[34m[1mwandb[0m: Agent Starting Run: 9fapqv0n with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 20000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.76687
val_accuracy,0.737


[34m[1mwandb[0m: Agent Starting Run: j96hzjs4 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 50000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77757
val_accuracy,0.7345


[34m[1mwandb[0m: Agent Starting Run: bgd9be1z with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 74552


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77197
val_accuracy,0.72942


[34m[1mwandb[0m: Agent Starting Run: olpsx9va with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.69973
val_accuracy,0.68837


[34m[1mwandb[0m: Agent Starting Run: p9mf5dh1 with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 500


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7409
val_accuracy,0.73303


[34m[1mwandb[0m: Agent Starting Run: rfp1yh7x with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 1000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.74971
val_accuracy,0.73872


[34m[1mwandb[0m: Agent Starting Run: egl6h3hu with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 5000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.75668
val_accuracy,0.74065


[34m[1mwandb[0m: Agent Starting Run: 5f8i2haa with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 10000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.76104
val_accuracy,0.73957


[34m[1mwandb[0m: Agent Starting Run: 80wqovbb with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 20000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.76705
val_accuracy,0.73695


[34m[1mwandb[0m: Agent Starting Run: 78qoluth with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 50000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77782
val_accuracy,0.73472


[34m[1mwandb[0m: Agent Starting Run: en3givyx with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 74552


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7722
val_accuracy,0.72952


[34m[1mwandb[0m: Agent Starting Run: fuluwdlj with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7037
val_accuracy,0.69552


[34m[1mwandb[0m: Agent Starting Run: e7nb3jf0 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 500


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.75766
val_accuracy,0.74867


[34m[1mwandb[0m: Agent Starting Run: d93kwag9 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 1000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77246
val_accuracy,0.7601


[34m[1mwandb[0m: Agent Starting Run: 9xpnvpdz with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 5000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7874
val_accuracy,0.76653


[34m[1mwandb[0m: Agent Starting Run: 00li3wpa with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 10000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79627
val_accuracy,0.76582


[34m[1mwandb[0m: Agent Starting Run: 83el5b9n with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 20000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.80636
val_accuracy,0.76372


[34m[1mwandb[0m: Agent Starting Run: k7vayr22 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 50000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82058
val_accuracy,0.7622


[34m[1mwandb[0m: Agent Starting Run: 7k7ai7ie with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB
[34m[1mwandb[0m: 	num_features: 74552


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81671
val_accuracy,0.75958


[34m[1mwandb[0m: Agent Starting Run: jsetmuxh with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.70056
val_accuracy,0.69268


[34m[1mwandb[0m: Agent Starting Run: ulnub9we with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 500


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.75697
val_accuracy,0.74733


[34m[1mwandb[0m: Agent Starting Run: 960zzlnk with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 1000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77193
val_accuracy,0.7596


[34m[1mwandb[0m: Agent Starting Run: 3wpzfgxt with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 5000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.78749
val_accuracy,0.76695


[34m[1mwandb[0m: Agent Starting Run: 9u1m6mhh with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 10000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.79627
val_accuracy,0.76663


[34m[1mwandb[0m: Agent Starting Run: xe9m1fsb with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 20000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.8069
val_accuracy,0.76372


[34m[1mwandb[0m: Agent Starting Run: vkektymm with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 50000


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.82124
val_accuracy,0.76258


[34m[1mwandb[0m: Agent Starting Run: 8ps7ei92 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB
[34m[1mwandb[0m: 	num_features: 74552


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81737
val_accuracy,0.7603


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [14]:
def train_test(config):
    """
    Train the model described by `config` and print its train/val/test scores.

    Boilerplate for each model differs a little bit, hence needs to be manually configured for each model.

    Params:
    config: a dictionary containing optimal hyperparameters; the keys read here are
            'dataset', 'num_features' and 'model'

    Prints train, validation and test accuracy plus the test f1-score; returns nothing.
    """
    X_train, X_val, X_test, y_train, y_val, y_test = helper.get_features(
        *datasets[config['dataset']],
        config['num_features'],
        indices
    )
    clf = models[config['model']]()
    clf.fit(X_train, y_train)
    # Predict the test split once and reuse the result for both test metrics
    test_pred = clf.predict(X_test)
    print(f'train accuracy: {accuracy_score(y_train, clf.predict(X_train))}')
    print(f'val accuracy: {accuracy_score(y_val, clf.predict(X_val))}')
    print(f'test accuracy: {accuracy_score(y_test, test_pred)}')
    print(f'test f1-score: {f1_score(y_test, test_pred)}')

In [15]:
# Training and testing optimal Complement NB for TF-IDF dataset
conditions = dict(dataset='tfidf', model='Complement NB')

# Fetch the sweep run selected for these conditions, show its config, then evaluate it
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)
print(best_params)
train_test(best_params)

{'dataset': 'tfidf', 'num_features': 5000, 'model': 'Complement NB'}
train accuracy: 0.7874916666666667
val accuracy: 0.76695
test accuracy: 0.769725
test f1-score: 0.7809668751337598


In [16]:
# Training and testing optimal Multinomial NB for TF-IDF dataset
conditions = dict(dataset='tfidf', model='Multinomial NB')

# Fetch the sweep run selected for these conditions, show its config, then evaluate it
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)
print(best_params)
train_test(best_params)

{'dataset': 'tfidf', 'num_features': 5000, 'model': 'Multinomial NB'}
train accuracy: 0.7874
val accuracy: 0.766525
test accuracy: 0.76915
test f1-score: 0.7812470387567516


In [17]:
# Training and testing optimal Complement NB for BoW dataset
conditions = dict(dataset='bow', model='Complement NB')

# Fetch the sweep run selected for these conditions, show its config, then evaluate it
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)
print(best_params)
train_test(best_params)

{'dataset': 'bow', 'num_features': 5000, 'model': 'Complement NB'}
train accuracy: 0.7566833333333334
val accuracy: 0.74065
test accuracy: 0.74075
test f1-score: 0.7683406308640872


In [18]:
# Training and testing optimal Multinomial NB for BoW dataset
conditions = dict(dataset='bow', model='Multinomial NB')

# Fetch the sweep run selected for these conditions, show its config, then evaluate it
best_params = helper.get_best_run_by_conditions(sweep_id, conditions)
print(best_params)
train_test(best_params)

{'model': 'Multinomial NB', 'dataset': 'bow', 'num_features': 5000}
train accuracy: 0.7565583333333333
val accuracy: 0.74055
test accuracy: 0.74085
test f1-score: 0.7686367288634942
