In [1]:
import pandas as pd
import wandb
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the dataset artifact inside a short-lived W&B run. Using the run as a
# context manager (the same pattern `train` uses) guarantees the run is
# finished even when the artifact lookup or the download raises, instead of
# leaving a dangling run open in the kernel.
with wandb.init(name='load_tweet_dataset_1',
                project='epfl_ml_project2',
                tags=['load_dataset'],
                job_type='for_testing') as run:
    artifact = run.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
    # Local directory holding the downloaded files; read by the loading cell.
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
# Authenticate this kernel with Weights & Biases (returned True in the recorded run).
# NOTE(review): this cell runs after the first `wandb.init` above, whose output
# already reports a logged-in user; consider moving the login ahead of it.
wandb.login()

True

In [4]:
# Read the three JSON-lines files from the downloaded artifact directory into
# pandas dataframes, one per split (train / validation / test).
train_df, val_df, test_df = (
    pd.read_json(artifact_dir + file_suffix, lines=True)
    for file_suffix in ('/train.json', '/val.json', '/test.json')
)

In [5]:
# Objective reported by each run and the direction the sweep should optimise.
metric = {'name': 'val_accuracy', 'goal': 'maximize'}

# Search space: the feature representation and the classifier to train.
parameters_dict = {
    'dataset': {'values': ['bow', 'tfidf']},
    'model': {'values': ['Multinomial NB', 'Complement NB']},
}

# Full sweep definition using the 'grid' search method over the space above.
sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

# Maps each sweep 'model' value to the classifier class to instantiate.
# The keys must stay in sync with parameters_dict['model']['values'], because
# `train` looks the class up with `models[config.model]`.
models = {
    'Multinomial NB': MultinomialNB,
    'Complement NB': ComplementNB
}

In [6]:
# Pretty-print the assembled sweep configuration as a sanity check before it
# is registered with W&B in the next cell.
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'dataset': {'values': ['bow', 'tfidf']},
                'model': {'values': ['Multinomial NB', 'Complement NB']}}}


In [7]:
# Register the sweep under the project; the returned ID is what the agent
# cell and the best-run lookup use to refer to this sweep.
sweep_id = wandb.sweep(sweep=sweep_config, project='epfl_ml_project2')

Create sweep with ID: x22u24sn
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/x22u24sn


In [8]:
# One vectorizer per feature representation compared in the sweep:
# raw token counts (bag of words) and TF-IDF weights, both with default settings.
bow, tfidf = CountVectorizer(), TfidfVectorizer()

In [9]:
# Pull the raw text column of every split out as a plain Python list.
train_sentences, val_sentences, test_sentences = (
    frame['sentence1'].tolist() for frame in (train_df, val_df, test_df)
)

In [10]:
# Target labels of every split, taken from the 'label' column via `.values`.
y_train, y_val, y_test = (
    frame['label'].values for frame in (train_df, val_df, test_df)
)

In [11]:
# Fit the bag-of-words vectorizer on the training sentences only, then reuse
# the fitted vectorizer to encode the validation and test sentences.
X_train_bow = bow.fit_transform(train_sentences)
X_val_bow, X_test_bow = (
    bow.transform(sentences) for sentences in (val_sentences, test_sentences)
)

In [12]:
# Fit the TF-IDF vectorizer on the training sentences only, then reuse the
# fitted vectorizer to encode the validation and test sentences.
X_train_tfidf = tfidf.fit_transform(train_sentences)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(sentences) for sentences in (val_sentences, test_sentences)
)

In [13]:
def return_dataset(dataset):
    """Return the pre-computed feature matrices and labels for one representation.

    Args:
        dataset: Name of the feature representation, either 'bow' or 'tfidf'.

    Returns:
        A 6-tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)`` built
        from the module-level arrays for the requested representation.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names. An
            unknown name previously fell through to the TF-IDF data, which
            would silently train on the wrong features after a typo.
    """
    if dataset == 'bow':
        return X_train_bow, X_val_bow, X_test_bow, y_train, y_val, y_test
    if dataset == 'tfidf':
        return X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'bow' or 'tfidf'"
    )

In [14]:
def train(config=None):
    """Run one sweep trial: fit the selected classifier and log its accuracies.

    Args:
        config: Optional configuration forwarded to ``wandb.init``. The values
            actually used for the trial are read back from ``wandb.config``.
    """
    with wandb.init(config=config):
        run_config = wandb.config

        # Only the train and validation splits are used here; the test
        # entries of the tuple are discarded.
        features_train, features_val, _, labels_train, labels_val, _ = return_dataset(run_config.dataset)

        # Look up the classifier class by its sweep name and build it with defaults.
        classifier = models[run_config.model]()
        classifier.fit(features_train, labels_train)

        train_accuracy = accuracy_score(labels_train, classifier.predict(features_train))
        val_accuracy = accuracy_score(labels_val, classifier.predict(features_val))
        wandb.log({'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy})

In [15]:
# Start a sweep agent in this kernel; it calls `train` once for every
# configuration the sweep hands out.
wandb.agent(sweep_id=sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: 6v5liqcu with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Multinomial NB


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.77197
val_accuracy,0.72942


[34m[1mwandb[0m: Agent Starting Run: iez9dg0b with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	model: Complement NB


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.7722
val_accuracy,0.72952


[34m[1mwandb[0m: Agent Starting Run: jiqr3iir with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Multinomial NB


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81671
val_accuracy,0.75958


[34m[1mwandb[0m: Agent Starting Run: m9tiquc3 with config:
[34m[1mwandb[0m: 	dataset: tfidf
[34m[1mwandb[0m: 	model: Complement NB


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.81737
val_accuracy,0.7603


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [16]:
# Look the finished sweep up through the W&B public API.
# NOTE(review): the entity ('cr7_reunited-cr7') is hard-coded in the path; it
# has to match the account the sweep was created under — confirm before
# re-running as a different user.
api = wandb.Api()
sweep = api.sweep(f'cr7_reunited-cr7/epfl_ml_project2/sweeps/{sweep_id}')

# Get the best run's parameters, ranking runs by validation accuracy
# (the recorded output shows the sort key as -summary_metrics.val_accuracy).
# NOTE(review): if `best_run` can be None (e.g. a sweep with no runs), the
# `.config` access would fail — confirm and guard if needed.
best_run = sweep.best_run(order='val_accuracy')
best_parameters = best_run.config
print(best_parameters)

[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_accuracy


{'dataset': 'tfidf', 'model': 'Complement NB'}
