In [1]:
import pandas as pd
import wandb
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
wandb.init(name='load_tweet_dataset_1',
                 project='epfl_ml_project2', 
                 tags=['load_dataset'],
                 job_type='for_testing')
artifact = wandb.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
artifact_dir = artifact.download()
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
wandb.login()

True

In [4]:
# load it into a pandas dataframe
train_df = pd.read_json(artifact_dir + '/train.json', lines=True)
val_df = pd.read_json(artifact_dir + '/val.json', lines=True)
test_df = pd.read_json(artifact_dir + '/test.json', lines=True)

In [5]:
sweep_config = {
    'method': 'grid'
}

metric = {
    'name': 'val_accuracy',
    'goal': 'maximize'
}

parameters_dict = {
    'dataset': {
        'values': ['bow', 'tfidf']
    },
    'n_estimators': {
        'values': [1, 10, 50, 100, 250, 500]
    },
    'criterion': {
        'values': ['gini', 'entropy', 'log_loss']
    },
    'max_depth': {
        'values': [10, 20, 30, 50, 100]
    },
    'min_samples_leaf': {
        'values': [1, 2, 4, 10]
    }
}

sweep_config['metric'] = metric
sweep_config['parameters'] = parameters_dict

In [6]:
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'criterion': {'values': ['gini', 'entropy', 'log_loss']},
                'dataset': {'values': ['bow', 'tfidf']},
                'max_depth': {'values': [10, 20, 30, 50, 100]},
                'min_samples_leaf': {'values': [1, 2, 4, 10]},
                'n_estimators': {'values': [1, 10, 50, 100, 250, 500]}}}


In [7]:
sweep_id = wandb.sweep(sweep_config, project='epfl_ml_project2')

Create sweep with ID: gy700h45
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/gy700h45


In [8]:
bow = CountVectorizer()
tfidf = TfidfVectorizer()

In [9]:
train_sentences = train_df['sentence1'].tolist()
val_sentences = val_df['sentence1'].tolist()

In [10]:
y_train = train_df['label'].values
y_val = val_df['label'].values

In [11]:
X_train_bow = bow.fit_transform(train_sentences)
X_val_bow = bow.transform(val_sentences)

In [12]:
X_train_tfidf = tfidf.fit_transform(train_sentences)
X_val_tfidf = tfidf.transform(val_sentences)

In [13]:
def return_dataset(dataset):
    if dataset == 'bow':
        return X_train_bow, X_val_bow, y_train, y_val
    return X_train_tfidf, X_val_tfidf, y_train, y_val

In [14]:
def train(config=None):
    with wandb.init(config=config):
        config = wandb.config
        X_train, X_val, y_train, y_val = return_dataset(config.dataset)
        clf = RandomForestClassifier(n_estimators=config.n_estimators, criterion=config.criterion, max_depth=config.max_depth, n_jobs=-1, random_state=599131614)
        clf.fit(X_train, y_train)
        wandb.log({'train_accuracy': accuracy_score(y_train, clf.predict(X_train)), 'val_accuracy': accuracy_score(y_val, clf.predict(X_val))})

In [15]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: rf48ho0y with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.52558
val_accuracy,0.5271


[34m[1mwandb[0m: Agent Starting Run: rk59l2ov with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.62989
val_accuracy,0.6254


[34m[1mwandb[0m: Agent Starting Run: rcxhr4uy with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 50


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71464
val_accuracy,0.70305


[34m[1mwandb[0m: Agent Starting Run: q8d3b581 with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71988
val_accuracy,0.7095


[34m[1mwandb[0m: Agent Starting Run: rcfnvfuv with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 250


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72405
val_accuracy,0.71305


[34m[1mwandb[0m: Agent Starting Run: d49tslba with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	n_estimators: 500


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.72742
val_accuracy,0.71522


[34m[1mwandb[0m: Agent Starting Run: v6ucbog6 with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	n_estimators: 1


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.52558
val_accuracy,0.5271


[34m[1mwandb[0m: Agent Starting Run: mcipmyr2 with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	n_estimators: 10


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.62989
val_accuracy,0.6254


[34m[1mwandb[0m: Agent Starting Run: v17kfow1 with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	n_estimators: 50


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71464
val_accuracy,0.70305


[34m[1mwandb[0m: Agent Starting Run: ooqgg0jj with config:
[34m[1mwandb[0m: 	criterion: gini
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	n_estimators: 100


0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.71988
val_accuracy,0.7095
