In [1]:
import pandas as pd
import wandb
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [2]:
# Open a short-lived W&B run whose only purpose is to fetch the dataset
# artifact. Using the run as a context manager closes it when the block exits,
# including when use_artifact/download raises, so no run is left open.
with wandb.init(name='load_tweet_dataset_1',
                project='epfl_ml_project2',
                tags=['load_dataset'],
                job_type='for_testing') as run:
    artifact = run.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
    # Local directory the artifact files were downloaded into; later cells
    # read train/val/test JSON files from it.
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
# Log in to Weights & Biases for this session.
# NOTE(review): this cell runs after the cell that already calls wandb.init();
# consider moving the login ahead of it — confirm the intended order.
wandb.login()

True

In [4]:
# Read the three JSON-lines split files from the downloaded artifact directory,
# one DataFrame per split (train, validation, test).
train_df, val_df, test_df = (
    pd.read_json(artifact_dir + suffix, lines=True)
    for suffix in ('/train.json', '/val.json', '/test.json')
)

In [5]:
# Objective the sweep ranks runs by: the 'val_accuracy' value logged by each run.
metric = {'name': 'val_accuracy', 'goal': 'maximize'}

# Search space: every entry maps a hyperparameter name to the list of values
# the grid search enumerates for it.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in (
        ('dataset', ['bow', 'tfidf']),
        ('n_estimators', [10, 50, 100, 250, 500]),
        ('learning_rate', [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]),
        ('max_depth', [3, 6, 10, 15, None]),
        ('reg_lambda', [0, 0.1, 1, 10]),
        ('reg_alpha', [0, 0.1, 1, 10]),
        ('gamma', [0, 0.1, 0.2, 0.5, 1]),
    )
}

# Full sweep definition: exhaustive grid search over parameters_dict.
sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

In [6]:
# Pretty-print the assembled sweep configuration so it can be checked by eye
# before the sweep is created.
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'dataset': {'values': ['bow', 'tfidf']},
                'gamma': {'values': [0, 0.1, 0.2, 0.5, 1]},
                'learning_rate': {'values': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]},
                'max_depth': {'values': [3, 6, 10, 15, None]},
                'n_estimators': {'values': [10, 50, 100, 250, 500]},
                'reg_alpha': {'values': [0, 0.1, 1, 10]},
                'reg_lambda': {'values': [0, 0.1, 1, 10]}}}


In [7]:
# Create the sweep in the 'epfl_ml_project2' project from sweep_config and keep
# the returned ID; it is passed to the agent and used to look the sweep up later.
sweep_id = wandb.sweep(sweep_config, project='epfl_ml_project2')

Create sweep with ID: 746qhm43
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/746qhm43


In [8]:
# Two text featurizers with default settings: raw token counts (bag of words)
# and TF-IDF weighted counts.
bow, tfidf = CountVectorizer(), TfidfVectorizer()

In [9]:
# Pull the text column of each split out as a plain Python list for the vectorizers.
train_sentences, val_sentences, test_sentences = (
    frame['sentence1'].tolist() for frame in (train_df, val_df, test_df)
)

In [10]:
# Rewrite label -1 as 0 in every split (in place); all other labels are left as they are.
train_df.loc[train_df['label'].eq(-1), 'label'] = 0
val_df.loc[val_df['label'].eq(-1), 'label'] = 0
test_df.loc[test_df['label'].eq(-1), 'label'] = 0

In [11]:
# Target vectors: the (already remapped) label column of each split as an array.
y_train, y_val, y_test = (
    frame['label'].values for frame in (train_df, val_df, test_df)
)

In [12]:
# Fit the bag-of-words vectorizer on the training sentences only, then encode
# the validation and test sentences with that same fitted vectorizer.
X_train_bow = bow.fit_transform(train_sentences)
X_val_bow, X_test_bow = (
    bow.transform(texts) for texts in (val_sentences, test_sentences)
)

In [13]:
# Fit the TF-IDF vectorizer on the training sentences only, then encode the
# validation and test sentences with that same fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(train_sentences)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (val_sentences, test_sentences)
)

In [14]:
def return_dataset(dataset):
    """Return the feature matrices and labels for the named representation.

    Args:
        dataset: 'bow' for the CountVectorizer features or 'tfidf' for the
            TfidfVectorizer features.

    Returns:
        A 6-tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``; the
        label arrays are the same for both representations.

    Raises:
        ValueError: if ``dataset`` is neither 'bow' nor 'tfidf'. A misspelled
            name is reported instead of silently falling back to TF-IDF.
    """
    if dataset == 'bow':
        return X_train_bow, X_val_bow, X_test_bow, y_train, y_val, y_test
    if dataset == 'tfidf':
        return X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'bow' or 'tfidf'.")

In [15]:
def train(config=None):
    """Train and score one XGBoost classifier inside its own W&B run.

    Used as the function handed to the sweep agent. The run's configuration
    selects the feature representation and the XGBoost hyperparameters; the
    train and validation accuracies are logged to the run.

    Args:
        config: optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        # From here on, read hyperparameters from the active run's config.
        config = wandb.config
        features_train, features_val, _, labels_train, labels_val, _ = return_dataset(config.dataset)

        model = xgb.XGBClassifier(
            n_estimators=config.n_estimators,
            learning_rate=config.learning_rate,
            max_depth=config.max_depth,
            reg_lambda=config.reg_lambda,
            reg_alpha=config.reg_alpha,
            gamma=config.gamma,
            random_state=599131614,
            n_jobs=-1,
            device='cuda',
        )
        model.fit(features_train, labels_train)

        # Score on the data the model was fit on, then on the held-out validation split.
        train_accuracy = accuracy_score(labels_train, model.predict(features_train))
        val_accuracy = accuracy_score(labels_val, model.predict(features_val))
        wandb.log({'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy})

In [16]:
# Start an agent that calls train() once per configuration handed out by the sweep.
# NOTE(review): the grid in sweep_config spans 2*5*6*5*4*4*5 = 24,000
# combinations and no `count` is passed here — confirm a full run is intended.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: pr4zu6bj with config:
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	gamma: 0
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 10
[34m[1mwandb[0m: 	reg_alpha: 0
[34m[1mwandb[0m: 	reg_lambda: 0




0,1
train_accuracy,▁
val_accuracy,▁

0,1
train_accuracy,0.6486
val_accuracy,0.6437


In [17]:
# Look the sweep up through the W&B public API.
api = wandb.Api()
sweep = api.sweep(f'cr7_reunited-cr7/epfl_ml_project2/sweeps/{sweep_id}')

# Get best run parameters
best_run = sweep.best_run(order='val_accuracy')
if best_run is None:
    # best_run() gives back None when the sweep has no runs to rank; stop here
    # with a clear message rather than failing on `.config` below.
    raise RuntimeError(f'Sweep {sweep_id} has no runs; cannot select best parameters.')
best_params = best_run.config
print(best_params)

[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_accuracy


{'dataset': 'bow', 'max_depth': 3, 'reg_alpha': 0, 'reg_lambda': 0, 'n_estimators': 10, 'learning_rate': 0.001, 'gamma': 0}


In [18]:
# Refit a classifier with the best configuration found by the sweep and report
# accuracy on the training split and on the held-out test split.
X_train, _, X_test, y_train, _, y_test = return_dataset(best_params['dataset'])
clf = xgb.XGBClassifier(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    reg_lambda=best_params['reg_lambda'],
    reg_alpha=best_params['reg_alpha'],
    gamma=best_params['gamma'],
    random_state=599131614,
    n_jobs=-1,
    # Same device string as train() uses during the sweep, so the refit runs
    # under the same settings as the run it reproduces.
    device='cuda',
)
clf.fit(X_train, y_train)
print(accuracy_score(y_train, clf.predict(X_train)))
print(accuracy_score(y_test, clf.predict(X_test)))



0.6486
0.6444
