In [1]:
import numpy as np
import pandas as pd
import wandb
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the versioned tweet dataset artifact inside a short-lived W&B run.
# The run is opened as a context manager so it is always finished, even when
# resolving or downloading the artifact raises; the previous init()/finish()
# pair left the run open on failure.
with wandb.init(name='load_tweet_dataset_1',
                project='epfl_ml_project2',
                tags=['load_dataset'],
                job_type='for_testing') as run:
    # Declare the artifact as an input of this run, then download its files.
    artifact = run.use_artifact('hsunyu/epfl_ml_project2/twitter_dataset_1:v0')
    # Local directory holding the downloaded files; read by the loading cell.
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcr7_reunited[0m ([33mcr7_reunited-cr7[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [3]:
# Authenticate with Weights & Biases; the call's return value is shown as the
# cell output.
# NOTE(review): this runs after the cell that already calls wandb.init —
# consider moving the login above it so a fresh kernel authenticates first.
wandb.login()

True

In [4]:
# Read the three dataset splits from the downloaded artifact directory.
# Each file is parsed with lines=True, i.e. one JSON record per line.
train_df, val_df, test_df = (
    pd.read_json(artifact_dir + split_file, lines=True)
    for split_file in ('/train.json', '/val.json', '/test.json')
)

In [5]:
# Objective the sweep optimises: the 'val_accuracy' value logged by train().
metric = dict(name='val_accuracy', goal='maximize')

# Candidate values per hyperparameter. With the 'grid' method every
# combination is a separate run.
# NOTE(review): train() only forwards gamma for the rbf/poly/sigmoid kernels
# and degree for the poly kernel, so some grid points differ only in values
# that are ignored — consider pruning the grid.
search_space = {
    'dataset': ['bow', 'tfidf'],
    # 74552 equals the length of the feature-importance vector printed later
    # in this notebook, i.e. "keep every feature".
    'num_features': [100, 500, 1000, 5000, 10000, 20000, 50000, 74552],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    'degree': [2, 3, 4, 5],
}

# Wrap each list in the {'values': [...]} form used by the sweep config.
parameters_dict = {
    name: {'values': choices} for name, choices in search_space.items()
}

sweep_config = {
    'method': 'grid',
    'metric': metric,
    'parameters': parameters_dict,
}

In [6]:
# Pretty-print the assembled sweep configuration for a visual check.
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'maximize', 'name': 'val_accuracy'},
 'parameters': {'C': {'values': [0.01, 0.1, 1, 10, 100]},
                'dataset': {'values': ['bow', 'tfidf']},
                'degree': {'values': [2, 3, 4, 5]},
                'gamma': {'values': ['scale', 'auto', 0.01, 0.1, 1]},
                'kernel': {'values': ['linear', 'poly', 'rbf', 'sigmoid']},
                'num_features': {'values': [100,
                                            500,
                                            1000,
                                            5000,
                                            10000,
                                            20000,
                                            50000,
                                            74552]}}}


In [7]:
# Register the sweep in the W&B project. The returned id is later passed to
# wandb.agent and used to look the sweep up through the API.
sweep_id = wandb.sweep(sweep_config, project='epfl_ml_project2')

Create sweep with ID: jcr6zftu
Sweep URL: https://wandb.ai/cr7_reunited-cr7/epfl_ml_project2/sweeps/jcr6zftu


In [8]:
# One vectorizer per text representation, both with default settings:
# raw token counts (bow) and TF-IDF weights (tfidf).
bow, tfidf = CountVectorizer(), TfidfVectorizer()

In [9]:
# Pull the 'sentence1' column out of each split as a plain Python list.
train_sentences, val_sentences, test_sentences = (
    frame['sentence1'].tolist() for frame in (train_df, val_df, test_df)
)

In [10]:
# Target arrays: the 'label' column of each split.
y_train, y_val, y_test = (
    frame['label'].values for frame in (train_df, val_df, test_df)
)

In [11]:
# Fit the count vectorizer on the training sentences only, then reuse it to
# transform the validation and test sentences.
X_train_bow = bow.fit_transform(train_sentences)
X_val_bow, X_test_bow = map(bow.transform, (val_sentences, test_sentences))

In [12]:
# Fit the TF-IDF vectorizer on the training sentences only, then reuse it to
# transform the validation and test sentences.
X_train_tfidf = tfidf.fit_transform(train_sentences)
X_val_tfidf, X_test_tfidf = map(tfidf.transform, (val_sentences, test_sentences))

In [13]:
def return_dataset(dataset):
    """Return the pre-computed feature matrices and labels for one representation.

    Args:
        dataset: Name of the text representation, either 'bow' or 'tfidf'.

    Returns:
        A 6-tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)`` built
        from the module-level matrices for the requested representation and
        the shared label arrays.

    Raises:
        ValueError: If ``dataset`` is neither 'bow' nor 'tfidf'. Previously
            any unrecognised name silently fell through to the TF-IDF data,
            so a typo would train on the wrong features.
    """
    # Validate before touching any data so a bad name fails loudly.
    if dataset not in ('bow', 'tfidf'):
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'bow' or 'tfidf'."
        )
    if dataset == 'bow':
        return X_train_bow, X_val_bow, X_test_bow, y_train, y_val, y_test
    return X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test

In [14]:
# Look up a sweep through the public API by a hard-coded id. This id differs
# from the sweep created earlier in this notebook; the printed config holds
# random-forest hyperparameters, which the next cell feeds to
# RandomForestClassifier.
api = wandb.Api()
sweep = api.sweep('cr7_reunited-cr7/epfl_ml_project2/sweeps/al89gxt6')

# Pick the run ranked best by 'val_accuracy' and keep its hyperparameters.
best_run = sweep.best_run(order='val_accuracy')
best_params = best_run.config
print(best_params)

[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_accuracy


{'n_estimators': 500, 'min_samples_leaf': 1, 'dataset': 'bow', 'criterion': 'log_loss', 'max_depth': None}


In [15]:
# Refit a random forest with the best hyperparameters of the earlier sweep;
# its feature importances are used afterwards to rank the features.
X_train, _, X_test, y_train, _, y_test = return_dataset(best_params['dataset'])

# Only these four entries of the run config are forwarded to the classifier.
rf_kwargs = {
    key: best_params[key]
    for key in ('n_estimators', 'criterion', 'max_depth', 'min_samples_leaf')
}
clf = RandomForestClassifier(**rf_kwargs, n_jobs=-1, random_state=599131614)
clf.fit(X_train, y_train)

In [16]:
# Per-feature importance scores of the fitted forest.
importances = clf.feature_importances_
print(importances.shape)

# Feature indices ordered from highest to lowest importance
# (ascending argsort, reversed).
indices = importances.argsort()[::-1]

(74552,)


In [17]:
def get_features(X_train, X_val, X_test, y_train, y_val, y_test, num_features, indices):
    """Restrict every feature matrix to the leading ``num_features`` ranked columns.

    Args:
        X_train, X_val, X_test: Feature matrices; each must support
            ``matrix[:, index_array]`` column selection.
        y_train, y_val, y_test: Labels, passed through unchanged.
        num_features: How many entries to take from the front of ``indices``.
        indices: Column indices in ranking order.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the three
        matrices keep only the selected columns, in ranking order.
    """
    selected = indices[:num_features]
    X_train_sel, X_val_sel, X_test_sel = (
        matrix[:, selected] for matrix in (X_train, X_val, X_test)
    )
    return X_train_sel, X_val_sel, X_test_sel, y_train, y_val, y_test

In [18]:
def train(config=None):
    """Train and score one SVC for a single sweep configuration.

    Opens a W&B run, reads the hyperparameters from ``wandb.config``, selects
    the requested representation and number of top-ranked features, fits an
    SVC on the training split and logs 'train_accuracy' and 'val_accuracy'.
    This is the function handed to ``wandb.agent``.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        config = wandb.config

        splits = return_dataset(config.dataset)
        X_train, X_val, _, y_train, y_val, _ = get_features(
            *splits, config.num_features, indices
        )

        # gamma is only taken from the sweep for these three kernels and
        # degree only for 'poly'; otherwise fall back to 'scale' / 3.
        if config.kernel in ['rbf', 'poly', 'sigmoid']:
            gamma = config.gamma
        else:
            gamma = 'scale'
        if config.kernel == 'poly':
            degree = config.degree
        else:
            degree = 3

        clf = SVC(kernel=config.kernel, C=config.C, gamma=gamma,
                  degree=degree, cache_size=8000)
        clf.fit(X_train, y_train)

        train_accuracy = accuracy_score(y_train, clf.predict(X_train))
        val_accuracy = accuracy_score(y_val, clf.predict(X_val))
        wandb.log({'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy})

In [None]:
# Start a sweep agent that calls train() for runs of the sweep created above.
# count=3 caps this invocation at three runs, so only a small part of the
# grid is explored each time the cell is executed.
wandb.agent(sweep_id, train, count=3)

[34m[1mwandb[0m: Agent Starting Run: f4if8mtz with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	dataset: bow
[34m[1mwandb[0m: 	degree: 2
[34m[1mwandb[0m: 	gamma: scale
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	num_features: 100


In [None]:
# Query the sweep created in this notebook (via sweep_id). This rebinds api,
# sweep, best_run and best_params, replacing the values obtained earlier from
# the hard-coded sweep id.
api = wandb.Api()
sweep = api.sweep(f'cr7_reunited-cr7/epfl_ml_project2/sweeps/{sweep_id}')

# Pick the run ranked best by 'val_accuracy' and keep its hyperparameters.
best_run = sweep.best_run(order='val_accuracy')
best_params = best_run.config
print(best_params)

In [None]:
# Refit an SVC with the best sweep configuration and report accuracy on the
# training split and on the held-out test split.
best_splits = return_dataset(best_params['dataset'])
X_train, _, X_test, y_train, _, y_test = get_features(
    *best_splits, best_params['num_features'], indices
)

# Same fallbacks as in train(): gamma is used only for rbf/poly/sigmoid and
# degree only for poly.
best_kernel = best_params['kernel']
if best_kernel in ['rbf', 'poly', 'sigmoid']:
    best_gamma = best_params['gamma']
else:
    best_gamma = 'scale'
if best_kernel == 'poly':
    best_degree = best_params['degree']
else:
    best_degree = 3

clf = SVC(kernel=best_kernel, C=best_params['C'], gamma=best_gamma,
          degree=best_degree, cache_size=8000)
clf.fit(X_train, y_train)

# First printed value: train accuracy; second: test accuracy.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(accuracy_score(targets, clf.predict(features)))