In [26]:
import sys
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from utils.system import parse_params, check_version
from time import time

In [2]:
def generate_features(stances,dataset,name):
    h, b, y = [],[],[]

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap."+name+".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity."+name+".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand."+name+".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y

In [21]:
#Load the training dataset and generate folds
d = DataSet(path='../data/train/')
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

# Load the competition dataset
competition_dataset = DataSet("competition_test", path='../data/competition_test')
X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [22]:
Xs = dict()
ys = dict()
# Load/Precompute all features now
X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
for fold in fold_stances:
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))

In [34]:
best_score = 0
best_fold = None
# Classifier for each fold
start = time()
for fold in fold_stances:
    start_fold = time()
    ids = list(range(len(folds)))
    del ids[fold]
    X_train = np.vstack(tuple([Xs[i] for i in ids]))
    y_train = np.hstack(tuple([ys[i] for i in ids]))
    X_test = Xs[fold]
    y_test = ys[fold]
    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(X_train, y_train)
    predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)
    score = fold_score/max_fold_score
    print("Score for fold "+ str(fold) + " was - " + str(score))
    print("Fold "+str(fold)+" took "+str((time()-start_fold)/60.0)+" minutes")
    if score > best_score:
        best_score = score
        best_fold = clf
print("Total time taken: %s minutes"%((str(time()-start)/60.0)))

      Iter       Train Loss   Remaining Time 
         1       34847.8087            1.14m
         2       31182.6372            1.20m
         3       28224.3550            1.18m
         4       25806.7231            1.18m
         5       23799.5416            1.13m
         6       22142.4386            1.11m
         7       20746.2849            1.09m
         8       19564.1395            1.08m
         9       18573.3067            1.07m
        10       17737.9969            1.05m
        20       13733.1728           54.54s
        30       12588.4881           49.47s
        40       12126.0670           45.90s
        50       11859.0634           42.18s
        60       11665.4249           38.86s
        70       11525.6086           35.78s
        80       11412.8190           32.78s
        90       11305.9790           29.88s
       100       11208.2923           27.02s
       200       10530.1457            0.00s
Score for fold 0 was - 0.790634959549
Fold 0 took 0.87

         2       31310.1868            1.40m
         3       28367.4965            1.46m
         4       25956.3288            1.48m
         5       23970.1504            1.45m
         6       22306.7986            1.40m
         7       20924.9631            1.37m
         8       19743.5413            1.37m
         9       18752.5597            1.36m
        10       17920.2238            1.34m
        20       13930.3904            1.20m
        30       12760.8270            1.09m
        40       12296.0700            1.02m
        50       12006.1182           53.93s
        60       11809.4022           48.37s
        70       11668.9714           44.06s
        80       11533.7314           40.83s
        90       11417.6479           37.39s
       100       11320.2690           33.25s
       200       10645.9241            0.00s
Score for fold 8 was - 0.820952380952
Fold 8 took 1.03359421889 minutes
      Iter       Train Loss   Remaining Time 
         1       35487.2884

TypeError: unsupported operand type(s) for /: 'str' and 'float'

In [32]:
#Run on Holdout set and report the final score on the holdout set
predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]
print("Scores on the dev set")
report_score(actual,predicted)
print("")
print("")

Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    118    |     3     |    556    |    85     |
-------------------------------------------------------------
| disagree  |    14     |     3     |    130    |    15     |
-------------------------------------------------------------
|  discuss  |    58     |     5     |   1527    |    210    |
-------------------------------------------------------------
| unrelated |     5     |     1     |    98     |   6794    |
-------------------------------------------------------------
Score: 3538.0 out of 4448.5	(79.5324266607%)




In [33]:
#Run on competition dataset
predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
actual = [LABELS[int(a)] for a in y_competition]
print("Scores on the test set")
report_score(actual,predicted)

Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    173    |    10     |   1435    |    285    |
-------------------------------------------------------------
| disagree  |    39     |     7     |    413    |    238    |
-------------------------------------------------------------
|  discuss  |    221    |     7     |   3556    |    680    |
-------------------------------------------------------------
| unrelated |    10     |     3     |    358    |   17978   |
-------------------------------------------------------------
Score: 8761.75 out of 11651.25	(75.2000858277%)


75.20008582770089