In [1]:
%matplotlib inline

from IPython.display import clear_output

import numpy as np
import pandas as pd
import sklearn
import nltk
import jsonlines
import datetime
import itertools
import collections
import gc
import os
import json
import operator

In [2]:
def default_finalizer(tokens):
    for i, w in enumerate(tokens):
        if w == "n't" or w == 'not':
            for j in range(i + 1, min(i + 3, len(tokens))):
                tokens[j] = 'not_' + tokens[j]
    return [w for w in tokens if str.isalnum(w) or w.startswith('not_')]

def get_list_of_tokens(text, listfinalizer=None):
    tokens = nltk.word_tokenize(text)
    if listfinalizer:
        return listfinalizer(tokens)
    return default_finalizer(tokens)

def generate_ngrams(tokens, n_number=2):
    return ['_'.join(tokens[start_pos:start_pos + gram_len])\
            for gram_len in range(1, n_number + 1)\
            for start_pos in range(0, len(tokens) - gram_len + 1)]

toks = get_list_of_tokens("You aren't a sentence.", default_finalizer)
generate_ngrams(toks)

['You',
 'are',
 'not_a',
 'not_sentence',
 'You_are',
 'are_not_a',
 'not_a_not_sentence']

In [3]:
course_file = 'course.csv'
change_by_day = dict()
with open(course_file) as fl:
    next(fl)
    for line in fl:
        date_str, day_course_str, change_str = line.strip().split(',')
        change_by_day[datetime.datetime.strptime(date_str, '%Y-%m-%d').date()] = float(change_str)

In [32]:
step_time = datetime.time(8, 30)
min_date, max_date = min(change_by_day.keys()), max(change_by_day.keys())

def find_known_date(date, days_step):
    while min_date <= date <= max_date and date not in change_by_day:
        date += datetime.timedelta(days=days_step)
    return date

def find_course_date(date_with_time):
    if date_with_time.time() < step_time:
        start_day = date_with_time.date() + datetime.timedelta(days=1)
    else:
        start_day = date_with_time.date() + datetime.timedelta(days=2)
    return find_known_date(start_day, +1)

def add_course_change_info(frame):
    frame['course_date'] = frame['date'].apply(find_course_date)
    frame['course_change'] = frame['course_date'].apply(lambda d: change_by_day[d])
    return frame

def process_final_chunked(generator, chunksize=10000, print_progres=True, auto_collect=True):
    if print_progres:
        total_lines = sum(1 for _ in open('reuters.jl'))
    news_data = pd.read_json('reuters.jl', lines=True, chunksize=chunksize)
    total_done = 0
    for chunk in news_data:
        if auto_collect:
            gc.collect()
        final_chunk = add_course_change_info(chunk)
        yield generator(final_chunk)
        if print_progres:
            total_done = min(total_done + chunksize, total_lines)
            print("Current progress: {:.2%}, {}/{}".format(total_done / total_lines, total_done, total_lines))
            clear_output(wait=True)

class ResultFlusher:
    def __init__(self, files_names, start_index=0, save_dir='.', sort_keys=True):
        self.start_index = start_index
        self.names = files_names
        self.save_dir = save_dir
        self.sort_keys = sort_keys
    
    def save(self, *data):
        for name, data_part in zip(self.names, data):
            keys = sorted(data_part.keys())
            with open(os.path.join(self.save_dir, '{}.{}.csv'.format(name, self.start_index)), 'w') as output:
                output.write('\n'.join('{},{}'.format(key, data_part[key]) for key in keys))
        self.start_index += 1


In [14]:
def make_token_class_counter(n_grams):
    def counter(chunk):
        in_rise, in_fall = collections.Counter(), collections.Counter()
        sub_chunk = chunk[['text', 'course_change']]
        for _, row in sub_chunk.iterrows():
            if row['course_change'] > 0:
                target_counter = in_rise
            else:
                target_counter = in_fall
            tokens = get_list_of_tokens(row['text'], default_finalizer)
            target_counter.update(generate_ngrams(tokens, n_grams))
        return (in_rise, in_fall)
    
    return counter

flusher = ResultFlusher(['in_rise', 'in_fall'])
for in_rise, in_fall in process_final_chunked(make_token_class_counter(3), chunksize=100000):
    flusher.save(in_rise, in_fall)

Current progress: 100.00%, 3458938/3458938


In [1]:
def get_filtered_words(file_path, min_count=1000):
    result = dict()
    with open(file_path) as inp:
        for line in inp:
            word, count = tuple(line.rsplit(',', 1))
            count = int(count)
            if count >= min_count:
                result[word] = count
    return result

rise_words = get_filtered_words('final_in_rise.csv')
fall_words = get_filtered_words('final_in_fall.csv')

In [5]:
all_words = set(x for x in rise_words) & set(x for x in fall_words)
len(all_words), len(rise_words), len(fall_words)

(98848, 99750, 146833)

In [18]:
word_total_count = {word:(raise_words[word] + fall_words[word]) for word in all_words}

In [22]:
pmi_per_class = dict()
pmi_per_class['raise'] = {word:np.log(raise_words[word] / word_total_count[word]) for word in all_words}
pmi_per_class['fall'] = {word:np.log(fall_words[word] / word_total_count[word]) for word in all_words}

In [26]:
top_raise = sorted([(pmi, word) for word, pmi in pmi_per_class['raise'].items()], reverse=True)
top_fall = sorted([(pmi, word) for word, pmi in pmi_per_class['fall'].items()], reverse=True)

In [46]:
get_word = operator.itemgetter(1)
top_words = {'raise': list(map(get_word, top_raise[:5000])),
             'fall': list(map(get_word, top_fall[:5000]))}
with open('top_words.json', 'w') as fl:
    json.dump(top_words, fl, indent=2)

In [7]:
with open('top_words.json') as fl:
    top_words = json.load(fl)

In [26]:
def make_word_feature_analizer(feature_words, n_grams):
    def analize_chunk(chunk):
        total_lines, uncovered_lines = 0, 0
        covered_dates = set()
        for _, row in chunk.iterrows():
            total_lines += 1
            n_gram_tokens = set(generate_ngrams(get_list_of_tokens(row['text'], default_finalizer), n_grams))
            if feature_words & n_gram_tokens:
                covered_dates.add(row['course_date'])
            else:
                uncovered_lines += 1
        return (total_lines, uncovered_lines, covered_dates)
    
    return analize_chunk

all_features = set(top_words['raise']) | set(top_words['fall'])
total_lines, uncovered_lines, covered_dates = 0, 0, set()
for chunk_lines, chunk_uncovered_lines, chunk_covered_dates in\
        process_final_chunked(make_word_feature_analizer(all_features, 3)):
    total_lines += chunk_lines
    uncovered_lines += chunk_uncovered_lines
    covered_dates |= chunk_covered_dates

Current progress: 100.00%, 3458938/3458938


In [27]:
uncovered_lines / total_lines

0.017003484884666912

In [28]:
len(covered_dates) / len(change_by_day)

0.7390821613619541

In [33]:
def make_additive_feature_processor(feature_words, n_grams, skip_empty=True):
    def get_features(chunk):
        date_features = collections.defaultdict(collections.Counter)
        for _, row in chunk.iterrows():
            n_grams_tokens = generate_ngrams(get_list_of_tokens(row['text'], default_finalizer), n_grams)
            date_features[row['course_date']] += collections.Counter(nt for nt in n_grams_tokens if nt in feature_words)
        return date_features
    
    return get_features

date_data = collections.defaultdict(collections.Counter)
for chunk_data in process_final_chunked(make_additive_feature_processor(all_features, 3)):
    for date, features in chunk_data.items():
        date_data[date] += features


Current progress: 100.00%, 3458938/3458938


In [35]:
features_list = [{'date': date, **features} for date, features in date_data.items()]

In [56]:
day_features = pd.DataFrame(features_list).fillna(0)
day_features = day_features[['date'] + sorted(set(day_features.columns) - {'date'})]
day_features

Unnamed: 0,date,0,01,01_Aug,01_Nov,01_Nov_AMC,02_Aug,02_Feb,02_Nov,03_Aug,...,zone_bonds,zone_crisis,zone_debt,zone_debt_crisis,zone_governments,zone_s_debt,zone_s_economy,zone_sovereign,zone_sovereign_debt,és
0,2014-12-09,11.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,2.0,3.0,0.0,4.0,1.0,0.0,0.0
1,2014-12-12,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,1.0,0.0,1.0,0.0,5.0,2.0,0.0,0.0
2,2014-12-11,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
3,2014-12-05,17.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2014-12-03,1.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,2014-12-10,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0
6,2014-12-06,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,2014-12-04,9.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8,2016-11-08,90.0,23.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2016-11-04,10.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
day_features = pd.read_csv('day_features.csv', parse_dates=['date'])
day_features['date'] = day_features['date'].dt.date

In [72]:
y_by_day = {k:np.sign(v) for k, v in change_by_day.items()}
X_all = day_features.drop(columns=['date']).as_matrix()
y_sign = day_features['date'].apply(y_by_day.get).as_matrix()

In [35]:
def test_classifier(cls, X, y, stratify=True, repeats=10):
    if stratify:
        kf = sklearn.model_selection.RepeatedStratifiedKFold(n_splits=3, n_repeats=repeats, random_state=42)
        splits = kf.split(X, y)
    else:
        kf = sklearn.model_selection.RepeatedKFold(n_splits=3, n_repeats=repeats, random_state=42)
        splits = kf.split(X)
    accuracy = []
    for train_index, test_index in splits:
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        cls.fit(X_train, y_train)
        accuracy.append(cls.score(X_test, y_test))
    
    return accuracy

In [82]:
np.mean(test_classifier(sklearn.linear_model.LogisticRegression(), X_all, y_sign, stratify=False))

0.55408197671355575

In [93]:
np.mean(test_classifier(sklearn.linear_model.LogisticRegression()), X_all, y_sign)

0.55693392640761075

In [94]:
y_all = day_features['date'].apply(change_by_day.get).as_matrix()

In [104]:
regression_R2 = test_classifier(sklearn.linear_model.LinearRegression(), X_all, y_all, stratify=False, repeats=100)
np.mean(regression_R2), np.std(regression_R2)

(-26.076787200237391, 39.166050936456131)

In [116]:
w = np.linalg.lstsq(X_all, y_all)[0]
np.sum((y_all - X_all.dot(w)) ** 2)

6.9032242208764156e-24

In [80]:
liner_SVM_results = test_classifier(sklearn.svm.LinearSVC(), X_all, y_sign)

In [81]:
np.mean(liner_SVM_results)

0.54771719087508552

In [86]:
svm = sklearn.svm.SVC()
svm_parameters = {
    'gamma': [10**p for p in range(-4, 5)],
    'C': [10**p for p in range(-4, 5)]
}
grid_search_cls = sklearn.model_selection.GridSearchCV(svm, svm_parameters, n_jobs=8, verbose=2)
grid_search_cls.fit(X_all, y_sign)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] C=0.0001, gamma=0.01 ............................................
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ........................... C=0.0001, gamma=0.0001, total= 1.0min
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ........................... C=0.0001, gamma=0.0001, total= 1.0min
[CV] C=0.0001, gamma=0.1 .............................................
[CV] ............................ C=0.0001, gamma=0.001, total= 1.0min
[CV] C=0.0001, 

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  6.9min


[CV] ............................ C=0.0001, gamma=10000, total= 1.1min
[CV] C=0.001, gamma=0.01 .............................................
[CV] ............................ C=0.0001, gamma=10000, total= 1.1min
[CV] C=0.001, gamma=0.01 .............................................
[CV] ............................ C=0.001, gamma=0.0001, total= 1.1min
[CV] C=0.001, gamma=0.01 .............................................
[CV] ............................ C=0.001, gamma=0.0001, total= 1.1min
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ............................ C=0.001, gamma=0.0001, total= 1.1min
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ............................. C=0.001, gamma=0.001, total= 1.1min
[CV] C=0.001, gamma=0.1 ..............................................
[CV] ............................. C=0.001, gamma=0.001, total= 1.1min
[CV] C=0.001, gamma=1 ................................................
[CV] .

[CV] .............................. C=0.1, gamma=0.0001, total= 1.1min
[CV] C=0.1, gamma=0.1 ................................................
[CV] ............................... C=0.1, gamma=0.001, total= 1.1min
[CV] C=0.1, gamma=0.1 ................................................
[CV] ............................... C=0.1, gamma=0.001, total= 1.1min
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................ C=0.1, gamma=0.01, total= 1.1min
[CV] C=0.1, gamma=1 ..................................................
[CV] ............................... C=0.1, gamma=0.001, total= 1.1min
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................ C=0.1, gamma=0.01, total= 1.1min
[CV] C=0.1, gamma=10 .................................................
[CV] ................................ C=0.1, gamma=0.01, total= 1.1min
[CV] C=0.1, gamma=10 .................................................
[CV] .

[CV] ................................. C=10, gamma=0.01, total= 1.0min
[CV] C=10, gamma=1 ...................................................
[CV] ................................. C=10, gamma=0.01, total= 1.0min
[CV] ................................. C=10, gamma=0.01, total= 1.1min
[CV] C=10, gamma=10 ..................................................
[CV] C=10, gamma=10 ..................................................
[CV] .................................. C=10, gamma=0.1, total= 1.1min
[CV] C=10, gamma=10 ..................................................
[CV] .................................. C=10, gamma=0.1, total= 1.0min
[CV] C=10, gamma=100 .................................................


[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed: 33.4min


[CV] .................................. C=10, gamma=0.1, total= 1.0min
[CV] C=10, gamma=100 .................................................
[CV] .................................... C=10, gamma=1, total= 1.0min
[CV] C=10, gamma=100 .................................................
[CV] .................................... C=10, gamma=1, total= 1.0min
[CV] C=10, gamma=1000 ................................................
[CV] .................................... C=10, gamma=1, total= 1.0min
[CV] C=10, gamma=1000 ................................................
[CV] ................................... C=10, gamma=10, total= 1.0min
[CV] C=10, gamma=1000 ................................................
[CV] ................................... C=10, gamma=10, total= 1.0min
[CV] C=10, gamma=10000 ...............................................
[CV] ................................... C=10, gamma=10, total= 1.1min
[CV] C=10, gamma=10000 ...............................................
[CV] .

[CV] ................................. C=1000, gamma=10, total= 1.0min
[CV] C=1000, gamma=1000 ..............................................
[CV] ................................. C=1000, gamma=10, total= 1.0min
[CV] C=1000, gamma=10000 .............................................
[CV] ................................. C=1000, gamma=10, total= 1.0min
[CV] C=1000, gamma=10000 .............................................
[CV] ................................ C=1000, gamma=100, total= 1.0min
[CV] C=1000, gamma=10000 .............................................
[CV] ................................ C=1000, gamma=100, total= 1.0min
[CV] C=10000, gamma=0.0001 ...........................................
[CV] ................................ C=1000, gamma=100, total= 1.0min
[CV] C=10000, gamma=0.0001 ...........................................
[CV] ............................... C=1000, gamma=1000, total= 1.0min
[CV] C=10000, gamma=0.0001 ...........................................
[CV] .

[Parallel(n_jobs=8)]: Done 243 out of 243 | elapsed: 53.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [95]:
pd.DataFrame(grid_search_cls.cv_results_)['mean_test_score'].max()



0.50726089133700547

In [115]:
linear_svm = sklearn.svm.LinearSVC(dual=False)
lsvm_parameters = {
    'C': [10**p for p in range(-3, 4)],
    'loss': ['hinge', 'squared_hinge'],
    'penalty': ['l1', 'l2'],
}
grid_search_cls = sklearn.model_selection.GridSearchCV(linear_svm, lsvm_parameters, n_jobs=8, verbose=1, error_score=-1)
grid_search_cls.fit(X_all, y_sign)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2',

ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  3.4min
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l1'

ValueError("Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
ValueError("Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False",)
[Parallel(n_jobs=8)]: Done  84 out of  84 | elapsed:  9.8min finished


GridSearchCV(cv=None, error_score=0,
       estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge'], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [122]:
grid_result = pd.DataFrame(grid_search_cls.cv_results_)[['param_C', 'param_penalty', 'param_loss', 'mean_test_score']].sort_values('mean_test_score', ascending=[0])
grid_result = grid_result.loc[grid_result['mean_test_score'] > -0]
grid_result



Unnamed: 0,param_C,param_penalty,param_loss,mean_test_score
2,0.001,l1,squared_hinge,0.576365
6,0.01,l1,squared_hinge,0.550325
27,1000.0,l2,squared_hinge,0.542814
15,1.0,l2,squared_hinge,0.541312
3,0.001,l2,squared_hinge,0.540811
7,0.01,l2,squared_hinge,0.538307
10,0.1,l1,squared_hinge,0.537306
23,100.0,l2,squared_hinge,0.536304
11,0.1,l2,squared_hinge,0.536304
19,10.0,l2,squared_hinge,0.535804


In [123]:
linear_svm = sklearn.svm.SVC(kernel='linear')
lsvm_parameters = {
    'C': [10**p for p in range(-4, 5)],
}
grid_search_cls = sklearn.model_selection.GridSearchCV(linear_svm, lsvm_parameters, n_jobs=8, verbose=1)
grid_search_cls.fit(X_all, y_sign)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=8)]: Done  27 out of  27 | elapsed:  5.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [124]:
grid_result = pd.DataFrame(grid_search_cls.cv_results_)[['param_C', 'mean_test_score']].sort_values('mean_test_score', ascending=[0])
grid_result



Unnamed: 0,param_C,mean_test_score
0,0.0001,0.533801
1,0.001,0.52679
2,0.01,0.52679
3,0.1,0.52679
4,1.0,0.52679
5,10.0,0.52679
6,100.0,0.52679
7,1000.0,0.52679
8,10000.0,0.52679
