### Loading data

In [None]:
import pandas as pd
import numpy as np

# Read the dataset; the first CSV column becomes the DataFrame index.
full_data = pd.read_csv('input/full_data_final.csv', index_col=0)

# Model inputs are taken from the 'content' column, targets from 'sarcastic'.
X = full_data['content'].values
y = full_data['sarcastic'].values

print(full_data.head())

### Creating pipeline and parameters

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier

# Text vectoriser; its settings are left at defaults here and sampled by the
# search through the `tfidf__*` entries of `params`.
tfidf = TfidfVectorizer()

# Logistic regression wrapped in a one-vs-rest meta-estimator. Because of the
# wrapper, the inner model's settings are addressed as `clf__estimator__*`.
logit_ensemble = OneVsRestClassifier(
    LogisticRegression(verbose=1)
)

# Two-step pipeline: vectorise the text, then classify.
pipeline = Pipeline(steps=[('tfidf', tfidf), ('clf', logit_ensemble)])

# Search space for the randomized search (10 * 4 * 6 * 2 * 1 * 5 * 3 = 7200
# possible combinations; only a sample of them is evaluated).
params = {
    # Vectoriser settings.
    'tfidf__max_df': np.linspace(0.1, 1, 10),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
    'tfidf__max_features': [20000, 50000, 60000, 100000, 250000, 500000],
    'tfidf__analyzer': ('word', 'char'),
    # Settings of the LogisticRegression inside the one-vs-rest wrapper.
    'clf__estimator__penalty': ['l2'],
    'clf__estimator__C': [0.10, 0.5, 1, 5, 10],
    'clf__estimator__max_iter': [20, 50, 100],
}

### Creating the scikit-learn RandomizedSearchCV object

In [None]:
# Randomized hyper-parameter search over `pipeline` using the `params` space.
#
# The former `iid=False` argument is intentionally not passed: `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
# raises a TypeError. From 0.22 onwards the default behaviour already matches
# `iid=False`.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=params,
    n_iter=10,        # number of parameter settings sampled from `params`
    cv=3,             # 3-fold cross-validation per candidate
    verbose=10,
    n_jobs=-1,        # run on all available CPU cores
    random_state=42,  # fixed seed so the sampled candidates are reproducible
)

### Fitting the search to the data

In [None]:
from time import time

# Record the wall-clock start time immediately before launching the search.
start = time()

# Run the randomized search on the full X / y arrays.
random_search.fit(X, y)

### Reporting the results

In [None]:
def report(results, n_top=5):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params'. 'rank_test_score' has to support
        element-wise comparison with an int (e.g. a NumPy array).
    n_top : int, default 5
        Ranks 1..n_top are reported. Every candidate sharing a rank is
        printed, so tied ranks produce more than one entry.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates holding exactly this rank.
        matching = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matching:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

# Use the number of parameter settings that were actually evaluated instead of
# a hard-coded figure, so the message stays correct whatever `n_iter` is.
n_candidates = len(random_search.cv_results_['params'])

# NOTE: the elapsed time runs from `start` (set just before the fit call) to
# the moment this cell executes, so any delay between the two cells is included.
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_candidates))

# Print the top-ranked candidates from the search results.
report(random_search.cv_results_)