In [33]:
from nltk.corpus import stopwords as nltk_stop_words
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import pandas as pd
import datetime
import numpy as np
from time import time
from sklearn.grid_search import GridSearchCV

In [54]:
def read_data():
    X_train = []
    X_test = []
    id_test = []
    y_train = []
    with open('products_sentiment_train.tsv') as f:
        for line in f:
            parts = line.rsplit('\t', 1)

            X_train.append(parts[0].strip())
            y_train.append(parts[1].strip())
    
    with open('products_sentiment_test.tsv') as f:
        f.readline()
        for line in f:
            parts = line.split('\t', 1)
            id_test.append(parts[0].strip())
            X_test.append(parts[1].strip())                    

    return X_train, y_train, id_test, X_test

In [11]:
def predict(predictor, data_train, y, id_test, data_test, cv_score=None):
    predictor.fit(data_train, y)
    prediction = predictor.predict(data_test)
    #print predictor
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    filepath_prediction = 'data/prediction-%s-data.csv' % timestamp
    filepath_description = 'data/prediction-%s-estimator.txt' % timestamp

    # Create a dataframe with predictions and write it to CSV file   
    predictions_df = pd.DataFrame(data=prediction, columns=['y'])
    predictions_df.to_csv(filepath_prediction, sep=',', index_label='Id')

    # Write a short description of the classifier that was used
    f = open(filepath_description, 'w')
    f.write(str(predictor))
    score = '\nCross-validation score %.8f' % cv_score    
    f.write(score)
    f.close()

In [4]:
def build_vocabulary(X_train, X_test):
    vocab = set()
    cv = CountVectorizer()
    tokenizer = cv.build_tokenizer()
    for line in X_train:
        vocab.update(tokenizer(line))
    for line in X_test:
        vocab.update(tokenizer(line))
    return vocab

In [20]:
def get_pipeline_and_params_1(vocab):
    pipeline = Pipeline([
        ('vect', CountVectorizer(vocabulary=vocab)),
        ('logreg', LogisticRegression()),
    ])
    parameters = {
        'vect__max_df': (0.6, 0.8, 1.0),
        'vect__min_df': (0, 1, 2, 5),
        'vect__stop_words': ('english', None),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams        
        'logreg__C': (0.0001, 0.01, 1),
        'logreg__penalty': ('l2', 'l1'),        
    }
    return pipeline, parameters

In [26]:
def get_pipeline_and_params_2(vocab):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(vocabulary=vocab)),
        ('logreg', LogisticRegression()),
    ])
    parameters = {
        'tfidf__max_df': (0.6, 0.8, 1.0),
        'tfidf__min_df': (0, 1, 2, 5),
        'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),        
    }
    return pipeline, parameters

In [63]:
def get_pipeline_and_params_3(vocab):
    # gives 0.7895 on cross-validation and 0.835 on Kaggle. done in 108.629s
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        #'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 3), (1, 2)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        'clf__n_iter': (10, 50, 80),
    }
    return pipeline, parameters

In [96]:
def get_pipeline_and_params_4(vocab):
    nltk_sw = nltk_stop_words.words('english')
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])
    parameters = {
        'vect__max_df': (0.4, 0.5, 0.6),
        #'vect__vocabulary': (None),
        #'vect__stop_words': ('english', nltk_sw, None),
        #'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1,5), (1, 4), (2, 3), (2,4), (3, 5)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', ),
        'clf__alpha': (0.000001,),
        'clf__penalty': ('elasticnet',),
        'clf__n_iter': (10,),
    }
    return pipeline, parameters

In [78]:
def do_grid_search(pipeline, parameters, X_train, y_train):
    grid_search = GridSearchCV(pipeline, parameters)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print "done in %0.3fs" % (time() - t0)
    
    print("Best score: %.4f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return grid_search

In [39]:
def do_experiment(X_train, y_train, id_test, X_test, vocab, get_pipeline_and_params):      
    pipeline, parameters = get_pipeline_and_params(vocab)
    gs = do_grid_search(pipeline, parameters, X_train, y_train)
    predict(gs.best_estimator_, X_train, y_train, id_test, X_test, gs.best_score_)

In [55]:
X_train, y_train, id_test, X_test = read_data() 
vocab = build_vocabulary(X_train, X_test)

In [29]:
do_experiment(X_train, y_train, id_test, X_test, vocab, get_pipeline_and_params_1)

done in 87.184s
Best score: 0.7740
Best parameters set:
	logreg__C: 1
	logreg__penalty: 'l2'
	vect__max_df: 0.6
	vect__min_df: 0
	vect__ngram_range: (1, 1)
	vect__stop_words: None


In [30]:
do_experiment(X_train, y_train, id_test, X_test, vocab, get_pipeline_and_params_2)

done in 16.710s
Best score: 0.7560
Best parameters set:
	tfidf__max_df: 0.6
	tfidf__min_df: 0
	tfidf__ngram_range: (1, 1)
	tfidf__norm: 'l2'


In [64]:
do_experiment(X_train, y_train, id_test, X_test, vocab, get_pipeline_and_params_3)

done in 108.629s
Best score: 0.7895
Best parameters set:
	clf__alpha: 1e-06
	clf__n_iter: 10
	clf__penalty: 'l2'
	tfidf__norm: 'l1'
	vect__max_df: 0.5
	vect__ngram_range: (1, 3)


In [97]:
do_experiment(X_train, y_train, id_test, X_test, vocab, get_pipeline_and_params_4)

done in 18.123s
Best score: 0.7910
Best parameters set:
	clf__alpha: 1e-06
	clf__n_iter: 10
	clf__penalty: 'elasticnet'
	tfidf__norm: 'l1'
	vect__max_df: 0.6
	vect__ngram_range: (1, 4)
