In [33]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import pandas as pd
import datetime
import numpy as np
from time import time
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from lxml import etree

In [3]:
def add_outer_tags(filename):
    with open(filename, 'r+') as f:        
        content = f.read()        
        f.seek(0, 0)
        firstline = f.readline()
        if '<data>' not in firstline:            
            f.seek(0, 0)
            f.write('<data>' + '\n' + content + '\n</data>')        

In [4]:
def make_target(row):
   if row['rating'] < 4:
      return 0
   else:
      return 1

In [5]:
def prepare_data_set(data):
    data['target'] = data.apply(lambda row: make_target(row),axis=1)
    data = data.dropna(subset=['target'])
    data['target'] = data['target'].astype(int)   
    return data

In [6]:
def iter_tree(etree):
    n = -1
    for review in etree.iter('review'):
        n += 1
        yield (n, review.text)

In [7]:
def get_dataframes(training_data_file, test_data_file):
    df_training = pd.read_json(training_data_file)
    df_training = prepare_data_set(df_training)
    
    add_outer_tags(test_data_file)
    tree = etree.parse(test_data_file)
    df_test = pd.DataFrame(list(iter_tree(tree)), columns=['id', 'text'])
    
    return df_training, df_test

In [8]:
def read_data(df_training, df_test):
    X_train = df_training['text'].values
    X_test = df_test['text'].values
    id_test = df_test['id'].values
    y_train = df_training['target'].values              
    print len(X_train)
    return X_train, y_train, id_test, X_test

In [9]:
def get_pipeline_and_params_1():    
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])
    parameters = {
        'vect__max_df': (0.75, 1),
        #'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 3), (1, 2)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        #'tfidf__norm': ('l1', 'l2'),
        #'clf__alpha': (0.00001, 0.000001),
        #'clf__penalty': ('l2', 'elasticnet'),
        #'clf__n_iter': (10, 50, 80),
    }
    return pipeline, parameters

In [51]:
def get_pipeline_and_params_2():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('logreg', LogisticRegression()),
    ])
    parameters = {
        'tfidf__max_df': (0.6,0.8,1),
        #'tfidf__min_df': (0, 5, 10, 15),
        'tfidf__ngram_range': ((1, 1), (1, 2), (1,3), (2,3)),  # unigrams or bigrams
        #'tfidf__use_idf': (True, False),
        #'tfidf__norm': ('l1', 'l2'),   
        #'logreg__C': (0.0001, 0.01, 1),
        #'logreg__penalty': ('l2', 'l1'),  
    }
    return pipeline, parameters

In [61]:
def get_pipeline_and_params_3():
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('logreg', LogisticRegression()),
    ])
    parameters = {
        'vect__max_df': (0.6, 1.0),
        'vect__min_df': (0, 5),
        #'vect__stop_words': ('english', None),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams        
        #'logreg__C': (0.0001, 0.01, 1),
        #'logreg__penalty': ('l2', 'l1'),        
    }
    return pipeline, parameters

In [34]:
def predict(predictor, data_train, y, id_test, data_test, cv_score=None):
    predictor.fit(data_train, y)
    joblib.dump(predictor, './SentimentAnalysisModel.pkl')    
    prediction = predictor.predict(data_test)
    #print predictor
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    filepath_prediction = 'data/prediction-%s-data.csv' % timestamp
    filepath_description = 'data/prediction-%s-estimator.txt' % timestamp
    
    #############
    df_true = pd.read_csv('etalon.csv')
    y_true = df_true['y'].values
    print 'ACCURACY: %.6f' % accuracy_score(y_true, prediction)
    #############

    prediction_str = ['pos' if p == 1 else 'neg' for p in prediction]
    # Create a dataframe with predictions and write it to CSV file   
    predictions_df = pd.DataFrame(data=prediction_str, columns=['y'])
    predictions_df.to_csv(filepath_prediction, sep=',', index_label='Id')

    # Write a short description of the classifier that was used
    f = open(filepath_description, 'w')
    f.write(str(predictor))
    score = '\nCross-validation score %.8f' % cv_score    
    f.write(score)
    f.close()

In [11]:
def do_grid_search(pipeline, parameters, X_train, y_train):
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy')
    t0 = time()
    grid_search.fit(X_train, y_train)
    print "done in %0.3fs" % (time() - t0)
    
    print("Best score: %.4f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return grid_search

In [12]:
def do_experiment(X_train, y_train, id_test, X_test, get_pipeline_and_params):      
    pipeline, parameters = get_pipeline_and_params()
    gs = do_grid_search(pipeline, parameters, X_train, y_train)
    predict(gs.best_estimator_, X_train, y_train, id_test, X_test, gs.best_score_)

In [56]:
df_training, df_test = get_dataframes('reviews11665.json', 'test.csv')

In [57]:
df_training.head()

Unnamed: 0,rating,text,target
0,1,"В сегменте бюджетников, за такую цену, недоста...",0
1,1,"Недостатков кроме маленькой памяти нет, разве ...",0
2,1,Нет гарнитуры в комплекте. Пожадничали корейцы...,0
3,1,Постоянно глючит. Нормально смогла попользоват...,0
4,1,Сказать что батарея слабая-не сказать ничего. ...,0


In [58]:
df_test.head()

Unnamed: 0,id,text
0,0,"Ужасно слабый аккумулятор, это основной минус ..."
1,1,ценанадежность-неубиваемостьдолго держит батар...
2,2,"подробнее в комментариях\nК сожалению, факт по..."
3,3,я любительница громкой музыки. Тише телефона у...
4,4,"Дата выпуска - 2011 г, емкость - 1430 mAh, тех..."


In [59]:
X_train, y_train, id_test, X_test = read_data(df_training, df_test) 

11665


In [60]:
do_experiment(X_train, y_train, id_test, X_test, get_pipeline_and_params_1)

done in 41.060s
Best score: 0.9459
Best parameters set:
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)
ACCURACY: 0.970000


In [62]:
do_experiment(X_train, y_train, id_test, X_test, get_pipeline_and_params_2)

done in 92.748s
Best score: 0.9314
Best parameters set:
	tfidf__max_df: 0.6
	tfidf__ngram_range: (1, 2)
ACCURACY: 0.930000


In [63]:
do_experiment(X_train, y_train, id_test, X_test, get_pipeline_and_params_3)

done in 38.136s
Best score: 0.9324
Best parameters set:
	vect__max_df: 0.6
	vect__min_df: 0
	vect__ngram_range: (1, 2)
ACCURACY: 0.950000
