In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score



# Data Preprocessing

In [2]:
train_df = pd.read_csv('train.tsv', sep='\t')
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
# SentenceId 정리

train_df_length = len(train_df)

index_vector = np.empty(train_df_length, dtype='bool')

sentenceid = 0
for i in range(train_df_length):
    if train_df['SentenceId'][i] != sentenceid:
        sentenceid = train_df['SentenceId'][i]
        index_vector[i] = True
    else:
        index_vector[i] = False

train_df_filtered = train_df[index_vector]
train_df_filtered[:3]
train_df_filtered.index = range(0, len(train_df_filtered))

In [4]:
print 'Case1 : SentenceId 정리했을 때 데이터 갯수 :' ,len(train_df_filtered.index)
print 'Case2 : SentenceId 정리하지 않았을 때 데이터 갯수 :' ,len(train_df.index)

Case1 : SentenceId 정리했을 때 데이터 갯수 : 8529
Case2 : SentenceId 정리하지 않았을 때 데이터 갯수 : 156060


* SentenceId 정리하고 나니까 데이터 갯수가 정말 많이 줄어들은 것을 확인할 수 있다.

In [5]:
#case1 SentenceId 정리 했을 때

train_df_filtered['Split_Phrase'] = [[j.lower() for j in x.split()] for x in np.array(train_df_filtered['Phrase'])]
train_df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,64,2,"This quiet , introspective and entertaining in...",4,"[this, quiet, ,, introspective, and, entertain..."
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[even, fans, of, ismail, merchant, 's, work, ,..."
3,117,4,A positively thrilling combination of ethnogra...,3,"[a, positively, thrilling, combination, of, et..."
4,157,5,Aggressive self-glorification and a manipulati...,1,"[aggressive, self-glorification, and, a, manip..."


In [6]:
#case2 SentenceId 정리 하지 않았을 때

train_df['Split_Phrase'] = [[j.lower() for j in x.split()] for x in np.array(train_df['Phrase'])]
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
2,3,1,A series,2,"[a, series]"
3,4,1,A,2,[a]
4,5,1,series,2,[series]


# Case 1

### 아무래도 감정을 나타내는 품사는 주로 형용사이지만 부사, 동명사, 동사도 감정을 나타낼 수 있기 때문에 형용사, 부사, 동명사, 동사 단어를 뽑아보자.

#### 1-1 SentenceId 정리 -> 감정단어 추출 -> 감정단어로만 모델 형성 -> 교차 검증으로 모델 퍼포먼스 평가

In [7]:
def using_pos_tag(frame):
    total_list = []
    frame_length = len(frame)
    split_phrase = frame['Split_Phrase']
    frame['Feeling_words'] = np.array('nan', dtype=object)
    for i in range(frame_length):
        each_row = pos_tag(split_phrase[i])
        adj_list = []
        for j, _ in enumerate(each_row):
            if each_row[j][1] == 'JJ' or each_row[j][1] == 'RB' or each_row[j][1] == 'VBG' or each_row[j][1] == 'VB':
                adj_list.append((each_row[j][0]))
        total_list.append(adj_list)
    frame['Feeling_words'] = total_list
        
    return frame

#case1
train_df_finished1 = using_pos_tag(train_df_filtered)
#case2
train_df_finished2 = using_pos_tag(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [8]:
train_df_finished1['Feeling_words'] = train_df_finished1['Feeling_words'].map(':'.join)
train_df_finished1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase,Feeling_words
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...",demonstrating:good:also:good:occasionally:much
1,64,2,"This quiet , introspective and entertaining in...",4,"[this, quiet, ,, introspective, and, entertain...",quiet:introspective:entertaining:independent:w...
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[even, fans, of, ismail, merchant, 's, work, ,...",even:ismail:i:have:hard:sitting
3,117,4,A positively thrilling combination of ethnogra...,3,"[a, positively, thrilling, combination, of, et...",positively:thrilling:shakespearean
4,157,5,Aggressive self-glorification and a manipulati...,1,"[aggressive, self-glorification, and, a, manip...",aggressive:manipulative


In [9]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens, lemmatizer):
    lemmatizing = [lemmatizer.lemmatize(senti_word) for senti_word in tokens]
    return lemmatizing

def tokenizer(words):
    filter_words = re.sub(r'[^a-zA-Z]', " ", words)
    tokens = nltk.word_tokenize(filter_words)
    lemmas = lemmatize_tokens(tokens, lemmatizer)
    return lemmas

In [12]:
tfidf = TfidfVectorizer(analyzer='word', tokenizer=tokenizer)
adjword_corpus = tfidf.fit_transform(train_df_finished1.Feeling_words.tolist()).todense()
adjword_corpus.shape

(8529L, 6767L)

In [13]:
X_features = adjword_corpus
target = train_df_finished1['Sentiment']
x_train, x_test, y_train, y_test = train_test_split(X_features, target, test_size=0.35)
x_train.shape

(5543L, 6767L)

In [12]:
# corpus가 어떻게 이루어져 있는가 ?
print tfidf.vocabulary_.keys()[:200]

[u'unimaginative', u'miyazaki', u'pardon', u'limited', u'dynamic', u'ryoko', u'yellow', u'four', u'asian', u'hanging', u'conjuring', u'comically', u'bestowing', u'marching', u'increase', u'buddy', u'unanswered', u'superficially', u'recycle', u'pamela', u'immature', u'sway', u'evangelical', u'updated', u'comparatively', u'rescue', u'void', u'smack', u'foul', u'unsuspecting', u'screaming', u'courageous', u'picaresque', u'disturb', u'grueling', u'unexceptional', u'called', u'wooden', u'brimming', u'peerlessly', u'frozen', u'unrelated', u'wending', u'stereotypical', u'howlingly', u'bordering', u'heading', u'winded', u'force', u'tired', u'miller', u'japanese', u'holocaust', u'elegant', u'second', u'valiant', u'shrugging', u'sterile', u'pander', u'hilariously', u'gawky', u'hate', u'hazy', u'new', u'conjure', u'ever', u'succumb', u'niche', u'hero', u'avert', u'intentioned', u'mel', u'men', u'disposable', u'weirdly', u'here', u'reported', u'ivy', u'entire', u'nosedive', u'shriek', u'interpret'

* 다양한 모델로 학습을 시켜보았다. 하지만 Cross-Validation(교차 검증)을 했을 때 모든 모델에서의 Performance는 좋지 않다. 그나마 이 중에서도 LogisticRegression, Multinomial Naive Bayes, Linear Support Vecter Machine 방법이 퍼포먼스가 좋게 나왔다. 


* 어느정도 예상은 했지만 한문장에서 모든 단어들을 학습시킨 것이 아니라, 형용사, 동명사, 동사, 부사 단어들만 학습시켰기 때문에 만약 어떤 문장에서 이런 품사가 없는 경우는 학습시킬 단어들이 없기 때문에 학습을 정확히 시킬 수 없다. 그래서 다음 실험은 감정 단어들만 따로 뽑지 않은 경우에 전체 한 문장을 학습시킬 것이다.

In [13]:
mnb = MultinomialNB()
gnb = GaussianNB()
lr = LogisticRegression()
rfc = RandomForestClassifier()
efc = ExtraTreesClassifier()
linear_svm = SVC(kernel='linear', probability=True)
rbf_svm = SVC(kernel='rbf', probability=True)
sigmoid_svm = SVC(kernel='sigmoid', probability=True)

clf_list = [mnb, gnb, lr, rfc, efc, linear_svm, rbf_svm, sigmoid_svm]

for num, clf in enumerate(clf_list):
    scores = cross_val_score(clf, X_features.todense(), target, cv=3, n_jobs = -1)
    print 'Model{} score :'.format(num+1), scores

Model1 score : [ 0.35254833  0.34716848  0.34670891]
Model2 score : [ 0.23479789  0.23496307  0.23160859]
Model3 score : [ 0.343058    0.3506859   0.35093277]
Model4 score : [ 0.31493849  0.29546254  0.30271031]
Model5 score : [ 0.32302285  0.30918044  0.3093981 ]
Model6 score : [ 0.34235501  0.3524446   0.34248504]
Model7 score : [ 0.27205624  0.27224763  0.27208729]
Model8 score : [ 0.27205624  0.27224763  0.27208729]


#### 1-2 SentenceId 정리 -> 감정단어만 따로 쓰지 않고 전체 한 문장에 쓰인 단어들로 적용.

In [14]:
train_df_finished1.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase,Feeling_words
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,...",demonstrating:good:also:good:occasionally:much
1,64,2,"This quiet , introspective and entertaining in...",4,"[this, quiet, ,, introspective, and, entertain...",quiet:introspective:entertaining:independent:w...
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1,"[even, fans, of, ismail, merchant, 's, work, ,...",even:ismail:i:have:hard:sitting
3,117,4,A positively thrilling combination of ethnogra...,3,"[a, positively, thrilling, combination, of, et...",positively:thrilling:shakespearean
4,157,5,Aggressive self-glorification and a manipulati...,1,"[aggressive, self-glorification, and, a, manip...",aggressive:manipulative


In [15]:
train_df_finished1['Split_Phrase'] = train_df_finished1['Split_Phrase'].map(':'.join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
tfidf = TfidfVectorizer(analyzer='word', tokenizer=tokenizer)
features = tfidf.fit_transform(train_df_finished1.Split_Phrase.tolist()).todense()
features.shape

(8529L, 13677L)

In [57]:
print tfidf.vocabulary_.keys()[:200]

[u'unimaginative', u'ryoko', u'yellow', u'four', u'clotted', u'hanging', u'conjuring', u'woody', u'comically', u'shamble', u'payoff', u'crooned', u'hennings', u'pardon', u'wizardry', u'originality', u'superficially', u'stevenon', u'xtc', u'lore', u'lord', u'immature', u'digit', u'delf', u'bile', u'foul', u'screaming', u'picaresque', u'disturb', u'scholar', u'wooden', u'voyeur', u'wednesday', u'peerlessly', u'standout', u'succession', u'stereotypical', u'howlingly', u'straight', u'fritter', u'entertained', u'sturm', u'tired', u'miller', u'bacon', u'pulse', u'elegant', u'second', u'crisply', u'valiant', u'shrugging', u'distinctly', u'sterile', u'admire', u'deferred', u'ruthless', u'contributed', u'schmuck', u'videodrome', u'negated', u'marching', u'groupie', u'succumb', u'submerged', u'widget', u'hero', u'avert', u'reporter', u'intentioned', u'divertissement', u'error', u'here', u'reported', u'china', u'cult', u'shriek', u'natured', u'transfixes', u'substance', u'uplifting', u'k', u'cont

* 감정단어들만 따로 추려서 모델링 했을 때 보다는 조금 나아졌지만 여전히 모든 모델에서의 퍼포먼스는 좋지 않다. 그나마 이 중에서 LogisticRegression, Linear Support Vector Machine 방법으로 Cross-Validation(교차 검증)을 했을 때 퍼포먼스가 좋게 나왔다.

* 퍼포먼스가 좋지 않게 나오는 이유가 무엇일까 생각해보았다.
    * 1**.전처리가 제대로 되지 않은 것 같다.**
    
        * 1-1. Stopwords 생성 X 
        
            * Tokenizer 함수를 만들어서 단어가 아닌 것들을 제외했고, WordnetLemmatizer로 토큰들을 Lemmatizing 하면서 원형으로 만들었다. 하지만 이 중에서도 분명히 감정 분석을 하는 데 있어서 쓸모 없는 단어들이 분명히 있을 것이다. 만약 쓸모없는 단어들을 제거하는 stopwords를 만들어 주면 좀 더 정확한 분류를 할 수 있지 않을까?
            
        * 1-2. SentenceId 정리
        
            * SentenceId를 정리하면서 데이터의 수가 156060개에서 8529개로 확 줄었다. 그로 인해 데이터의 부족으로 인해서 학습하는 데 있어서 충분히 학습이 되지 않아서일까? 만약에 SentenceId를 정리 하지 않고 156060개의 데이터를 모두 활용하면 Sentiment(0~4)를 좀 더 좋은 퍼포먼스로 분류할 수 있을까?
            
    * 2**.코퍼스 구축**
    
        * 2-1. 기존 코퍼스의 Quality 부족
        
            * 기존 코퍼스의 질이 좋지 않기 때문일까? 만약에 기존 코퍼스에 부족한 부분을 채워줄 수 있는 단어들을 추가시켜 준다면 코퍼스의 Quality가 좋아 질 것이고 이것이 Performance를 좀 더 좋게 해주지 않을까?

In [58]:
mnb = MultinomialNB()
gnb = GaussianNB()
lr = LogisticRegression()
rfc = RandomForestClassifier()
efc = ExtraTreesClassifier()
linear_svm = SVC(kernel='linear', probability=True)
rbf_svm = SVC(kernel='rbf', probability=True)
sigmoid_svm = SVC(kernel='sigmoid', probability=True)

clf_list = [mnb, gnb, lr, rfc, efc, linear_svm, rbf_svm, sigmoid_svm]

for num, clf in enumerate(clf_list):
    scores = cross_val_score(clf, features, target, cv=3, n_jobs = -1)
    print 'Model{} score :'.format(num+1), scores

Model1 score : [ 0.37680141  0.38832219  0.3843717 ]
Model2 score : [ 0.29560633  0.27611678  0.28616684]
Model3 score : [ 0.39472759  0.40626099  0.39528335]
Model4 score : [ 0.30123023  0.30953218  0.31678986]
Model5 score : [ 0.33743409  0.34189237  0.34037311]
Model6 score : [ 0.38453427  0.4037988   0.40091517]
Model7 score : [ 0.27205624  0.27224763  0.27208729]
Model8 score : [ 0.27205624  0.27224763  0.27208729]


* 감정 단어만 사용 했을 때보다 모든 단어들의 정보를 전부 사용했을 때 모델 퍼포먼스가 그나마 좋게 나왔기 때문에 모든 정보를 사용할 것이다.

In [9]:
del train_df_finished2['Feeling_words']

In [10]:
train_df_finished2.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
2,3,1,A series,2,"[a, series]"
3,4,1,A,2,[a]
4,5,1,series,2,[series]


In [11]:
#find_stop_words = train_df_finished2.Split_Phrase.apply(lambda x : ' '.join(x)).to_csv('find_stopwords2.csv')

In [12]:
train_df_finished2.ix[[37]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Split_Phrase
37,38,1,",",2,"[,]"


In [13]:
def get_stop_words(file = '.\stopwords.txt'):
    with open(file, 'r') as f:
        stopword = f.read().split(',')
    return stopword

stopwords = get_stop_words()

In [14]:
import graphlab as gl

In [16]:
traindata_sf = gl.SFrame(train_df_finished2)

This non-commercial license of GraphLab Create for academic use is assigned to jyh0674@gmail.com and will expire on November 16, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\YOUNGH~1\AppData\Local\Temp\graphlab_server_1495984997.log.0


In [17]:
encoder = gl.feature_engineering.TFIDF(excluded_features=stopwords)
features = encoder.fit_transform(traindata_sf)
model = gl.classifier.create(features, target='Sentiment', features=['Split_Phrase'])

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.541834175587
PROGRESS: RandomForestClassifier          : 0.536557793617
PROGRESS: DecisionTreeClassifier          : 0.533668339252
PROGRESS: LogisticClassifier              : 0.633794
PROGRESS: ---------------------------------------------
PROGRESS: Selecting LogisticClassifier based on validation set performance.


In [None]:
params = dict([('target', 'Sentiment'),
               ('l2_penalty', [0, 0.01, 0.1, 0.5, 1, 5, 10]),
               ('l1_penalty', [0, 0.01, 0.1, 0.5, 1, 5, 10]),
               ('lbfgs_memory_level', [10, 15, 20, 30])])
                
find_param = gl.grid_search.create(features, 
                                   gl.logistic_classifier.create, params)
find_param.get_results()

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-28-2017-01-05-0300000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-May-28-2017-01-05-0300000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-May-28-2017-01-05-0300000' already exists. Renaming the job to 'Model-Parameter-Search-May-28-2017-01-05-0300000-59964'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-28-2017-01-05-0300000-59964' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-May-28-2017-01-05-0300000-59964' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-28-2017-01-05-0300001' ready for execution


model_id,l1_penalty,l2_penalty,lbfgs_memory_level,target,training_accuracy
11,1.0,0.01,10,Sentiment,0.688786364219
10,0.5,0.01,10,Sentiment,0.688799179803
39,1.0,5.0,10,Sentiment,0.688799179803
38,0.5,5.0,10,Sentiment,0.688799179803
15,0.01,0.1,10,Sentiment,0.688831218762
14,0.0,0.1,10,Sentiment,0.713558887607
17,0.5,0.1,10,Sentiment,0.688799179803
16,0.1,0.1,10,Sentiment,0.688837626554
33,5.0,1.0,10,Sentiment,0.688581314879
32,1.0,1.0,10,Sentiment,0.688786364219


In [None]:
find_param.get_best_params()

{'l1_penalty': 0,
 'l2_penalty': 5,
 'lbfgs_memory_level': 15,
 'target': 'Sentiment'}

In [26]:
model = gl.logistic_classifier.create(features, target='Sentiment', l1_penalty=0, l2_penalty=5, features=['Split_Phrase'], 
                                      max_iterations=25, lbfgs_memory_level=15)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [27]:
model.save('sentiment_model')

In [25]:
params = dict([('target', 'Sentiment'),
               ('l2_penalty', [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
               ('l1_penalty', [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
               ('lbfgs_memory_level', [1,10,20])])
find_param2 = gl.grid_search.create(features, 
                                   gl.logistic_classifier.create, params)
find_param2.get_best_params()

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-26-2017-16-56-3100000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-May-26-2017-16-56-3100000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-May-26-2017-16-56-3100000' already exists. Renaming the job to 'Model-Parameter-Search-May-26-2017-16-56-3100000-fc85f'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-26-2017-16-56-3100000-fc85f' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-May-26-2017-16-56-3100000-fc85f' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-May-26-2017-16-56-3100001' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-May-26-2017-16-56

{'l1_penalty': 0,
 'l2_penalty': 0.4,
 'lbfgs_memory_level': 10,
 'target': 'Sentiment'}

In [45]:
model2 = gl.logistic_classifier.create(features, target='Sentiment', l1_penalty=0, l2_penalty=0.4, features=['Split_Phrase'], 
                                       lbfgs_memory_level=10, max_iterations = 25)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [48]:
test_features = encoder.transform(test_sf)
test_result = model2.evaluate(test_features)
test_result['confusion_matrix'].print_rows(num_rows=30)
print 'Test Acuuracy', test_result['accuracy']

+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      1       |        3        |  364  |
|      0       |        2        |  284  |
|      1       |        1        |  3850 |
|      4       |        1        |   30  |
|      1       |        0        |  740  |
|      4       |        0        |   2   |
|      3       |        4        |  1098 |
|      3       |        3        |  4993 |
|      2       |        0        |  262  |
|      4       |        4        |  1158 |
|      1       |        2        |  3215 |
|      3       |        1        |  357  |
|      3       |        0        |   34  |
|      4       |        2        |  236  |
|      4       |        3        |  1304 |
|      0       |        3        |   31  |
|      1       |        4        |   41  |
|      2       |        1        |  2475 |
|      3       |        2        |  3407 |
|      0       |        4        |   12  |
|      2   

In [38]:
model2.save('Second_Model')