In [1]:
import pandas as pd
import numpy as np
import graphlab as gl

In [2]:
train_df = pd.read_csv('train.tsv', sep='\t')
harvard_corpus = pd.read_csv('harvard_corpus.csv')

In [3]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
harvard_corpus.head()

Unnamed: 0,Phrase,Positive,Negative,Strong,Power,Sentiment
0,a,0,0,0,0,2
1,abandon,0,1,0,0,1
2,abandonment,0,1,0,0,1
3,abate,0,1,0,0,1
4,abatement,0,0,0,0,2


In [5]:
using_column_to_use = ['Phrase', 'Sentiment']
upgrade_train_df = train_df[using_column_to_use].append(harvard_corpus[using_column_to_use])
upgrade_train_df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [15]:
upgrade_train_df['Split_Phrase'] = [[j.lower() for j in x.split()] for x in np.array(upgrade_train_df['Phrase'])]
upgrade_train_df.head()

Unnamed: 0,Phrase,Sentiment,Split_Phrase
0,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
2,A series,2,"[a, series]"
3,A,2,[a]
4,series,2,[series]


In [16]:
def get_stop_words(file = '.\stopwords.txt'):
    with open(file, 'r') as f:
        stopword = f.read().split(',')
    return stopword

stopwords = get_stop_words()

In [17]:
train_sf = gl.SFrame(upgrade_train_df)
train_sf.head()

This non-commercial license of GraphLab Create for academic use is assigned to jyh0674@gmail.com and will expire on November 16, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\YOUNGH~1\AppData\Local\Temp\graphlab_server_1496243256.log.0


Phrase,Sentiment,Split_Phrase
A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrat ..."
A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrat ..."
A series,2,"[a, series]"
A,2,[a]
series,2,[series]
of escapades demonstrating the adage ...,2,"[of, escapades, demonstrating, the, ..."
of,2,[of]
escapades demonstrating the adage that what is ...,2,"[escapades, demonstrating, the, ..."
escapades,2,[escapades]
demonstrating the adage that what is good for ...,2,"[demonstrating, the, adage, that, what, is, ..."


In [23]:
encoder = gl.feature_engineering.TFIDF(excluded_features=stopwords)
corpus = encoder.fit_transform(train_sf)
model = gl.classifier.create(corpus, target='Sentiment', features=['Split_Phrase'])

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: The following methods are available for this type of problem.
PROGRESS: BoostedTreesClassifier, RandomForestClassifier, DecisionTreeClassifier, LogisticClassifier
PROGRESS: The returned model will be chosen according to validation accuracy.


PROGRESS: Model selection based on validation accuracy:
PROGRESS: ---------------------------------------------
PROGRESS: BoostedTreesClassifier          : 0.548057734966
PROGRESS: RandomForestClassifier          : 0.534326970577
PROGRESS: DecisionTreeClassifier          : 0.538434445858
PROGRESS: LogisticClassifier              : 0.627274
PROGRESS: ---------------------------------------------
PROGRESS: Selecting LogisticClassifier based on validation set performance.


In [24]:
params = dict([('target', 'Sentiment'),
               ('l2_penalty', [0, 0.01, 0.1, 0.5, 1, 5, 10]),
               ('l1_penalty', [0, 0.01, 0.1, 0.5, 1, 5, 10]),
               ('lbfgs_memory_level', [15, 20, 30])])
                
find_param = gl.grid_search.create(corpus, 
                                   gl.logistic_classifier.create, params)
find_param.get_best_params()

[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-01-2017-00-20-0300000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Jun-01-2017-00-20-0300000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Jun-01-2017-00-20-0300000' already exists. Renaming the job to 'Model-Parameter-Search-Jun-01-2017-00-20-0300000-594ae'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-01-2017-00-20-0300000-594ae' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Jun-01-2017-00-20-0300000-594ae' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Jun-01-2017-00-20-0300001' ready for execution


{'l1_penalty': 0,
 'l2_penalty': 1,
 'lbfgs_memory_level': 20,
 'target': 'Sentiment'}

In [26]:
model = gl.logistic_classifier.create(corpus, target='Sentiment',features=['Split_Phrase'], 
                                      l1_penalty=0, l2_penalty=1, lbfgs_memory_level=20, max_iterations=20)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [27]:
model.save('corpus_model')