# TextEmbedding and Logistic
In this demo, we walk through a sample project that shows how to build a project by applying the **`TextEmbedding`** and **`Logistic`** templates. In the `amazon_reviews` project, we try to determine each customer's sentiment from the content of their review.

In [1]:
import esppy
# Create the ESP client object for the server at this URL. The output recorded
# below this cell shows the host refusing the connection, so point the URL at
# a server that is reachable from your environment before running the demo.
esp = esppy.ESP('http://lax95d01.unx.sas.com:40012')

b'<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html><head> <meta http-equiv="Content-Type" CONTENT="text/html; charset=utf-8"> <title>ERROR: The requested URL could not be retrieved</title> <style type="text/css"><!--     body :lang(fa) { direction: rtl; font-size: 100%; font-family: Tahoma, Roya, sans-serif; float: right; } :lang(he) { direction: rtl; }  --></style> </head><body id=ERR_CONNECT_FAIL> <div id="titles"> <h1>ERROR</h1> <h2>The requested URL could not be retrieved</h2> </div> <hr>  <div id="content"> <p>The following error was encountered while trying to retrieve the URL: <a href="http://lax95d01.unx.sas.com:40012/SASESP/server?config=true">http://lax95d01.unx.sas.com:40012/SASESP/server?config=true</a></p>  <blockquote id="error"> <p><b>Connection to 10.24.7.194 failed.</b></p> </blockquote>  <p id="sysmsg">The system returned: <i>(111) Connection refused</i></p>  <p>The remote host or network may be down. Please try the requ

ParseError: mismatched tag: line 1, column 417 (<string>)

### Step 1 - Data preprocessing

In [None]:
import pandas
from sklearn.utils import shuffle

# Both CSV files are header-less and share the same column layout.
review_columns = ["id", "title", "content", "rank", "sentiment"]

train_data = pandas.read_csv('reviews_train_5000.csv', header=None,
                             names=review_columns)
score_data = pandas.read_csv('reviews_test_1000.csv', header=None,
                             names=review_columns)

seed = 1234
n_samples = 5000
train_ratio = 0.5

# Number of rows to draw from each sentiment class.
n_pos = int(n_samples * train_ratio)
n_neg = int(n_samples * (1 - train_ratio))

is_positive = train_data['sentiment'] == 1.0
is_negative = train_data['sentiment'] == 0.0

train_data_sample_pos = train_data.loc[is_positive].sample(n_pos, random_state=seed)

# The target variable (sentiment) is highly unbalanced, so the negative class
# is resampled with replacement to reach its share of the sample.
train_data_sample_neg = train_data.loc[is_negative].sample(n_neg, replace=True,
                                                           random_state=seed)

# Combine both classes, then shuffle so they are interleaved in the stream.
train_data_sample = pandas.concat([train_data_sample_pos, train_data_sample_neg])
train_data_sample_shuffled = shuffle(train_data_sample, random_state=seed)

In [None]:
# Preview the first few rows of the resampled, shuffled training data
train_data_sample_shuffled.head()

In [None]:
# Preview the first few rows of the scoring data read from reviews_test_1000.csv
score_data.head()

### Step 2 - Model Construction

In [None]:
# Create a project named 'amazon_reviews' through the ESP connection
# (it is loaded onto the server later with esp.load_project)


proj = esp.create_project('amazon_reviews')
# Scale used when rendering the project diagram -- 1 is presumably full size; TODO confirm
esppy.options.display.image_scale = 1
# Bare expression: the notebook renders the project as the cell output
proj

### Step 3: Training Data Stream

In [None]:
# Define the source window for the training stream. The schema carries the
# event id, the review text and the sentiment label; the '*' on id presumably
# marks it as the key field -- TODO confirm against the ESP schema syntax.

src = esp.SourceWindow(schema=('id*:int64', 'content:string', 'sentiment:string'),
                       index_type='empty', insert_only=True, autogen_key=True)


# Initialize a TextEmbedding template named 't1'

t1 = esp.Template.TextEmbedding('t1')

# Add the edges between windows: src feeds the template as a whole and also
# the template's 'w_join' window directly, both with role='data'

src.add_target(t1, role='data')
src.add_target(t1.windows['w_join'], role='data')

# Register the source window in the project as 'w_data_t', then add the template
proj.windows['w_data_t'] = src
proj.add_template(t1)
# Bare expression: the notebook renders the updated project as the cell output
proj

### Step 4: Validation Data Stream

In [None]:
# Define the source window for the validation stream; it uses the same schema
# and options as the training source window

src2 = esp.SourceWindow(schema=('id*:int64', 'content:string', 'sentiment:string'),
                       index_type='empty', insert_only=True, autogen_key=True)


# Make a copy of t1 named 't2' (deep=True, internal_only=True), presumably so
# the validation stream runs through its own set of template windows -- TODO confirm

t2 = t1.copy('t2', deep=True, internal_only=True)

# Add the edges between windows: src2 feeds the copied template as a whole and
# also its 'w_join' window directly, both with role='data'

src2.add_target(t2, role='data')
src2.add_target(t2.windows['w_join'], role='data')

# Register the source window in the project as 'w_data_v', then add the template
proj.windows['w_data_v'] = src2
proj.add_template(t2)
# Bare expression: the notebook renders the updated project as the cell output
proj

### Step 5: Streaming Logistic Regression

In [None]:
# Initialize a Logistic template named 't3'

t3 = esp.Template.Logistic('t3')

# Add connections between the corresponding windows: t1 (training stream)
# feeds t3 as a whole, while t2 (validation stream) feeds t3's
# 'w_score_logis' window directly

t1.add_target(t3, role='data')
t2.add_target(t3.windows['w_score_logis'], role='data')
proj.add_template(t3)
# Lower the diagram scale from 1 to 0.65 -- presumably so the larger project fits; TODO confirm
esppy.options.display.image_scale = 0.65
# Bare expression: the notebook renders the updated project as the cell output
proj

### Step 6: Online Model Measure

In [None]:
# Compute window that reshapes the logistic template's output into the
# label and per-class probability fields passed on to the FitStat window.
comp_logis = esp.ComputeWindow(
    "w_comp_logis",
    schema=['id*:int64', 'sentiment:string',
            'predicted_y:double', 'p_1:double', 'p_0:double'],
)

# predicted_y is actually the predicted P(sentiment = 1), so it is reused for
# p_1 and its complement is used for p_0. The expressions are added in the
# same order as the four non-key schema fields (presumably matched by position).
for field_expr in ("tostring(tointeger(sentiment))",
                   "predicted_y",
                   "predicted_y",
                   "1-predicted_y"):
    comp_logis.add_field_expression(field_expr)

# FitStat window configured for class labels '0,1' with windowLength=200;
# it reads the two probability fields plus the label and outputs mceOut.
fitstat_logis = esp.calculate.FitStat(
    schema=('id*:int64', 'mceOut:double'),
    classLabels='0,1',
    windowLength=200,
)
fitstat_logis.set_inputs(inputs=('p_0:double', 'p_1:double'),
                         response='sentiment:string')
fitstat_logis.set_outputs(mceOut='mceOut:double')

# Register both windows in the project, then chain t3 -> comp_logis -> fitstat_logis.
proj.windows['w_comp_logis'] = comp_logis
proj.windows['w_fitstat_logis'] = fitstat_logis
t3.add_target(comp_logis, role='data')
comp_logis.add_target(fitstat_logis, role='data')
proj

### Step 7:  Data Streaming and Processing

In [None]:
# Load the assembled project onto the ESP server

esp.load_project(proj)

In [None]:
# Subscribe to the FitStat window before publishing any events
# (presumably so its output is collected on the client side -- TODO confirm)
fitstat_logis.subscribe()

# Stream the training sample into src and the scoring data into src2.
# The two calls use different `pause` values (15 vs 100); the unit is not
# shown here -- check the publish_events documentation.
src.publish_events(train_data_sample_shuffled, pause=15)
src2.publish_events(score_data, pause=100)

In [None]:
# Create a streaming line chart that plots mceOut (mean consequential error)
# against id and updates in real time as new events arrive.
fitstat_logis.streaming_line('id', ['mceOut'], steps=1000.0, interval=100, max_data=50, y_range=[0,0.8])

In [None]:
# Bare expression: the notebook renders the FitStat window as the cell output
fitstat_logis

### Step 8:  Clean Up

In [None]:
# Stop the subscription on the FitStat window
fitstat_logis.unsubscribe()

# Delete the 'amazon_reviews' project through the ESP connection
esp.delete_project(name='amazon_reviews')

In [None]:
# Bare expression: the notebook renders the client-side project object as the cell output
proj

In [None]:
# Save the project to the file 'amazon.xml' (XML, judging by the method name)
proj.save_xml('amazon.xml')