In [2]:
import ast

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC

In [5]:
# Load the training reviews into a DataFrame.
# NOTE(review): the preview of this frame shows three 'Unnamed: 0*' columns; they look
# like index columns written out by earlier to_csv calls -- confirm and consider dropping them.
df = pd.read_csv("model/data/train.csv")

In [6]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,polarity,truthful,tokens
0,0,0,"After recent week stay at the Affinia Hotels, ...",pos,dec,"[('After', 'IN'), ('recent', 'JJ'), ('week', '..."
1,1,1,"Although much too overpriced in my opinion, th...",pos,dec,"[('Although', 'IN'), ('much', 'RB'), ('overpri..."
2,2,2,The Affinia hotel in Chicago was superb. the r...,pos,dec,"[('The', 'DT'), ('Affinia', 'NNP'), ('hotel', ..."
3,3,3,THIS HOTEL IS FANTASTIC. I stayed there on my ...,pos,dec,"[('THIS', 'NNP'), ('HOTEL', 'NNP'), ('FANTASTI..."
4,4,4,The Affinia Chicago is a wonderful place to st...,pos,dec,"[('The', 'DT'), ('Affinia', 'NNP'), ('Chicago'..."


In [3]:
def _token_text(item):
    """Return the token string of a sentence element (a str or a (token, pos) pair)."""
    return item[0] if isinstance(item, (tuple, list)) else item


def get_feature(token, token_index, sent, pos_lab):
    """Build the feature dictionary that characterizes one token in its context.

    Parameters
    ----------
    token : str
        Text of the token being described.
    token_index : int
        Zero-based position of ``token`` inside ``sent``.
    sent : sequence
        The tokens surrounding ``token``. Elements may be plain strings or
        ``(token, pos_tag)`` pairs; for pairs only the token text is used.
    pos_lab : str
        Part-of-speech label of ``token``; stored unchanged.

    Returns
    -------
    dict
        Maps feature names to str/bool values. Context features that would
        fall outside the sentence are the empty string.
    """
    last_index = len(sent) - 1

    def neighbor(offset):
        # Lowercased token `offset` positions away, or '' when that position is
        # outside the sentence. The explicit lower bound matters: a negative
        # index would otherwise silently wrap around to the end of the sentence.
        position = token_index + offset
        if 0 <= position <= last_index:
            return _token_text(sent[position]).lower()
        return ''

    # Empty tokens have no first/last character; treat them as "not capitalized".
    has_text = bool(token)

    token_feature = {
        'token'             : token.lower(),                        #lowercase text of the token itself
        'pos_lab'           : pos_lab,                              #part of speech of the token
        'is_first'          : token_index == 0,                     #is the token the first in the sentence
        'is_last'           : token_index == last_index,            #is the token the last in the sentence

        'is_capitalized'    : has_text and token[0].upper() == token[0],  #first character is unchanged by upper()
        'is_all_caps'       : has_text and token.upper() == token,        #whole token is unchanged by upper()
        'is_numeric'        : token.isdigit(),                      #True only when every character is a digit

        'prefix-1'          : token[:1].lower(),                    #token prefix w/one letter ('' for an empty token)
        'prefix-2'          : token[:2].lower(),                    #token prefix w/two letters
        'suffix-1'          : token[-1:].lower(),                   #token suffix w/one letter ('' for an empty token)
        'suffix-2'          : '' if len(token) < 2 else token[-2:].lower(), #token suffix w/two letters

        'prev-token'        : neighbor(-1),                         #token immediately preceding this token
        '2-prev-token'      : neighbor(-2),                         #token two before this token
        'next-token'        : neighbor(1),                          #token after this token
        '2-next-token'      : neighbor(2),                          #token two after this token
    }
    return token_feature

In [4]:
def form_data(reviews):
    """Flatten tagged reviews into one feature dictionary per token.

    Parameters
    ----------
    reviews : iterable
        Each review is a sequence of ``(token, pos_tag)`` pairs, or the string
        form of such a list (how the ``tokens`` column reads back from CSV),
        which is parsed with ``ast.literal_eval``.

    Returns
    -------
    list of dict
        The ``get_feature`` result for every token, in review order and then
        token order.
    """
    features = []
    for rev in reviews:
        # A review read back from CSV is text, not a list; literal_eval only
        # accepts Python literals, so it is safe to use on file contents.
        pairs = ast.literal_eval(rev) if isinstance(rev, str) else rev
        # get_feature lowercases neighbouring sentence elements, so give it the
        # bare token strings rather than the (token, tag) pairs.
        tokens = [token for token, _ in pairs]
        for token_index, (token, pos_lab) in enumerate(pairs):
            features.append(get_feature(token, token_index, tokens, pos_lab))
    return features
            

In [17]:
# Inspect the tokens of the first review. The displayed value is a quoted string, i.e. the
# column holds the text form of a list of (token, POS tag) pairs rather than a real list.
df.tokens[0]
# TODO: `reviews` is not defined in the cells shown here; build it from df.tokens first.
# vecx = form_data(reviews)

"[('After', 'IN'), ('recent', 'JJ'), ('week', 'NN'), ('stay', 'VBP'), ('Affinia', 'NNP'), ('Hotels', 'NNP'), ('can', 'MD'), ('definitely', 'RB'), ('say', 'VB'), ('will', 'MD'), ('coming', 'VBG'), ('back', 'RB'), ('They', 'PRP'), ('offer', 'VBP'), ('many', 'JJ'), ('room', 'NN'), ('amenities', 'NNS'), ('services', 'NNS'), ('Just', 'NNP'), ('comfortable', 'JJ'), ('relaxed', 'JJ'), ('place', 'NN'), ('enjoyable', 'JJ'), ('experience', 'NN'), ('Affinia', 'NNP'), ('Hotel', 'NNP'), ('amazing', 'VBG'), ('customization', 'NN'), ('offered', 'VBD'), ('recommend', 'JJ'), ('Affinia', 'NNP'), ('hotels', 'NNS'), ('anyone', 'NN'), ('looking', 'VBG'), ('nice', 'JJ'), ('place', 'NN'), ('stay', 'NN')]"