In [1]:
import time
import nltk
import pandas as pd
from stop_words import get_stop_words
import multiprocessing as mp
import re
from tqdm import tqdm

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = get_stop_words('en') 

[nltk_data] Downloading package punkt to C:\Users\Drew
[nltk_data]     Meseck\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Drew Meseck\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_csv('data/spam_review_data.csv', engine= 'python')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,polarity,truthful
0,0,"After recent week stay at the Affinia Hotels, ...",pos,dec
1,1,"Although much too overpriced in my opinion, th...",pos,dec
2,2,The Affinia hotel in Chicago was superb. the r...,pos,dec
3,3,THIS HOTEL IS FANTASTIC. I stayed there on my ...,pos,dec
4,4,The Affinia Chicago is a wonderful place to st...,pos,dec


In [4]:
def tokenize_data(text):
    text = re.sub('[!@#$%^&*-+=_]', '', text)
    text = nltk.word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [word for word in text if len(word) > 2]
    #pbar.update(1)
    return nltk.pos_tag(text)
    

In [5]:
sample_text = df.text[0]
type(tokenize_data(sample_text)[0])

tuple

In [6]:
def tknz(col):
    return [tokenize_data(row) for row in col]


In [7]:
df['tokens'] = tknz(df.text)

In [8]:
df.to_csv("data/train.csv")

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,polarity,truthful,tokens
0,0,"After recent week stay at the Affinia Hotels, ...",pos,dec,"[(After, IN), (recent, JJ), (week, NN), (stay,..."
1,1,"Although much too overpriced in my opinion, th...",pos,dec,"[(Although, IN), (much, RB), (overpriced, VBN)..."
2,2,The Affinia hotel in Chicago was superb. the r...,pos,dec,"[(The, DT), (Affinia, NNP), (hotel, NN), (Chic..."
3,3,THIS HOTEL IS FANTASTIC. I stayed there on my ...,pos,dec,"[(THIS, NNP), (HOTEL, NNP), (FANTASTIC, NNP), ..."
4,4,The Affinia Chicago is a wonderful place to st...,pos,dec,"[(The, DT), (Affinia, NNP), (Chicago, NNP), (w..."


In [26]:
def get_feature(token, token_index, sent, pos_lab):
    #returns different information about the token for characterization
    token_feature = {
        'token'             : token.lower(),                        #returns the lowercase text of the token itself
        'pos_lab'           : pos_lab,                               #Part of speech of the token
        'is_first'          : token_index == 0,                     #Is the token the first in the sentence
        'is_last'           : token_index == len(sent) - 1,         #Is the token the last in the sentence
        
        'is_capitalized'    : token[0].upper() == token[0],         #Is the first letter capitalized
        'is_all_caps'       : token.upper() == token,               #Is the whole word in caps
        'is_numeric'        : token.isdigit(),                      #are there any digits in the token?
        
        'prefix-1'          : token[0].lower(),                             #token prefix w/one letter
        'prefix-2'          : token[:1].lower(),                            #token prefix w/two letters
        'suffix-1'          : token[-1].lower(),                            #token suffix w/one letter
        'suffix-2'          : '' if len(token) < 2 else token[-2:], #token suffix w/two letters

        'prev-token'        : '' if token_index == 0 else sent[token_index - 1].lower(), #token immediately proceeding this token
        '2-prev-token'      : '' if token_index >= len(sent) - 2 else sent[token_index - 2][0].lower(), #token two preceeding this token
        'next-token'        : '' if token_index == len(sent) - 1 else sent[token_index + 1][0].lower(), #token after this token
        '2-next-token'      : '' if token_index >= len(sent) - 2 else sent[token_index + 2][0].lower(), #token 2 after this token
    }   
    return token_feature
    
def form_data(reviews):
    final_feat = []
    for rev in reviews:
        features = []
        for token_index, token_pair in enumerate(rev):
            sentence = [i[0] for i in rev]
            token, pos_lab = token_pair
            features.append(get_feature(token, token_index, sentence, pos_lab))
        final_feat.append(features)
    return final_feat
            

In [28]:
X = form_data(df.tokens)

In [29]:
X[1]

[{'token': 'although',
  'pos_lab': 'IN',
  'is_first': True,
  'is_last': False,
  'is_capitalized': True,
  'is_all_caps': False,
  'is_numeric': False,
  'prefix-1': 'a',
  'prefix-2': 'a',
  'suffix-1': 'h',
  'suffix-2': 'gh',
  'prev-token': '',
  '2-prev-token': 'b',
  'next-token': 'm',
  '2-next-token': 'o'},
 {'token': 'much',
  'pos_lab': 'RB',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_numeric': False,
  'prefix-1': 'm',
  'prefix-2': 'm',
  'suffix-1': 'h',
  'suffix-2': 'ch',
  'prev-token': 'although',
  '2-prev-token': 'a',
  'next-token': 'o',
  '2-next-token': 'o'},
 {'token': 'overpriced',
  'pos_lab': 'VBN',
  'is_first': False,
  'is_last': False,
  'is_capitalized': False,
  'is_all_caps': False,
  'is_numeric': False,
  'prefix-1': 'o',
  'prefix-2': 'o',
  'suffix-1': 'd',
  'suffix-2': 'ed',
  'prev-token': 'much',
  '2-prev-token': 'a',
  'next-token': 'o',
  '2-next-token': 'h'},
 {'token': 'opinion',
  '