In [1]:
import pandas as pd
import numpy as np
import nltk
# Make sure the NLTK 'stopwords' corpus is available locally; it is read later via nltk.corpus.stopwords.
nltk.download('stopwords')
# pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deban\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read train.csv, which uses '~' as its field separator, keeping only the
# 'Description' and 'Is_Response' columns.
train_data=pd.read_csv("train.csv",sep="~",usecols=['Description','Is_Response'])
# Preview the first rows of the loaded frame.
train_data.head()

Unnamed: 0,Description,Is_Response
0,After reading mixed reviews I almost didn't bo...,Good
1,This motor inn is located about - city blocks ...,Good
2,It was our first time there and surely not our...,Good
3,"Great hotel in an excellent location, just off...",Good
4,We stayed at the hotel for - weeks to get away...,Good


In [3]:
# Convert each sentence to a list of lowercase words with punctuation and stopwords removed
from string import punctuation
from nltk.corpus import stopwords

def sentence_to_words(sentence, stop_words=None):
    """Lowercase a sentence, strip punctuation and return its non-stopword tokens.

    Args:
        sentence: The text to tokenise (a ``str``).
        stop_words: Optional iterable of words to drop. When ``None`` (the
            default) NLTK's English stopword list is used, which is what this
            function always did before the parameter existed.

    Returns:
        A list of the remaining words, in their original order.
    """
    if stop_words is None:
        # Fetch the stopword list once per call instead of once per token.
        stop_words = stopwords.words("english")
    # A frozenset gives constant-time membership tests in the filter below.
    stop_words = frozenset(stop_words)

    # Delete every character listed in string.punctuation.
    cleaned = sentence.lower().translate(str.maketrans('', '', punctuation))

    # str.split() with no argument splits on any run of whitespace, newlines
    # included, so no separate newline handling is required.
    return [word for word in cleaned.split() if word not in stop_words]

In [4]:
import pickle
def cleanup_data(data,cache_file="preprocessed_data.pkl"):
    """Tokenise every sentence in ``data``, using a pickle file as a cache.

    Args:
        data: Iterable of sentences; each one is passed to ``sentence_to_words``.
        cache_file: Path of the pickle cache. Pass ``None`` to disable both
            reading and writing of the cache.

    Returns:
        A list with one token list per input sentence. The cached value is
        returned when the cache can be read and (when ``data`` has a length)
        holds the same number of entries as ``data``; otherwise the sentences
        are re-processed and the cache is rewritten.
    """
    try:
        expected_len = len(data)
    except TypeError:
        # Inputs without a length (e.g. generators) cannot be checked against the cache.
        expected_len = None

    cache_data = None
    if cache_file is not None:
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only point
            # cache_file at files produced by this function.
            with open(cache_file, 'rb') as f:
                cache_data = pickle.load(f)
        except FileNotFoundError:
            # No cache yet: the normal first-run case.
            pass
        except Exception as exc:
            # Corrupt or unreadable cache: stay best-effort, but say so instead
            # of silently hiding the problem.
            print("Ignoring unreadable cache file: ", cache_file, exc)
            cache_data = None
        else:
            if cache_data is not None:
                stale = expected_len is not None and (
                    not hasattr(cache_data, '__len__') or len(cache_data) != expected_len
                )
                if stale:
                    # The cache was built from a different input; do not return it.
                    print("Cache does not match input length, rebuilding: ", cache_file)
                    cache_data = None
                else:
                    print("Read cache data")

    if cache_data is not None:
        return cache_data

    # Cache missing, disabled or rejected: do the actual preprocessing.
    words_train = [sentence_to_words(sentence) for sentence in data]
    if cache_file is not None:
        with open(cache_file, "wb") as f:
            pickle.dump(words_train, f)
        print("Wrote preprocessed data to: ", cache_file)
    return words_train
        

In [5]:
# Tokenise the review text with cleanup_data (using its default cache file) and
# store the result back in the Description column.
# NOTE(review): this overwrites the column in place, so the raw text is no longer
# held in train_data after this cell runs.
train_data.Description=cleanup_data(train_data.Description)

Read cache data


In [6]:
# train_data.Description

In [7]:
import numpy as np
from collections import Counter

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Args:
        data: Iterable of token lists (one list of words per review).
        vocab_size: Total vocabulary size, including the two reserved ids
            0 ('no word') and 1 ('infrequent'). At most ``vocab_size - 2``
            words therefore receive an id; values below 2 yield an empty dict.

    Returns:
        dict mapping word -> integer id, where the most frequent word gets 2,
        the next one 3, and so on.
    """
    # Count every word across all reviews.
    word_count = Counter(word for sentence in data for word in sentence)

    # Two ids are reserved, so only vocab_size - 2 words fit. Clamp at zero so
    # a small vocab_size cannot turn into a negative (from-the-end) bound.
    n_words = max(vocab_size - 2, 0)

    # Ids start at 2 to leave room for the 'no word' and 'infrequent' labels.
    return {
        word: idx + 2
        for idx, (word, _) in enumerate(word_count.most_common(n_words))
    }

In [8]:
# Build the word -> integer id mapping from the tokenised reviews, using build_dict's default vocab_size.
word_dict=build_dict(train_data.Description)

In [9]:
# Display the word -> id mapping.
# NOTE(review): this shows the entire dictionary; consider displaying only a small sample.
word_dict

{'hotel': 2,
 'room': 3,
 'stay': 4,
 'great': 5,
 'staff': 6,
 'would': 7,
 'rooms': 8,
 'good': 9,
 'one': 10,
 'location': 11,
 'nice': 12,
 'stayed': 13,
 'us': 14,
 'clean': 15,
 'night': 16,
 'service': 17,
 'breakfast': 18,
 'get': 19,
 'time': 20,
 'also': 21,
 'desk': 22,
 'like': 23,
 'bed': 24,
 'friendly': 25,
 'could': 26,
 'area': 27,
 'day': 28,
 'well': 29,
 'place': 30,
 'comfortable': 31,
 'front': 32,
 'small': 33,
 'really': 34,
 'back': 35,
 'free': 36,
 'bathroom': 37,
 'floor': 38,
 'even': 39,
 'helpful': 40,
 'didnt': 41,
 'two': 42,
 'next': 43,
 'hotels': 44,
 'walk': 45,
 'lobby': 46,
 'got': 47,
 'parking': 48,
 'little': 49,
 'go': 50,
 'new': 51,
 'first': 52,
 'right': 53,
 'street': 54,
 'much': 55,
 'around': 56,
 'view': 57,
 'price': 58,
 'th': 59,
 'close': 60,
 'nights': 61,
 'restaurant': 62,
 'city': 63,
 'times': 64,
 'people': 65,
 'pool': 66,
 'food': 67,
 'door': 68,
 'check': 69,
 'recommend': 70,
 'excellent': 71,
 'away': 72,
 'bar': 73,
 

### Use the word dictionary to convert and pad the reviews

In [10]:
def convert_and_pad(word_dict, sentence, pad=200):
    """Encode one tokenised sentence as a list of exactly ``pad`` integer ids.

    Words found in ``word_dict`` are replaced by their id and all other words
    by 1. Only the first ``pad`` words are encoded; positions past the end of a
    shorter sentence keep the filler value 0.
    """
    NOWORD = 0  # filler id for positions with no word
    INFREQ = 1  # id for words that are missing from word_dict

    # Start from an all-filler list and overwrite the leading positions.
    encoded = [NOWORD] * pad
    for position, token in enumerate(sentence[:pad]):
        encoded[position] = word_dict.get(token, INFREQ)

    return encoded

def convert_and_pad_data(word_dict, data, pad=200):
    """Apply ``convert_and_pad`` to every sentence in ``data``.

    Returns a list holding one encoded sentence per input sentence, in the
    same order as ``data``.
    """
    return [convert_and_pad(word_dict, sentence, pad) for sentence in data]

In [11]:
# Replace each token list in Description with its integer encoding (default pad length).
# NOTE(review): the column is overwritten in place, so running this cell a second time
# would look up the already-encoded integers in word_dict instead of words.
train_data.Description=convert_and_pad_data(word_dict,train_data.Description)

In [12]:
# Preview the frame after Description has been replaced by integer sequences.
train_data.head()

Unnamed: 0,Description,Is_Response
0,"[657, 1518, 173, 375, 41, 380, 471, 1155, 2053...",Good
1,"[3007, 177, 134, 63, 172, 894, 3983, 172, 3065...",Good
2,"[52, 20, 2408, 187, 133, 197, 589, 82, 1610, 3...",Good
3,"[5, 2, 71, 11, 64, 89, 43, 68, 1, 1205, 263, 1...",Good
4,"[13, 2, 873, 19, 72, 153, 1, 2066, 2, 1387, 1,...",Good
