# Pre-processing
Notebook to explore the pre-processing of the data.

In [6]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
# import nltk # To download
# nltk.download('stopwords') # For stop words
# nltk.download('wordnet') # For POS 
# nltk.download('averaged_perceptron_tagger') # For POS tagging
import re
stop_words = set(stopwords.words('english'))
stop_words.add('')
stop_words.remove('no')
stop_words.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adelliinaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
data = pd.read_csv('../data/stanfordSentimentTreebank/dictionary.txt', sep='|', index_col=1)
train_test_split = pd.read_csv('../data/stanfordSentimentTreebank/sentiment_labels.txt', sep='|', index_col=0)

#data.rename(columns={0: 'phrase_ids', 1: 'phrase'})
data.columns = ['phrase_tokens']
data.index.names = ['phrase_id']
train_test_split.columns = ['sentiment_values']
train_test_split.index.names = ['phrase_id']

print(data.head())
print(train_test_split.head())


           phrase_tokens
phrase_id               
22935                ! '
18235               ! ''
179257            ! Alas
22936        ! Brilliant
40532      ! Brilliant !
           sentiment_values
phrase_id                  
0                   0.50000
1                   0.50000
2                   0.44444
3                   0.50000
4                   0.42708


In [102]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

def get_pos_from_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def pre_processing(collection):
    # Case-fold
    collection = collection.lower()
    
    # Tokenising, converts all non-chars, new-lines and tabs to ' '
    collection = list(filter(None, re.sub(r'[\W]', ' ',collection).split(' ')))
    
    collection = pos_tag(collection)
    
    # Stemming and Stopping 
    return [lemmatizer.lemmatize(term[0], pos = get_pos_from_tag(term[1])) for term in collection if term[0] not in stop_words] 

#pre_processing("This is a test sentence I love this film it's good. better than the last one, greater than the second one it was playing plays played")


In [56]:
# Apply pre_processing to data

data['phrase_tokens'] = data['phrase_tokens'].apply(pre_processing)

In [92]:
# Remove empty phrases 

for index, row in data.iterrows():
    if len(row[0]) == 0:
        data = data.drop([index], axis=0)

In [104]:
# Inner Join on phrase_id

labeled_phrases = pd.merge(data, train_test_split, left_index=True, right_index=True)
labeled_phrases = labeled_phrases.sort_index(ascending=True)

In [109]:
labeled_phrases.to_csv('labeled_phrases.csv', index=True)
labeled_phrases.head()

Unnamed: 0_level_0,phrase_tokens,sentiment_values
phrase_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,[cockettes],0.5
4,[cockettes],0.42708
5,"[cockettes, provide, window, subculture, hell,...",0.375
6,"[cockettes, provide, window, subculture, hell,...",0.41667
7,"[cockettes, provide, window, subculture, hell,...",0.54167
