# Pre-processing
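This notebook explores text pre-processing for the Stanford Sentiment Treebank sentences: case-folding, tokenising, stop-word removal and lemmatisation.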

In [85]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
# import nltk # To download
# nltk.download('stopwords') # For stop words
# nltk.download('wordnet') # For lemmatisation (WordNetLemmatizer)
# nltk.download('averaged_perceptron_tagger') # For POS tagging
from nltk.corpus import stopwords 
import re
# English stop words plus the empty string (so blank tokens are dropped too).
stop_words = set(stopwords.words('english')) | {''}
# Keep the negations as real tokens: they are removed from the stop list.
for negation in ('no', 'not'):
    stop_words.remove(negation)

In [104]:
# Sentences: tab-separated, one row per sentence, keyed by sentence_index.
data = pd.read_csv(
    '../data/stanfordSentimentTreebank/datasetSentences.txt',
    sep='\t',
    index_col='sentence_index',
)
# Split labels: read with the default separator, keyed by the same sentence_index.
train_test_split = pd.read_csv(
    '../data/stanfordSentimentTreebank/datasetSplit.txt',
    index_col='sentence_index',
)
# Show the first rows of each frame, sentences first.
for frame in (data, train_test_split):
    print(frame.head())

                                                         sentence
sentence_index                                                   
1               The Rock is destined to be the 21st Century 's...
2               The gorgeously elaborate continuation of `` Th...
3                                  Effective but too-tepid biopic
4               If you sometimes like to go to the movies to h...
5               Emerges as something rare , an issue movie tha...
                splitset_label
sentence_index                
1                            1
2                            1
3                            2
4                            2
5                            2


In [97]:
# Shared WordNet lemmatizer used by pre_processing below.
lemmatizer = WordNetLemmatizer()
# NOTE(review): the Porter stemmer is not referenced by any visible cell; kept in case later cells use it.
stemmer = PorterStemmer()

def get_pos_from_tag(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.

    Parameters
    ----------
    tag : str
        Tag as found in the second element of the pairs returned by `pos_tag`.

    Returns
    -------
    One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def pre_processing(collection):
    """Convert a raw sentence into a list of lemmatised tokens without stop words.

    Steps, in order:
      1. Case-fold the text.
      2. Tokenise: every non-word character (punctuation, tabs, new-lines, ...)
         becomes a space and the text is split on spaces, so a contraction
         such as "it's" yields the two tokens "it" and "s".
      3. POS-tag the full token list (stop words are still present at this point).
      4. Drop tokens found in `stop_words` and lemmatise the rest with the
         WordNet lemmatizer, using the POS derived from each token's tag.

    Parameters
    ----------
    collection : str
        The text to process.

    Returns
    -------
    list of str
        Lemmas of the surviving tokens, in their original order.
    """
    # Case-fold, then replace non-word characters by spaces; only plain spaces
    # remain as separators, so a whitespace split yields the non-empty tokens.
    lowered = collection.lower()
    tokens = re.sub(r'[\W]', ' ', lowered).split()

    # Tag before filtering so the tagger sees the whole token sequence.
    tagged_tokens = pos_tag(tokens)

    # Stopping and lemmatising (no stemming is applied here).
    lemmas = []
    for word, tag in tagged_tokens:
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word, pos=get_pos_from_tag(tag)))
    return lemmas

# Sanity check: comparatives ("better", "greater") and verb inflections
# ("playing", "plays", "played") should collapse to their lemma.
sample_sentence = "This is a test sentence I love this film it's good. better than the last one, greater than the second one it was playing plays played"
pre_processing(sample_sentence)


['test',
 'sentence',
 'love',
 'film',
 'good',
 'good',
 'last',
 'one',
 'great',
 'second',
 'one',
 'play',
 'play',
 'play']

In [89]:
# Apply pre_processing to data
#
# The 'sentence' column is overwritten with token lists. Only entries that are
# still raw strings are processed, so re-running this cell is a no-op instead
# of raising AttributeError ('list' object has no attribute 'lower') on the
# already-tokenised rows.

data['sentence'] = data['sentence'].apply(
    lambda sentence: pre_processing(sentence) if isinstance(sentence, str) else sentence
)

In [90]:
# Peek at the first five rows; each 'sentence' entry should now be a list of lemmas.
data.head(n=5)

Unnamed: 0_level_0,sentence
sentence_index,Unnamed: 1_level_1
1,"[rock, destine, 21st, century, new, conan, go,..."
2,"[gorgeously, elaborate, continuation, lord, ri..."
3,"[effective, tepid, biopic]"
4,"[sometimes, like, go, movie, fun, wasabi, good..."
5,"[emerges, something, rare, issue, movie, hones..."
