# Transform crowdsourced data

In [6]:
import re
import numpy as np
import pandas as pd

In [7]:
# Load the raw crowdsourced judgments.
df = pd.read_csv('data/1k_amazon_reviews_crowdsourced.csv')
# Drop the test questions: only rows whose `_golden` flag is False are kept.
non_golden_mask = df['_golden'].eq(False)
df = df[non_golden_mask]

In [8]:
def _count_votes(votes):
    """Return ``(n_in, n_out)`` for one unit's crowd votes.

    Votes equal to ``'not_sure'`` are ignored. A vote equal to the string
    ``'1'`` counts as "in"; every other remaining value counts as "out".
    """
    # NOTE(review): the comparison is against the string '1'; if pandas parsed
    # the column as numeric, no vote would count as "in" -- confirm the dtype.
    decided = votes[votes != 'not_sure']
    n_in = int((decided == '1').sum())
    return n_in, len(decided) - n_in


# Map every original unit id to a dense 0..N-1 item id (order of first appearance).
item_ids_fig8 = df['_unit_id'].unique()
item_ids_dict = dict(zip(item_ids_fig8, np.arange(len(item_ids_fig8))))

# Build one output record per unit: vote counts, gold labels, target Y and text.
data_list = []
# groupby(sort=False) yields the units in order of first appearance (the same
# order as unique()) and partitions the frame in a single pass instead of
# re-filtering the whole frame once per unit.
for id_fig8, df_item in df.groupby('_unit_id', sort=False):
    is_book_in, is_book_out = _count_votes(df_item['is_book_crowd'])
    is_negative_in, is_negative_out = _count_votes(df_item['is_negative_crowd'])

    # Gold labels and text are read from the unit's last judgment row
    # (presumably they are identical across all rows of a unit -- TODO confirm).
    last_row = df_item.iloc[-1]
    y = 1 if (last_row['is_book'] == 1 and last_row['is_negative'] == 1) else 0

    data_list.append([
        item_ids_dict[id_fig8],
        is_book_in, is_book_out,
        is_negative_in, is_negative_out,
        last_row['is_book'], last_row['is_negative'],
        y, last_row['text'],
    ])

df_data = pd.DataFrame(data_list, columns=['item_id', 'is_book_in', 'is_book_out', 'is_negative_in',
                                             'is_negative_out', 'is_book', 'is_negative', 'Y', 'text'])

# Uncomment to save the dataframe as CSV (the lemmatisation cell below reads this file)
# df_data.to_csv('data/1k_amazon_reviews_crowdsourced_transformed.csv', index=False)

# Stemming, Lemmatising, finding phrases on the AMAZON reviews

In [9]:
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
import string
import re
from collections import Counter

In [11]:
# Load the transformed reviews; the `text` column is replaced below by its
# cleaned, lemmatised and phrase-merged version.
df = pd.read_csv('data/1k_amazon_reviews_crowdsourced_transformed.csv')
text_cleaned = []

# NOTE(review): this number pattern is compiled but never applied in this
# cell, so numbers in the text are currently left untouched.
regx = re.compile(r"\b[\d.]+\b")
porter = PorterStemmer()  # only used by the stemming alternative below
wordnet_lemmatizer = WordNetLemmatizer()

# Load the stop-word list once: calling stopwords.words() for every token
# re-reads the list each time and scans it linearly.
stop_words = set(stopwords.words("english"))

for text in df['text']:
    # Lower-case, tokenise, then drop punctuation tokens and stop words.
    tokens = [word for word in nltk.word_tokenize(text.lower())
              if (word not in string.punctuation) and (word not in stop_words)]

    # Alternative normalisations (swap in for the lemmatiser line below):
    #   with stemming:    text = [porter.stem(word.strip()) for word in tokens]
    #   without stemming: text = [word.strip() for word in tokens]

    # with lemmatizer
    text = [wordnet_lemmatizer.lemmatize(word.strip()) for word in tokens]

    text_cleaned.append(text)

# Finding phrases (i.e. bi-grams)
# train bi-grams
bigram = Phrases()
bigram.add_vocab(text_cleaned)

# create phrases
text_cleaned_phrases = [bigram[text_] for text_ in text_cleaned]

# Join the tokens back into one string per review and overwrite the text column.
text_cleaned_phrases_joined = [' '.join(text) for text in text_cleaned_phrases]
df['text'] = pd.Series(text_cleaned_phrases_joined, index=df.index)

# Uncomment to save the dataframe as CSV
# df.to_csv('data/1k_amazon_reviews_crowdsourced_lemmatized.csv', index=False)

