# Dataset Transformation

In [2]:
import re
import numpy as np
import pandas as pd

# Fixed random seed so sampling is reproducible; passed as `random_state`
# to StratifiedShuffleSplit in do_stratified_sample.
seed = 123

In [2]:
# Product categories of the review dataset. Each name is used verbatim to build
# per-category paths such as data/data-raw/<category>/all.review and
# data/clean/<category>.csv.
categories = [
    'apparel',
    'automotive',
    'baby',
    'beauty',
    'books',
    'camera_&_photo',
    'cell_phones_&_service',
    'computer_&_video_games',
    'dvd',
    'electronics',
    'gourmet_food',
    'grocery',
    'health_&_personal_care',
    'jewelry_&_watches',
    'kitchen_&_housewares',
    'magazines',
    'music',
    'musical_instruments',
    'office_products',
    'outdoor_living',
    'software',
    'sports_&_outdoors',
    'tools_&_hardware',
    'toys_&_games',
    'video',
]

### 1. Cleaning raw data

In [None]:
# To process the data from scratch, set is_from_raw_amaxon_dataset = True,
# download and unzip unprocessed.tar.gz from https://www.cs.jhu.edu/~mdredze/datasets/sentiment/
# and put the unzipped folders into data/data-raw/

is_from_raw_amaxon_dataset = False


def parse_reviews(data):
    """Turn the raw markup of one category into a labelled review table.

    `data` is the content of an all.review file with newlines already
    removed, so each <review_text>/<rating> element can be matched by the
    non-greedy patterns below. Review texts and ratings are paired by their
    order of appearance.

    Returns a DataFrame with columns 'text', 'star' and 'is_negative'
    (1 for a '1.0' rating, 0 for a '5.0' rating); duplicated texts and all
    other ratings are dropped.
    """
    reviews = re.findall("<review_text>(.*?)</review_text>", data)
    ratings = re.findall("<rating>(.*?)</rating>", data)

    df = pd.DataFrame(list(zip(reviews, ratings)), columns=['text', 'star']).drop_duplicates(subset=['text'])
    # Keep only 1-star and 5-star reviews; .copy() so the new column is added
    # to an independent frame rather than to a slice of the original.
    df = df.loc[df['star'].isin(['1.0', '5.0'])].copy()
    df['is_negative'] = df['star'].map({'1.0': 1, '5.0': 0})
    return df


def category_stats(category, df):
    """Return [category, review count, negative count, negative share] for `df`.

    The share is NaN when `df` has no rows, instead of raising ZeroDivisionError.
    """
    review_num = df.shape[0]
    neg_num = int(df['is_negative'].sum())
    selectivity = neg_num / review_num if review_num else float('nan')
    return [category, review_num, neg_num, selectivity]


if is_from_raw_amaxon_dataset:
    columns = ['category', 'all_review_num', 'neg_review_num', 'neg_review_selectivity']
    # Per-category rows are collected in a list and turned into a DataFrame
    # once: DataFrame.append was removed in pandas 2.0.
    count_rows = []

    for category in categories:

        with open('data/data-raw/{}/all.review'.format(category), 'r', encoding="ISO-8859-1") as myfile:
            data = myfile.read().replace('\n', '')

        df = parse_reviews(data)
        stats = category_stats(category, df)

        print('{} | rev_num: {}, neg_rev_num: {}, neg_rev_selectivity: {}'.format(*stats))
        count_rows.append(stats)

#         # save clean data to csv (read back by the "size N" step as data/clean/<category>.csv)
#         df.to_csv('data/clean/{}.csv'.format(category), index=False)

    df_counts = pd.DataFrame(count_rows, columns=columns)

#     df_counts.to_csv('data/category_counts.csv', index=False)

Books selectivity = 0.61 <br/>
Negative review selectivity = 0.10 <br/>
Proportion of IN-scope reviews = 0.05 <br/>

### 2. Making a dataset of size N from the cleaned dataset  (see "1. Cleaning raw data")

In [111]:
from sklearn.model_selection import StratifiedShuffleSplit


def do_stratified_sample(df_category, reviews_to_sample):
    """Draw a stratified sample of row labels from one category.

    The sample keeps the same proportion of positive and negative reviews
    ('is_negative' column) as `df_category` itself.

    df_category       -- DataFrame with an 'is_negative' column
    reviews_to_sample -- passed as `test_size` to StratifiedShuffleSplit

    Returns the index labels of the sampled rows as an array.
    """
    row_labels = df_category.index.values
    targets = df_category['is_negative'].values
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=reviews_to_sample, random_state=seed)

    # n_splits=1, so the generator yields exactly one (train, test) pair;
    # the "test" positions are the sample we want.
    _, sampled_positions = next(splitter.split(row_labels, targets))
    return row_labels[sampled_positions]

N = 100000  # size of new dataset

# Allocate the N reviews across categories in proportion to each category's
# share of all cleaned reviews.
df_counts = pd.read_csv('data/category_counts.csv')
df_counts['reviews_to_sample'] = N * df_counts['all_review_num'] / sum(df_counts['all_review_num'].values)
df_counts['reviews_to_sample'] = df_counts['reviews_to_sample'].apply(lambda x: round(x))

# Per-category rounding can leave the total above or below N; absorb the
# difference (positive or negative) in the 'books' category so the total is
# exactly N.
rounding_gap = N - sum(df_counts['reviews_to_sample'].values)
if rounding_gap != 0:
    book_ind = df_counts.index[df_counts['category'] == 'books'][0]
    df_counts.loc[book_ind, 'reviews_to_sample'] = df_counts.loc[book_ind, 'reviews_to_sample'] + rounding_gap

# Column layout of the final dataset of size N
columns_dataset_n = ['text', 'is_negative', 'is_book', 'Y', 'category']

# Per-category frames are gathered here and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0).
category_frames = []

for category in categories:
    df_category = pd.read_csv('data/clean/{}.csv'.format(category))[['text', 'is_negative']]
    reviews_to_sample = df_counts[df_counts['category'] == category]['reviews_to_sample'].values[0]

    if reviews_to_sample > 1:
        df_category_index = do_stratified_sample(df_category, reviews_to_sample)
    elif reviews_to_sample == 1:
        # StratifiedShuffleSplit raises ValueError for a single sample, so
        # just take the first review of the category.
        df_category_index = df_category.index[:1]
    else:
        continue

    # Copy so the extra columns are added to an independent frame.
    df_to_add = df_category.loc[df_category_index].copy()

    # Add columns 'is_book', 'Y' and 'category'.
    # Y is 1 only for negative book reviews, 0 for everything else.
    if category == 'books':
        df_to_add['is_book'] = 1
        df_to_add['Y'] = (df_to_add['is_negative'] == 1).astype(int)
    else:
        df_to_add['is_book'] = 0
        df_to_add['Y'] = 0
    df_to_add['category'] = category

    category_frames.append(df_to_add)

if category_frames:
    df_dataset_n = pd.concat(category_frames)[columns_dataset_n]
else:
    df_dataset_n = pd.DataFrame([], columns=columns_dataset_n)

# save dataframe to csv file
df_dataset_n.to_csv('data/clean-sized/{}_reviews.csv'.format(N), index=False)

### 3. Stemming, lemmatising and finding phrases in the Amazon reviews

In [None]:
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
import string
import re
from collections import Counter

In [29]:
df = pd.read_csv('data/clean-sized/100000_reviews.csv')

# NOTE: pattern matching numbers, apparently meant for replacing them with a
# special string; it is compiled here but not applied anywhere in this cell.
regx = re.compile(r"\b[\d.]+\b")
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Load the stop-word list once: calling stopwords.words() for every token
# rebuilds the list each time, and a set gives constant-time membership tests.
stop_words = set(stopwords.words("english"))

# Tokenise each review, drop punctuation and stop words, and stem what is left.
# Empty reviews come back from read_csv as NaN, so they are treated as ''.
# Alternatives to stemming: keep `word.strip()` unchanged (no stemming), or use
# `wordnet_lemmatizer.lemmatize(word.strip())` (lemmatisation).
text_cleaned = []
for text in df['text'].fillna(''):
    tokens = nltk.word_tokenize(text.lower())
    text_cleaned.append([
        porter.stem(word.strip())
        for word in tokens
        if (word not in string.punctuation) and (word not in stop_words)
    ])

# Finding phrases (i.e. bi-grams)
# train bi-grams
bigram = Phrases()
bigram.add_vocab(text_cleaned)

# create phrases: frequent adjacent token pairs are merged into one token
text_cleaned_phrases = [bigram[tokens] for tokens in text_cleaned]

text_cleaned_phrases_joined = [' '.join(tokens) for tokens in text_cleaned_phrases]
df['text'] = pd.Series(text_cleaned_phrases_joined, index=df.index)

df.to_csv('100000_reviews_stemmed.csv', index=False)

