In [69]:
import copy
import glob
import gzip
import os
import string

import nltk


In [None]:
# load data from datasets

# Product categories; each one is used as a sub-directory name under data_dir.
data_categories = ['books', 'dvd', 'electronics', 'kitchen_&_housewares']
# Review kinds; each one is the base name of a "<kind>.review" file.
review_categories = ['positive', 'negative', 'unlabeled']

# Dataset root directory (a relative path, so it depends on the working directory).
data_dir = "sorted_data_acl"

# load data
def review_data_load(base_dir=None, categories=None, review_kinds=None):
    """Read every "<kind>.review" file of every product category into memory.

    Files are decoded as UTF-8 regardless of the platform default encoding.
    A file that starts with the gzip magic bytes is transparently decompressed.

    Args:
        base_dir: Dataset root directory. Defaults to the module-level ``data_dir``.
        categories: Category sub-directory names. Defaults to ``data_categories``.
        review_kinds: File base names. Defaults to ``review_categories``.

    Returns:
        dict mapping category -> review kind -> file content (``str``), or
        ``None`` for a file that does not exist.
    """
    base_dir = data_dir if base_dir is None else base_dir
    categories = data_categories if categories is None else categories
    review_kinds = review_categories if review_kinds is None else review_kinds

    data_map = {data_category: {} for data_category in categories}

    for data_category in categories:
        for review_category in review_kinds:
            file_path = os.path.join(base_dir, data_category, f"{review_category}.review")

            if not os.path.exists(file_path):
                data_map[data_category][review_category] = None
                continue

            # Detect gzip explicitly instead of relying on a decode failure,
            # which only happens by luck with some default encodings.
            with open(file_path, 'rb') as probe:
                is_gzip = probe.read(2) == b'\x1f\x8b'

            opener = gzip.open if is_gzip else open
            with opener(file_path, 'rt', encoding='utf-8') as file:
                data_map[data_category][review_category] = file.read()

    return data_map

# Load all review files when the cell runs; entries for missing files are None.
data_map = review_data_load()

#check data
def check_loaded_data():
    """Print the type and length of the loaded content for each category / review kind."""
    for category_name, per_kind in data_map.items():
        for kind_name, content in per_kind.items():
            # Missing or empty content is reported with size 0.
            content_size = len(content) if content else 0
            print(f"{category_name} has {kind_name} type {type(content)} size {content_size}")

check_loaded_data()


In [None]:
# load xml nodes from files to data structures
import xml.etree.ElementTree as ET
import re

from fractions import Fraction
from enum import Enum

class Review:
    """A single parsed product review.

    Attributes:
        rating: Rating value exactly as passed by the caller.
        review_text: Body of the review.
        title: Title of the review.
        helpfull: ``Fraction`` built from an "<n> of <m>" string, or ``None``
            when that string is missing, malformed, or has a zero denominator.
    """

    def __init__(self, rating, review_text, helpfull, title):
        self.rating = rating
        self.review_text = review_text
        self.title = title
        self.helpfull = self._parse_helpfulness(helpfull)

    @staticmethod
    def _parse_helpfulness(raw):
        """Convert "<n> of <m>" to ``Fraction(n, m)``; return ``None`` if impossible."""
        if raw is None or " of " not in raw:
            return None
        try:
            # Tolerate thousands separators such as "1,024 of 2,000".
            numerator, denominator = (
                int(part.replace(",", "")) for part in raw.split(" of ")
            )
            return Fraction(numerator, denominator)
        except (ValueError, ZeroDivisionError):
            # Non-numeric parts, extra " of " separators, or "0 of 0".
            return None

    def __repr__(self):
        # Computed outside the f-string: reusing the enclosing quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        helpfull_repr = self.helpfull if self.helpfull is not None else ""
        return f"Review(rating={self.rating}, helpfull={helpfull_repr}, title={self.title}, review_text={self.review_text})"

def wrap_with_root(xml_string):
    """Enclose ``xml_string`` in a single ``<root>`` element and return the result."""
    return "<root>{}</root>".format(xml_string)

def _child_text(node, tag):
    """Return the text of child ``tag`` of ``node``, or ``None`` if the child is absent."""
    child = node.find(tag)
    return None if child is None else child.text


def parse_review_XML_text_to_structures(reviews_text):
    """Parse a concatenation of ``<review>`` elements into ``Review`` objects.

    The text is first wrapped in a synthetic root element so that several
    sibling ``<review>`` elements form one parseable document.

    Args:
        reviews_text: XML fragment text containing ``<review>`` elements.

    Returns:
        list of ``Review`` objects, one per ``<review>`` child of the root.
        ``helpful``, ``title`` and ``review_text`` may be missing (``None``);
        ``rating`` is required.

    Raises:
        xml.etree.ElementTree.ParseError: if the wrapped text is not
            well-formed. The error and the text around the reported position
            are printed before the exception is re-raised.
    """
    wrapped_text = wrap_with_root(reviews_text)
    try:
        root = ET.fromstring(wrapped_text)
    except ET.ParseError as e:
        print(e)
        # e.position is (line, column); print the text around the reported
        # spot, then re-raise because nothing below can run without `root`.
        line_number, column = e.position
        source_lines = wrapped_text.split('\n')
        if 0 < line_number <= len(source_lines):
            print(source_lines[line_number - 1][max(0, column - 20):column + 20])
        raise

    reviews = []

    print(f"Number of child elements: {len(root)}")

    for review in root.findall('review'):
        helpful = _child_text(review, 'helpful')
        # Ratings are written like "5.0", hence float first, then int.
        rating = int(float(review.find('rating').text))
        title = _child_text(review, 'title')
        review_text = _child_text(review, 'review_text')

        reviews.append(Review(rating, review_text, helpful, title))

    print(f"It {len(reviews_text)} parsed to {len(reviews)} reviews")
    return reviews

def parse_data_to_reviews(data_map):
    """Parse the raw text of every category / review kind into ``Review`` lists.

    Args:
        data_map: dict mapping category -> review kind -> raw file text, where
            the text may be ``None`` when nothing was loaded for that pair.

    Returns:
        dict mapping category -> review kind -> list of ``Review``. Pairs with
        no loaded text map to an empty list.
    """
    reviews_map = {}

    for data_category, review_categories_data in data_map.items():
        parsed_for_category = reviews_map.setdefault(data_category, {})

        for review_category, review_category_data in review_categories_data.items():
            if review_category_data is None:
                # Nothing was loaded for this pair; keep the key so that
                # downstream loops still see a (possibly empty) list.
                print(f"Skip {review_category} {data_category}: no data loaded")
                parsed_for_category[review_category] = []
                continue

            print(f"Parse {review_category} {data_category}")
            # Replace control characters with a space rather than deleting
            # them, so words separated only by a newline or tab stay separate.
            sanitized = re.sub(r'[\x00-\x1F\x7F]', ' ', review_category_data)
            # The raw text contains bare ampersands, which are not valid XML.
            sanitized = sanitized.replace('&', '&amp;')

            parsed_for_category[review_category] = parse_review_XML_text_to_structures(sanitized)

    return reviews_map

# Parse every loaded file into Review objects when the cell runs.
reviews_map = parse_data_to_reviews(data_map)

#check data
def check_parsed_data():
    """Print type, size and the first / last two parsed reviews of each category / kind."""
    for category_name in reviews_map:
        for kind_name, parsed_reviews in reviews_map[category_name].items():
            print(f"{category_name} has {kind_name} type {type(parsed_reviews)} size {len(parsed_reviews)}")
            head = parsed_reviews[:2]
            print(head)
            tail = parsed_reviews[-2:]
            print(tail)

check_parsed_data()

In [78]:
# Check spelling, remove special characters, expand contractions, etc.
import contractions

from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# pip install pyspellchecker  (this provides the `spellchecker` module; the PyPI package named "spellchecker" is a different project)
# python3 -m nltk.downloader all
#nltk.download('wordnet')
#nltk.download('omw-1.x')
#nltk.download('stopwords')
#nltk.download('punkt')

# Shared NLP helper instances, created once and reused by the cleaning functions.
spell = SpellChecker()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Feature flags: each one is paired with a cleaning step in clean_function_list,
# and clean_data only runs the step when its flag is true.
to_remove_punctuation = True
to_convert_to_lowercase = True
to_remove_numbers = False
to_remove_extra_whitespace = True
to_correct_spelling = True
to_remove_stop_words = True
to_stem_words = False
to_lemmatize_words = True
to_remove_specs = True
to_expand_contractions = True

# Punctuation often doesn't add much meaning for many NLP tasks and can be removed.
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    deletion_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletion_table)

#Converting all text to lowercase ensures that words are treated uniformly.
def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# In some cases, numbers might not be relevant.
def remove_numbers(text):
    """Delete every run of digits from ``text``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Extra whitespace and newline characters should be removed or normalized.
def remove_extra_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends."""
    tokens = text.split()
    separator = ' '
    return separator.join(tokens)

# Stop words (common words that don’t add much meaning) can be removed.
# Fetch NLTK data when the cell runs: the stop-word corpus read just below and
# 'punkt' (presumably the model behind word_tokenize; confirm for your NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

# English stop words kept as a set for fast membership tests in remove_stop_words.
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Tokenize ``text`` and drop every token whose lower-cased form is in ``stop_words``.

    The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def correct_spelling(text): # need to improve performance
    """Replace each whitespace-separated word with its most likely spelling.

    Uses ``spell.correction`` (the single most probable candidate) instead of
    picking an arbitrary element of the unordered ``spell.candidates`` set,
    so the result is deterministic. Words for which no correction is found
    are kept unchanged.

    Args:
        text: Input string.

    Returns:
        The corrected words joined with single spaces.
    """
    corrections = {}  # per-call cache: repeated words are looked up only once
    corrected_words = []

    for word in text.split():
        if word not in corrections:
            best_guess = spell.correction(word)
            corrections[word] = word if best_guess is None else best_guess
        corrected_words.append(corrections[word])

    return ' '.join(corrected_words)

def correct_spelling2(text): # need to improve performance
    """Spell-correct ``text`` with TextBlob and return the result as a plain ``str``.

    ``TextBlob.correct()`` returns a blob object, not a ``str``. Converting it
    here keeps later cleaning steps and the stored review fields string-typed.
    """
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

# Stemming reduces words to their root form.
def stem_words(text):
    """Tokenize ``text``, stem every token with ``stemmer`` and re-join with spaces."""
    stems = []
    for token in word_tokenize(text):
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Lemmatization reduces words to their base or dictionary form, usually providing better results than stemming.
def lemmatize_words(text):
    """Tokenize ``text``, lemmatize every token with ``lemmatizer`` and re-join with spaces."""
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# remove URLs
def remove_urls(text):
    """Delete every non-whitespace run that starts with ``http``, ``www`` or ``https``."""
    url_like = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_like.sub('', text)

# remove emails
def remove_emails(text):
    """Delete substrings shaped like ``name@host.tld`` (2-6 letter TLD) from ``text``."""
    email_like = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,6}\b')
    return email_like.sub('', text)

# remove special characters
def remove_special_characters(text):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def remove_specs(text):
    """Strip URLs, then e-mail addresses, then any remaining special characters."""
    without_urls = remove_urls(text)
    without_emails = remove_emails(without_urls)
    return remove_special_characters(without_emails)

# expanding contractions to full forms for more consistent analysis
def expand_contractions(text):
    """Pass ``text`` through ``contractions.fix`` and return its result."""
    expanded = contractions.fix(text)
    return expanded

# Ordered cleaning pipeline as (enabled flag, step) pairs. The order matters:
#   1. contractions are expanded while apostrophes are still present;
#   2. URLs and e-mail addresses are removed while their punctuation is intact;
#   3. punctuation, case, digits and whitespace are normalised;
#   4. only then come the word-level steps (spelling, stop words, stemming,
#      lemmatization), so they see clean lower-case tokens.
clean_function_list = [
    (to_expand_contractions, expand_contractions),
    (to_remove_specs, remove_specs),
    (to_remove_punctuation, remove_punctuation),
    (to_convert_to_lowercase, convert_to_lowercase),
    (to_remove_numbers, remove_numbers),
    (to_remove_extra_whitespace, remove_extra_whitespace),
    (to_correct_spelling, correct_spelling2),
    (to_remove_stop_words, remove_stop_words),
    (to_stem_words, stem_words),
    (to_lemmatize_words, lemmatize_words),
]

def clean_data(text):
    """Run every enabled step of ``clean_function_list`` over ``text``.

    Args:
        text: Raw text, or a falsy value (``None`` / empty string).

    Returns:
        The cleaned text as ``str``. Returns ``''`` as soon as the text is
        falsy before a step, including when the input itself is falsy.
    """
    for to_do_clean, clean_func in clean_function_list:
        if not text:
            return ''
        if to_do_clean:
            # Normalise to str: a step may return a string-like wrapper object
            # that later steps cannot process.
            text = str(clean_func(text))

    return text

# Deep copy: a shallow dict copy would share the Review objects, so cleaning
# would also overwrite the parsed originals in reviews_map, and re-running the
# cell would clean already cleaned text.
reviews_map_cleaned = copy.deepcopy(reviews_map)

def clean_reviews():
    """Clean ``review_text`` and ``title`` of every review in ``reviews_map_cleaned``.

    The Review objects are modified in place. A "started" / "finished" line is
    printed for each category / review kind.
    """
    for data_category, review_categories_data in reviews_map_cleaned.items():
        for review_category, review_category_data in review_categories_data.items():
            print(f"{data_category} {review_category} started")
            for review in review_category_data:
                review.review_text = clean_data(review.review_text)
                review.title = clean_data(review.title)
            print(f"{data_category} {review_category} finished")

clean_reviews()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alex937/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alex937/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


books positive started
books positive finished
books negative started
books negative finished
books unlabeled started
books unlabeled finished
dvd positive started
dvd positive finished
dvd negative started
dvd negative finished
dvd unlabeled started
dvd unlabeled finished
electronics positive started
electronics positive finished
electronics negative started
electronics negative finished
electronics unlabeled started
electronics unlabeled finished
kitchen_&_housewares positive started
kitchen_&_housewares positive finished
kitchen_&_housewares negative started
kitchen_&_housewares negative finished
kitchen_&_housewares unlabeled started
kitchen_&_housewares unlabeled finished


In [None]:
def check_clean_data():
    """Print type, size and the first / last two cleaned reviews of each category / kind."""
    for category_name in reviews_map_cleaned:
        for kind_name, cleaned_reviews in reviews_map_cleaned[category_name].items():
            print(f"{category_name} has {kind_name} type {type(cleaned_reviews)} size {len(cleaned_reviews)}")
            head = cleaned_reviews[:2]
            print(head)
            tail = cleaned_reviews[-2:]
            print(tail)

check_clean_data()

In [82]:
# prepare Pandas dataset
import pandas as pd

from nltk.corpus import words
# Fetch the NLTK 'words' corpus when the cell runs.
nltk.download('words')

# Vocabulary as a set so is_meaningful_review can test membership quickly.
valid_words = set(words.words())

# to check if a review text has a sufficient number of valid words
def is_meaningful_review(review_text, min_valid_words=5):
    """Tell whether ``review_text`` has at least ``min_valid_words`` tokens found in ``valid_words``.

    Tokens are obtained by whitespace splitting and compared as-is (case-sensitive).
    """
    recognised = [token for token in review_text.split() if token in valid_words]
    return len(recognised) >= min_valid_words

# to estimate review
class ReviewCategory(Enum):
    """Sentiment label that ``mark_review`` assigns to a review."""
    POS = 1    # 'positive' source file, or rating above 3
    NEG = 2    # 'negative' source file, or rating below 3
    UNDEF = 3  # neither of the above (other source with rating exactly 3)
    
def mark_review(review, review_category):
    """Derive a ``ReviewCategory`` for ``review``.

    The source file kind wins: 'positive' and 'negative' map directly to
    POS / NEG. For any other kind the rating decides: above 3 is POS,
    below 3 is NEG, exactly 3 is UNDEF.
    """
    if review_category == 'positive':
        label = ReviewCategory.POS
    elif review_category == 'negative':
        label = ReviewCategory.NEG
    elif review.rating > 3:
        label = ReviewCategory.POS
    elif review.rating < 3:
        label = ReviewCategory.NEG
    else:
        label = ReviewCategory.UNDEF

    return label

def prepare_data_to_pandas():
    """Flatten ``reviews_map_cleaned`` into parallel text / label lists.

    For every review the title and body are joined (whichever is present).
    The review is then dropped when it
      * has neither title nor body,
      * fails ``is_meaningful_review``,
      * repeats a text that was already seen, or
      * is labelled ``ReviewCategory.UNDEF`` by ``mark_review``.

    Returns:
        dict with keys ``'text'`` (list of str) and ``'label'`` (list of int,
        1 for positive and 0 for negative), aligned by index.
    """
    texts = []
    labels = []

    # Texts encountered so far, used to drop duplicate reviews.
    seen = set()

    for review_categories_data in reviews_map_cleaned.values():
        for review_category, review_category_data in review_categories_data.items():
            for review in review_category_data:
                # Use only the parts that exist, so a missing title no longer
                # leaks into the text as an empty / "None" prefix.
                parts = [str(part) for part in (review.title, review.review_text) if part]
                if not parts:
                    continue
                full_review_text = ' '.join(parts)

                # Skip texts with too few recognised words.
                if not is_meaningful_review(full_review_text):
                    continue

                # Skip duplicates.
                if full_review_text in seen:
                    continue
                seen.add(full_review_text)

                # Keep only reviews with a clear positive / negative label.
                review_mark = mark_review(review, review_category)
                if review_mark == ReviewCategory.UNDEF:
                    continue

                texts.append(full_review_text)
                labels.append(1 if review_mark == ReviewCategory.POS else 0)

    return {'text': texts, 'label': labels}

# Build the text / label table and write it as CSV without the index column.
data_to_pandas = prepare_data_to_pandas()
df = pd.DataFrame(data_to_pandas)

df.to_csv('reviews_dataset.csv', index=False)

[nltk_data] Downloading package words to /home/alex937/nltk_data...
[nltk_data]   Package words is already up-to-date!
