<h1>text feature extraction</h1>

In [1]:
# make imports

import os
import sys
import string
import csv
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
# download the nltk punkt corpus for tokenization
nltk.download('punkt')

# download the stopword corpus to get rid of stopwords later
nltk.download('stopwords')

# download wordnet lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/mo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# function to read csv files

def read_csv(file_name):
    """Read a CSV file and return all of its rows.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    list[list[str]] or None
        One list of string fields per row (the header row is included),
        or ``None`` when ``file_name`` does not exist.
    """
    # a missing file is reported to the caller as None instead of raising
    if not os.path.exists(file_name):
        return None

    # newline='' hands line endings to the csv module untouched; without it
    # universal-newline translation rewrites "\r\n" inside quoted fields
    # (e.g. multi-line review text) before the parser sees them
    with open(file_name, 'r', newline='') as file:
        return list(csv.reader(file))

# read the csv file
num_image_review = read_csv('A2_Data.csv')

# read_csv signals a missing file by returning None; stop here with a clear
# message instead of an AttributeError on the .pop() call below
if num_image_review is None:
    raise FileNotFoundError("A2_Data.csv not found in the working directory")

# remove the first row (fieldnames); being the last expression of the cell,
# the removed header row is also displayed as the cell output
num_image_review.pop(0)

['', 'Image', 'Review Text']

In [4]:
# make a dict for (prod_id : review)

# rows whose product id appears in this list are left out of the dictionary
corrupted_prod_ids = ['2912', '2235', '2088', '3474', '2265', '3317']
prod_reviews = {}

for row in num_image_review:
    # column 0 is read as the product id, column 2 as the review text
    prod_id, prod_review = row[0], row[2]

    # keep only ids that are not listed as corrupted, keyed by integer id
    if prod_id not in corrupted_prod_ids:
        prod_reviews[int(prod_id)] = prod_review

In [5]:
# apply preprocessing techniques to the review text

# show the review stored under product id 100 as it is before the
# preprocessing loop at the end of this cell runs (it is printed again there)
print(prod_reviews[100])

# use regular expressions to find URLs and HTML tags and strip them from the text
def remove_url(text):
    """Return ``text`` with every URL-like substring deleted.

    A substring is removed when it starts with ``http://``, ``https://`` or
    ``www.``; the match runs up to the next whitespace character.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.compile(url_regex).sub(r'', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each span ends at the first ``>`` after its
    ``<``. ``.`` does not match a newline, so a span that contains a line
    break is left in place.
    """
    return re.sub('<.*?>', r'', text)

# lazily filled cache of the NLTK objects preprocess() needs, so that they are
# built once instead of once per review
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop word set, lemmatizer, stemmer), creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
        _PREPROCESS_RESOURCES['stemmer'],
    )


def preprocess(text):
    """Turn one raw review string into a list of normalized tokens.

    Steps, in order: strip URLs, strip HTML tags, delete ASCII punctuation,
    lowercase, tokenize with ``word_tokenize``, drop English stop words,
    lemmatize with WordNet, then stem with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list[str]
        The processed tokens, in their original order.
    """
    stop_words, lemmatizer, stemmer = _get_preprocess_resources()

    # get rid of links
    text = remove_url(text)

    # get rid of html tags
    text = remove_html(text)

    # remove punctuation; characters are deleted rather than replaced by a
    # space, so hyphenated words are joined ("P-Bass" ends up as "pbass")
    text = "".join([ch for ch in text if ch not in string.punctuation])

    # make it lowercase
    text = text.lower()

    # tokenize the text
    tokens = word_tokenize(text)

    # drop stop words, then lemmatize and stem each remaining token
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in stop_words
    ]

# replace every raw review with its token list, in place
# - the loop variable is not called `id`, which would shadow the builtin
# - values that are no longer strings are skipped, so re-running this cell
#   does not feed token lists back into preprocess(), which expects a string
for review_id, review in prod_reviews.items():
    if isinstance(review, str):
        prod_reviews[review_id] = preprocess(review)

# the review stored under product id 100 again, now as preprocessed tokens
print(prod_reviews[100])

aloha from hawaii!  it's 2015 and after 58 years, my '57 Fender P-Bass has a new case! the Fender P-Bass fits like a charm!
['aloha', 'hawaii', '2015', '58', 'year', '57', 'fender', 'pbass', 'new', 'case', 'fender', 'pbass', 'fit', 'like', 'charm']


In [8]:
# build term frequency matrix

# find unique words throughout the entire corpus
vocabulary = set()
for review in prod_reviews.values():
    vocabulary.update(review)

# sort the vocabulary: the iteration order of a set of strings can change
# between kernel sessions (string hashes are randomized), which would make
# any index derived from this list differ from run to run
unique_words = sorted(vocabulary)

num_unique_words = len(unique_words)
num_docs = len(prod_reviews)

print(num_unique_words)
print(num_docs)

5019
994
