<h1>text feature extraction</h1>

In [1]:
# make imports

import csv
import math
import os
import pickle
import re
import string
import sys

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# download the NLTK resources used by the preprocessing cell:
#   punkt / punkt_tab - tokenizer models for word_tokenize; recent NLTK
#                       releases (3.9+) load 'punkt_tab' instead of 'punkt',
#                       so both are fetched to work on old and new versions
#   stopwords         - stopword lists, used to get rid of stopwords later
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# download wordnet for the lemmatizer; kept as the last expression so the
# cell still displays the download status
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/mo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# function to read csv files

def read_csv(file_name):
    """Read a CSV file into a list of rows.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str, or None
        One list of string fields per CSV record (the header row included),
        or None when ``file_name`` does not exist.
    """
    # a missing file is reported as None rather than as an exception
    if not os.path.exists(file_name):
        return None

    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are preserved exactly as written
    with open(file_name, 'r', newline='') as file:
        return list(csv.reader(file))

# read the csv file
num_image_review = read_csv('A2_Data.csv')

# read_csv signals a missing file with None; fail here with a clear message
# instead of an AttributeError on the pop() below
if num_image_review is None:
    raise FileNotFoundError("A2_Data.csv not found in the working directory")

# remove the first row (fieldnames); as the last expression it is also displayed
num_image_review.pop(0)

['', 'Image', 'Review Text']

In [4]:
# map each product id (as int) to its raw review text

# ids (as they appear in the csv) whose rows are excluded
corrupted_prod_ids = ['2912', '2235', '2088', '3474', '2265', '3317']
prod_reviews = {}

for row in num_image_review:
    # column 0 holds the product id, column 2 the review text
    prod_id, prod_review = row[0], row[2]

    # keep only the rows whose id is not listed as corrupted
    if prod_id not in corrupted_prod_ids:
        prod_reviews[int(prod_id)] = prod_review

In [5]:
# apply preprocessing techniques to the review text

# show the raw review stored under product id 100, before any preprocessing
print(prod_reviews[100])

# regex-based cleaners: find the given patterns in the text and delete them
def remove_url(text):
    """Return ``text`` with http(s):// and www. links removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match, within one line) removed."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

# translation table that deletes every ASCII punctuation character
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# holds the (stop word set, lemmatizer, stemmer) tuple once it has been built
_PREPROCESS_RESOURCES = []


def _preprocess_resources():
    """Return the stop word set, lemmatizer and stemmer, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES.append((
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            PorterStemmer(),
        ))
    return _PREPROCESS_RESOURCES[0]


def preprocess(text):
    """Turn a raw review string into a list of normalised tokens.

    Steps, in order: remove links, remove ``<...>`` markup, delete ASCII
    punctuation, lowercase, tokenize, drop English stopwords, lemmatize,
    then stem.

    Punctuation is deleted before tokenizing, so words joined by punctuation
    are merged into one token (e.g. "P-Bass" becomes "pbass").

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # these objects are the same for every review, so they are built only once
    # instead of on every call
    stop_words, lemmatizer, stemmer = _preprocess_resources()

    # get rid of links, then of html tags
    text = remove_html(remove_url(text))

    # remove punctuation and make it lowercase
    text = text.translate(_PUNCTUATION_TABLE).lower()

    # tokenize the text and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # apply stemming
    return [stemmer.stem(word) for word in tokens]

# replace every raw review with its token list
# (loop variable is prod_id rather than id, so the builtin id() is not shadowed)
for prod_id, review in prod_reviews.items():
    # when the cell is re-run the values are already token lists; preprocessing
    # them again would fail, so only raw strings are processed
    if isinstance(review, str):
        prod_reviews[prod_id] = preprocess(review)

# show the same sample review after preprocessing
print(prod_reviews[100])

aloha from hawaii!  it's 2015 and after 58 years, my '57 Fender P-Bass has a new case! the Fender P-Bass fits like a charm!
['aloha', 'hawaii', '2015', '58', 'year', '57', 'fender', 'pbass', 'new', 'case', 'fender', 'pbass', 'fit', 'like', 'charm']


In [6]:
# build tf-idf

# number of documents = number of reviews kept in prod_reviews
num_documents = len(prod_reviews)

# give every distinct word an integer index, in order of first appearance
word_to_idx = {}
for tokens in prod_reviews.values():
    for token in tokens:
        # setdefault leaves an already-indexed word untouched
        word_to_idx.setdefault(token, len(word_to_idx))

# vocabulary size
num_unique_words = len(word_to_idx)

# reverse lookup: index -> word
idx_to_word = {position: token for token, position in word_to_idx.items()}

In [7]:
# give every document (product id) an integer index, in prod_reviews iteration order
doc_to_idx = {doc_id: position for position, doc_id in enumerate(prod_reviews)}

# reverse lookup: index -> product id
idx_to_doc = {position: doc_id for doc_id, position in doc_to_idx.items()}

In [8]:
# make a term frequency matrix: one row per vocabulary word, one column per document
tf_mat = np.zeros((num_unique_words, num_documents), dtype=float)

# Term frequency, tf(t,d), is the relative frequency of term t within document d

# fill frequency values: count the occurrences of each word in each document
for prod_id, prod_review in prod_reviews.items():
    doc_col = doc_to_idx[prod_id]
    for word in prod_review:
        tf_mat[word_to_idx[word], doc_col] += 1

# divide every column by its total so each document's frequencies sum to 1;
# columns whose total is zero (documents with no tokens) are left as zeros,
# which also avoids a division by zero
doc_totals = tf_mat.sum(axis=0)
np.divide(tf_mat, doc_totals, out=tf_mat, where=doc_totals != 0)

print(tf_mat)

[[0.06666667 0.         0.         ... 0.         0.         0.        ]
 [0.13333333 0.         0.         ... 0.         0.         0.        ]
 [0.13333333 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.04166667 0.        ]
 [0.         0.         0.         ... 0.         0.04166667 0.        ]
 [0.         0.         0.         ... 0.         0.04166667 0.        ]]


In [9]:
# calculate idf for each word

# document frequency: for every word index, the number of documents containing it
doc_counts = {idx: 0 for idx in word_to_idx.values()}

# one pass over the documents instead of scanning every document for every word
for prod_review in prod_reviews.values():
    # set() so a word is counted once per document however often it repeats
    for word in set(prod_review):
        doc_counts[word_to_idx[word]] += 1  # increment document count

# calculate the idf values: idf = 1 + ln(num_documents / (1 + document count)),
# keyed by word index
idf_dict = {
    idx: 1 + math.log(num_documents / (1 + doc_count))
    for idx, doc_count in doc_counts.items()
}

print(idf_dict)

{0: 3.073423469354273, 1: 4.957298227490133, 2: 5.955827057601261, 3: 4.117547572738313, 4: 2.578727227518166, 5: 5.704512629320354, 6: 2.133416210862802, 7: 5.704512629320354, 8: 6.5154428455366835, 9: 4.34638914516716, 10: 3.156805078293324, 11: 3.597672113452404, 12: 3.0038974067056627, 13: 2.4636578977333787, 14: 2.2598301357184605, 15: 6.803124917988464, 16: 6.803124917988464, 17: 5.599152113662528, 18: 3.55793178480289, 19: 6.5154428455366835, 20: 3.7428541232969024, 21: 4.436001303856847, 22: 3.049706942736957, 23: 6.109977737428519, 24: 7.208590026096629, 25: 3.156805078293324, 26: 4.857214768933151, 27: 3.653241964607215, 28: 4.34638914516716, 29: 5.704512629320354, 30: 3.790863342483263, 31: 3.8763855159214247, 32: 2.127185661112166, 33: 4.1640675883732055, 34: 4.095074716886254, 35: 3.7120824646301487, 36: 4.605900340652245, 37: 3.9504934880751468, 38: 4.723683376308628, 39: 4.56953269648137, 40: 5.336787849195037, 41: 5.193687005554364, 42: 5.129148484416793, 43: 5.33678784

In [10]:
# build the tf-idf matrix: row i of tf_mat scaled by the idf of word index i

# idf values laid out in row order so they can be broadcast across the columns
idf_vec = np.array([idf_dict[i] for i in range(num_unique_words)], dtype=float)
tfidf_mat = tf_mat * idf_vec[:, np.newaxis]


print(tfidf_mat)

[[0.2048949  0.         0.         ... 0.         0.         0.        ]
 [0.6609731  0.         0.         ... 0.         0.         0.        ]
 [0.79411027 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.30035792 0.        ]
 [0.         0.         0.         ... 0.         0.30035792 0.        ]
 [0.         0.         0.         ... 0.         0.30035792 0.        ]]


In [12]:
# pickle the tfidf_mat, tf_mat, word_to_idx dict, idx_to_word dict,
# idx_to_doc dict and idf_dict, each into its own file in the working directory
# (doc_to_idx is not saved; it is the inverse of idx_to_doc)
# pickle itself is imported in the imports cell at the top of the notebook

artifacts = {
    'tfidf_mat.pkl': tfidf_mat,
    'tf_mat.pkl': tf_mat,
    'word_to_idx.pkl': word_to_idx,
    'idx_to_word.pkl': idx_to_word,
    'idx_to_doc.pkl': idx_to_doc,
    'idf_dict.pkl': idf_dict,
}

for file_name, artifact in artifacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(artifact, f)