In [13]:
import pandas as pd
import re # for regex
from string import punctuation # for preprocessing
from nltk.tokenize.casual import casual_tokenize # for tokenization
# Show every column when displaying wide DataFrames (no column truncation)
pd.options.display.max_columns = None

In [3]:
# Load the sampled reviews; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer='../data/small_sample.csv')

In [4]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,1.0,True,"12 12, 2017",A20AKRG3V2B372,B00RN08584,"{'Color:': ' Black', 'Style:': ' Printer'}",MOMTO3BOYS,miserable set up. made me want to put my fist ...,stay away,1513036800,,
1,1.0,False,"08 27, 2009",A7FWSCI4QP3J4,B001VEJ23A,,J. Hurley,The only thing you are paying for is the Dell ...,Buyer Beware: This will clean out your bank ac...,1251331200,8.0,
2,1.0,True,"03 27, 2015",A1LWZUTY6B5DWS,B0006VQBTW,,Hazel Smith,Most of these markers dried out within the fir...,Dried out very quickly,1427414400,,
3,1.0,True,"07 6, 2017",A3E1IXJ949VL29,B007ADJQBE,"{'Size:': ' 46 x 60 Inches', 'Style:': ' Recta...",JCR,Only measures 58x46 instead of 60x46. I see a...,Smaller than advertised.,1499299200,2.0,
4,1.0,False,"06 9, 2011",A3H7NPNDMGLOU4,B006K0OCUI,"{'Size:': ' 2-Count', 'Color:': ' Black'}",OK,I'll agree the ink itself is exceptionally smo...,I Wanted to Like This Pen,1307577600,,


# Cleaning up review
Here we create a new column containing a cleaned-up version of each review. For now, we perform the following preprocessing steps:
- Creating a new column
- Removing punctuation
- Removing HTML elements like `<br>`
- Converting to lowercase

In [31]:
def clean_html_column(dataframe, column_name, new_column_name):
    """
    Strip HTML tags from a text column and store the result in a new column.

    Each tag is replaced with a single space rather than deleted outright, so
    words that are separated only by markup (for example a line-break tag
    between two sentences) are not fused into one token.

    Parameters:
    - dataframe (DataFrame): The DataFrame containing the text data. It is
      modified in place: the new column is added to this object.
    - column_name (str): The name of the column containing the text to be cleaned.
    - new_column_name (str): The name of the new column to be created with the cleaned text.

    Returns:
    - dataframe (DataFrame): The same DataFrame object that was passed in,
      with the new column added, containing the cleaned text.
    """
    # Anything from '<' up to the next '>' is treated as a tag.
    # Compiled once here instead of being re-resolved for every row.
    tag_pattern = re.compile(r'<[^>]+>')

    def remove_html_tags(text):
        """
        Replace every HTML tag in the input text with a single space.

        Parameters:
        - text (str): The input text, possibly containing HTML tags.

        Returns:
        - clean_text (str): The text with each tag replaced by a space.
          Non-string values (e.g. NaN for a missing review) are returned
          unchanged.
        """
        if isinstance(text, str):
            # Substitute a space, not '', so "good<br>bad" -> "good bad"
            return tag_pattern.sub(' ', text)
        return text

    # Apply remove_html_tags function to the specified column
    dataframe[new_column_name] = dataframe[column_name].apply(remove_html_tags)
    return dataframe

In [38]:
# Strip HTML markup from the raw review text into a new 'cleaned_review' column
df = clean_html_column(
    dataframe=df,
    column_name='reviewText',
    new_column_name='cleaned_review',
)
# Preview the raw and cleaned text side by side
df.loc[:, ['reviewText', 'cleaned_review']].head()

Unnamed: 0,reviewText,cleaned_review
0,miserable set up. made me want to put my fist ...,miserable set up. made me want to put my fist ...
1,The only thing you are paying for is the Dell ...,The only thing you are paying for is the Dell ...
2,Most of these markers dried out within the fir...,Most of these markers dried out within the fir...
3,Only measures 58x46 instead of 60x46. I see a...,Only measures 58x46 instead of 60x46. I see a...
4,I'll agree the ink itself is exceptionally smo...,I'll agree the ink itself is exceptionally smo...


In [39]:
# removing punctuation and turning into lower case

# Deletion table for all ASCII punctuation; built once rather than once per row.
_PUNCT_TABLE = str.maketrans('', '', punctuation)


def _strip_punct_lower(review):
    """
    Remove ASCII punctuation from a review and lowercase it.

    Missing values (None / NaN) become an empty string; calling str() on them
    would otherwise inject the literal word 'nan' into the cleaned text.
    Any other non-string value is converted with str() before cleaning.
    """
    if not isinstance(review, str):
        if pd.isna(review):
            return ''
        review = str(review)
    return review.translate(_PUNCT_TABLE).lower()


df['cleaned_review'] = df['cleaned_review'].apply(_strip_punct_lower)
df[['reviewText','cleaned_review']].head()

Unnamed: 0,reviewText,cleaned_review
0,miserable set up. made me want to put my fist ...,miserable set up made me want to put my fist i...
1,The only thing you are paying for is the Dell ...,the only thing you are paying for is the dell ...
2,Most of these markers dried out within the fir...,most of these markers dried out within the fir...
3,Only measures 58x46 instead of 60x46. I see a...,only measures 58x46 instead of 60x46 i see a ...
4,I'll agree the ink itself is exceptionally smo...,ill agree the ink itself is exceptionally smoo...
