### Initial Project Idea and Setup

In [2]:
"""Import Libraries"""

# import basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# import datasets
from datasets import load_dataset

# import libraries to clean text
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer



In [3]:
# load the IMDB movie-review dataset (every split it ships with)
ds = load_dataset(path="imdb")

In [4]:
# inspect the dataset structure: available splits, their feature columns and row counts
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# store the train and test splits in separate DataFrames
# (use the dataset's own to_pandas() export rather than pd.DataFrame(...),
# which has to iterate over the split one row at a time)
ds_train = ds['train'].to_pandas()
ds_test = ds['test'].to_pandas()

In [6]:
# peek at one raw review (row label 1) to see what the text looks like
ds_train.loc[1, 'text']

'"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the s

## Preprocessing Functions

In [None]:
# function to remove html tags
def remove_html_tags(review):
    """Strip HTML tags from a review, leaving a space where each tag was.

    Args:
        review (str): raw review text, possibly containing markup such as
            ``<br />``.

    Returns:
        str: the text with every ``<...>`` span replaced by a single space.
    """
    # Substitute a space rather than '' so that words separated only by a tag
    # ("great.<br /><br />But") are not fused into one token; the extra
    # whitespace is harmless because tokenization splits on whitespace runs.
    # [^>]* (instead of .*?) also matches a tag that wraps across a newline.
    clean_text = re.sub(r'<[^>]*>', ' ', review)
    return clean_text

# function to remove punctuation
def remove_punctuation(review):
    """Delete every ASCII punctuation character from a review.

    Args:
        review (str): text to clean.

    Returns:
        str: ``review`` without any character listed in ``string.punctuation``;
        all other characters (including whitespace) are kept as they are.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    return review.translate(strip_table)

# function for tokenization
def tokenization(review):
    """Lowercase a review and split it into whitespace-separated tokens.

    Args:
        review (str): text to tokenize.

    Returns:
        list[str]: lowercase tokens; an empty list for empty or
        whitespace-only input.
    """
    lowered = review.lower()
    # split() without a separator breaks on any run of whitespace
    return lowered.split()

# function to remove stopwords
def remove_stopwords(review, stop_words=None):
    """Drop stop words from a tokenized review.

    Args:
        review (list[str]): tokens to filter. Tokens are compared to the stop
            words exactly as given (no case folding happens here).
        stop_words (collection of str, optional): words to remove. When
            omitted, NLTK's English stop-word list is loaded. Pass a prebuilt
            set when filtering many reviews so the list is not reloaded on
            every call.

    Returns:
        list[str]: the tokens that are not stop words, in their original order.
    """
    if stop_words is None:
        # default: same behaviour as before, English stop words from NLTK
        stop_words = set(stopwords.words('english'))
    no_stop_words = [word for word in review if word not in stop_words]
    return no_stop_words

#function to lemmatize
def lemmatize(review):
    """Lemmatize every token of a tokenized review.

    Args:
        review (iterable of str): tokens to lemmatize.

    Returns:
        list[str]: one lemmatized word per input token, in the same order.
    """
    # Only the word itself is passed to the lemmatizer, so it runs with
    # whatever defaults WordNetLemmatizer.lemmatize applies.
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, review))

# function for stem word
def stemming(review):
    """Reduce every token of a tokenized review to its Porter stem.

    Args:
        review (iterable of str): tokens to stem.

    Returns:
        list[str]: one stemmed word per input token, in the same order.
    """
    porter = PorterStemmer()
    stemmed_words = []
    for token in review:
        stemmed_words.append(porter.stem(token))
    return stemmed_words

In [7]:
'''Because a BERT model will be used, preprocessing is kept to a minimum
(HTML tag removal, punctuation removal, and tokenization) in order to keep context and nuance.'''

# function to remove html tags
def remove_html_tags(review):
    """Replace each HTML tag in a review with a single space.

    NOTE(review): this re-defines ``remove_html_tags`` from the
    "Preprocessing Functions" cell; when the cells run in order, this later
    definition is the one in effect.

    Args:
        review (str): raw review text, possibly containing markup such as
            ``<br />``.

    Returns:
        str: the text with every ``<...>`` span replaced by a space.
    """
    # A space (not '') keeps the words on either side of a tag apart, e.g.
    # "great.<br /><br />But" must not collapse into "great.But".
    # [^>]* rather than .*? so a tag broken across a newline is matched too.
    clean_text = re.sub(r'<[^>]*>', ' ', review)
    return clean_text

# function to remove punctuation
def remove_punctuation(review):
    """Return the review with all ASCII punctuation characters removed.

    NOTE(review): this re-defines ``remove_punctuation`` from the
    "Preprocessing Functions" cell; the later definition wins when the cells
    run in order.

    Args:
        review (str): text to clean.

    Returns:
        str: ``review`` minus every character in ``string.punctuation``.
    """
    # A single-dict maketrans with None values builds a pure deletion table.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return review.translate(deletion_table)


# function for tokenization
def tokenization(review):
    """Turn a review into a list of lowercase, whitespace-delimited tokens.

    NOTE(review): this re-defines ``tokenization`` from the
    "Preprocessing Functions" cell; the later definition wins when the cells
    run in order.

    Args:
        review (str): text to tokenize.

    Returns:
        list[str]: lowercase tokens (empty list if there is nothing but
        whitespace).
    """
    # sep=None is the default "split on any whitespace run" mode
    return review.lower().split(sep=None)


# preprocessing function that applies HTML-tag removal, punctuation removal and tokenization to the 'text' column
def preprocessing(review):
    """Run the minimal cleaning pipeline over the 'text' column.

    Args:
        review: object whose ``review['text']`` supports ``.apply`` —
            presumably one of the pandas DataFrames built from the dataset
            splits; confirm against the caller.

    Returns:
        The result of applying, element by element and in this order,
        ``remove_html_tags``, ``remove_punctuation`` and ``tokenization`` to
        ``review['text']`` (so each element ends up as a list of tokens).
    """
    # order matters: tags are stripped before punctuation removal would
    # delete the '<', '/' and '>' characters that delimit them
    steps = (remove_html_tags, remove_punctuation, tokenization)
    preprocess = review['text']
    for step in steps:
        preprocess = preprocess.apply(step)
    return preprocess