In [50]:
import torch
import torch.nn as nn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from pathlib import Path

import re
import string

#### NLPs
***
- NLP (Natural Language Processing) is a field of machine learning concerned with building models that read and understand text

#### Process of NLP Implementation
***
1. Loading the data
2. Preprocessing
3. Tokenization
4. Word Embedding
5. Implementing Recurrent Neural Network
6. Hyperparameter Tuning

#### 1. Loading the Data
***
- We will use the IMDB movie review dataset
- `pandas` is used to read the CSV file

In [34]:
# Location of the IMDB reviews CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing this constant at the local copy of the file.
IMDB_CSV_PATH = r"C:\Users\Brand\project_env\PyTorch Recurrent Neural Network\data\IMDB Dataset.csv"

# Load every review and its sentiment label into a DataFrame.
imdb_data = pd.read_csv(IMDB_CSV_PATH)

In [35]:
# Preview the first rows of the loaded reviews to check the columns.
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### 2. Text Preprocessing
***
- Lowercasing words and removing HTML tags and punctuation
- Tokenization breaks a sentence into individual units of words or phrases. 
- Stemming and lemmatization simplify words into their root form. For example, these processes turn "starting" into "start." 
- Stop word removal ensures that words that do not add significant meaning to a sentence, such as "for" and "with," are removed


Step-by-step guide to text preprocessing: [GeeksforGeeks Text Preprocessing Tutorial](https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/)

In [43]:
# English stop-word list taken from NLTK's `stopwords` corpus.
# NOTE(review): presumably the corpus must already be available locally
# (e.g. via nltk.download("stopwords")) — confirm on a fresh environment.
stop_words = stopwords.words('english')

In [52]:
# remove html tags

# Matches any tag-like span: a "<", one or more non-">" characters, then ">"
# (for example "<br />").
HTML_TAGS = re.compile("<[^>]+>")


def remove_tags(text):
    """Strip HTML tags from ``text``.

    Each tag is replaced with a single space instead of being deleted
    outright, so words separated only by markup (``"good<br />movie"``) stay
    separate tokens rather than fusing into ``"goodmovie"``. The result can
    therefore contain runs of spaces; whitespace-based tokenizers ignore them.

    Args:
        text: Raw text (``str``) that may contain HTML tags.

    Returns:
        The text with every tag replaced by one space.
    """
    return HTML_TAGS.sub(" ", text)

def stem_words(text):
    """Tokenize ``text`` and reduce every token to its Porter stem.

    Args:
        text: String to split into tokens with NLTK's ``word_tokenize``.

    Returns:
        A list with one stemmed token per token of ``text``, in the original
        order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, word_tokenize(text)))

# Run the cleaning steps on one sample review.
review_no_tags = remove_tags(imdb_data.review.iloc[1])

# Keep letters only, then lowercase. Every non-letter character (digits and
# punctuation included) becomes a space here, so no separate digit-removal
# pass is needed afterwards.
sentence = re.sub("[^a-zA-Z]", " ", review_no_tags).lower()

stem_words(sentence)

['a',
 'wonder',
 'littl',
 'product',
 'the',
 'film',
 'techniqu',
 'is',
 'veri',
 'unassum',
 'veri',
 'old',
 'time',
 'bbc',
 'fashion',
 'and',
 'give',
 'a',
 'comfort',
 'and',
 'sometim',
 'discomfort',
 'sens',
 'of',
 'realism',
 'to',
 'the',
 'entir',
 'piec',
 'the',
 'actor',
 'are',
 'extrem',
 'well',
 'chosen',
 'michael',
 'sheen',
 'not',
 'onli',
 'ha',
 'got',
 'all',
 'the',
 'polari',
 'but',
 'he',
 'ha',
 'all',
 'the',
 'voic',
 'down',
 'pat',
 'too',
 'you',
 'can',
 'truli',
 'see',
 'the',
 'seamless',
 'edit',
 'guid',
 'by',
 'the',
 'refer',
 'to',
 'william',
 'diari',
 'entri',
 'not',
 'onli',
 'is',
 'it',
 'well',
 'worth',
 'the',
 'watch',
 'but',
 'it',
 'is',
 'a',
 'terrificli',
 'written',
 'and',
 'perform',
 'piec',
 'a',
 'master',
 'product',
 'about',
 'one',
 'of',
 'the',
 'great',
 'master',
 's',
 'of',
 'comedi',
 'and',
 'hi',
 'life',
 'the',
 'realism',
 'realli',
 'come',
 'home',
 'with',
 'the',
 'littl',
 'thing',
 'the',
 