In [3]:
# dependencies
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [20]:
## setup for natural language tool-kit
import nltk
# nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\WordPress\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
# tried every review with encoding utf-16 and looked at potential error messages
reviews = []

for i in range(1,4):
    encoding = "utf-8"
    if i == int(3):
        encoding = "utf-16"
    with open(f"./data/reviews{i}.txt", encoding=encoding) as f:
        reviews.append(f.read())

reviews

['Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it\'s singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it\'s better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.\nBromwell High is nothing short of brilliant. Expertly scripted and perfectly delivered, this searing parody of a students and teachers at a South London Public School leaves you literally rolling with laughter. It\'s vulgar, provocative, witty and sharp. The characters are a superbly caricatured cross section of British society (or to be

In [29]:
processed = []
for review in reviews:
    # 1. remove special characters 
    # 2. turn to lower case
    # 3. split into individual words
    # 4. typecast to pd.Series
    processed.append(pd.Series(re.sub(r"[^A-Za-z ]", "", review).lower().split(" ")))

processed

[0           story
 1              of
 2               a
 3             man
 4             who
           ...    
 220      bromwell
 221          high
 222          will
 223           not
 224    disappoint
 Length: 225, dtype: object,
 0        if
 1         i
 2       had
 3       not
 4      read
        ... 
 446       a
 447    pity
 448    that
 449      it
 450    isnt
 Length: 451, dtype: object,
 0      robert
 1      deniro
 2       plays
 3         the
 4        most
         ...  
 232        is
 233      very
 234      good
 235      very
 236      good
 Length: 237, dtype: object]

In [30]:
# typecast to series to count the values:
for word_vec, i in zip(processed, range(1,4)):
    # couning
    print(f"10 most common words in document reviews{i}.txt: \n")
    print(word_vec.value_counts().head(10))


10 most common words in document reviews1.txt: 

a        11
of       10
the       8
and       5
with      4
you       3
is        3
its       3
every     3
to        3
Name: count, dtype: int64
10 most common words in document reviews2.txt: 

the     25
of      18
and     15
i       15
a       11
is      10
to       9
that     8
it       7
film     6
Name: count, dtype: int64
10 most common words in document reviews3.txt: 

is       15
the      12
a         8
of        8
and       7
this      6
it        5
in        4
movie     3
also      3
Name: count, dtype: int64


Now we need to find the top 10 most common words of each document

So we get a lot of stop words as the most common words.

## Task 3
We were ask to apply a word lemmatization and stemming technique in the `nltk`. We are going to use the `PorterStemmer` and `WordNetLemmatizer` classes for this:

In [10]:
wnl = WordNetLemmatizer()
ps = PorterStemmer()

In [32]:
lemmatized = []
stemmed = []
for word_vec in processed:
    # print(pd.Series(word_vec).map(lambda row: nltk.pos_tag(nltk.word_tokenize(row))))
    lemmatized.append(word_vec.map(lambda row: wnl.lemmatize(row)))
    stemmed.append(word_vec.map(lambda row: ps.stem(row)))

print(lemmatized)
print(stemmed)

[0           story
1              of
2               a
3             man
4             who
          ...    
220      bromwell
221          high
222          will
223           not
224    disappoint
Length: 225, dtype: object, 0        if
1         i
2       had
3       not
4      read
       ... 
446       a
447    pity
448    that
449      it
450    isnt
Length: 451, dtype: object, 0      robert
1      deniro
2        play
3         the
4        most
        ...  
232        is
233      very
234      good
235      very
236      good
Length: 237, dtype: object]
[0           stori
1              of
2               a
3             man
4             who
          ...    
220       bromwel
221          high
222          will
223           not
224    disappoint
Length: 225, dtype: object, 0        if
1         i
2       had
3       not
4      read
       ... 
446       a
447    piti
448    that
449      it
450    isnt
Length: 451, dtype: object, 0      robert
1      deniro
2        play
3 

In [35]:
# stopword removal
lemmatized_without_stop = []
stemmed_without_stop = []
for lemmatize, stem in zip(lemmatized, stemmed):
    lemmatized_without_stop.append(lemmatize[~lemmatize.isin(nltk.corpus.stopwords.words("english"))])
    stemmed_without_stop.append(stem[~stem.isin(nltk.corpus.stopwords.words("english"))])


In [36]:
for lem, stem, i in zip(lemmatized_without_stop, stemmed_without_stop, range(1,4)):
    print(f"word count for lemmatized in reviews{i}.txt: \n")
    print(lem.value_counts().head(10))
    print(f"word count for stemmed in reviews{i}.txt: \n")
    print(stem.value_counts().head(10))

word count for lemmatized in reviews1.txt: 

every         3
high          2
absurd        2
better        2
show          2
turned        2
subject       2
future        2
imaginable    2
society       2
Name: count, dtype: int64
word count for stemmed in reviews1.txt: 

everi      3
absurd     2
make       2
better     2
show       2
futur      2
imagin     2
turn       2
subject    2
societi    2
Name: count, dtype: int64
word count for lemmatized in reviews2.txt: 

film        6
            5
high        5
br          4
novel       4
reality     4
union       4
student     4
teacher     4
bromwell    3
Name: count, dtype: int64
word count for stemmed in reviews2.txt: 

film       6
high       5
           5
read       4
realiti    4
student    4
teacher    4
br         4
novel      4
union      4
Name: count, dtype: int64
word count for lemmatized in reviews3.txt: 

movie      3
like       3
also       3
good       3
ten        2
small      2
teacher    2
time       2
every      2
