In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from collections import Counter
import pandas as pd
from preprocess import clean_by_freq
from preprocess import clean_by_len
from preprocess import clean_by_stopwords
from preprocess import stemming_by_porter
from preprocess import penn_to_wn
from preprocess import pos_tagger
from preprocess import words_lemmatizer
from preprocess import swn_polarity
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

%load_ext autoreload
%autoreload 2

# 데이터 불러오기
df = pd.read_csv('imdb.tsv', delimiter = "\\t")

# 대소문자 통합
df['review'] = df['review'].str.lower()

# 문장 토큰화
df['sent_tokens'] = df['review'].apply(sent_tokenize)

# 품사 태깅
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)

# 표제어 추출
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(words_lemmatizer)

# 추가 전처리
stopwords_set = set(stopwords.words('english'))

df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\parkf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\parkf\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\parkf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\parkf\AppData\Roaming\nltk_data...
[nlt

In [2]:
df[['pos_tagged_tokens']]

Unnamed: 0,pos_tagged_tokens
0,"[(``, ``), (watching, JJ), (time, NN), (chaser..."
1,"[(i, NN), (saw, VBD), (this, DT), (film, NN), ..."
2,"[(minor, JJ), (spoilers, NNS), (in, IN), (new,..."
3,"[(i, JJ), (went, VBD), (to, TO), (see, VB), (t..."
4,"[(``, ``), (yes, RB), (,, ,), (i, JJ), (agree,..."
5,"[(``, ``), (jennifer, NN), (ehle, NN), (was, V..."
6,"[(amy, JJ), (poehler, NN), (is, VBZ), (a, DT),..."
7,"[(``, ``), (a, DT), (plane, NN), (carrying, VB..."
8,"[(a, DT), (well, NN), (made, VBN), (,, ,), (gr..."
9,"[(``, ``), (incredibly, RB), (dumb, JJ), (and,..."


In [3]:
# SentiWordnet 감성 분석
pos_tagged_words = df['pos_tagged_tokens'][0]
senti_score = 0  # 코퍼스 안의 단어들을 하나씩 순회하며 계산한 각 단어 감성 지수의 총합을 저장하는 변수

for word, tag in pos_tagged_words:
    # PennTreebank Tag로 태깅된 품사를 WordNet Tag 기준으로 변경
    wn_tag = penn_to_wn(tag)
    
    # WordNet Tag에 포함되지 않는 경우는 제외
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        continue
    
    # Synset 확인, 어휘 사전에 없을 경우에는 제외
    if not wn.synsets(word, wn_tag):
        continue
    else:
        synsets = wn.synsets(word, wn_tag)
    
    # SentiSynset 확인
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())
    
    # 감성 지수 계산
    word_senti_score = (swn_synset.pos_score() - swn_synset.neg_score())
    senti_score += word_senti_score

In [4]:
# 결과 확인
print(senti_score)

-0.375


In [5]:
# dataframe에 swn_polarity() 함수 적용
df['swn_sentiment'] = df['pos_tagged_tokens'].apply(swn_polarity)

df[['review', 'swn_sentiment']]

Unnamed: 0,review,swn_sentiment
0,"""watching time chasers, it obvious that it was...",-0.375
1,i saw this film about 20 years ago and remembe...,-1.5
2,"minor spoilers in new york, joan barnard (elvi...",-2.25
3,i went to see this film with a great deal of e...,-0.5
4,"""yes, i agree with everyone on this site this ...",3.0
5,"""jennifer ehle was sparkling in \""""pride and p...",6.75
6,amy poehler is a terrific comedian on saturday...,0.75
7,"""a plane carrying employees of a large biotech...",8.75
8,"a well made, gritty science fiction movie, it ...",4.5
9,"""incredibly dumb and utterly predictable story...",-1.125


In [6]:
df['review'][0]

'"watching time chasers, it obvious that it was made by a bunch of friends. maybe they were sitting around one day in film school and said, \\""hey, let\'s pool our money together and make a really bad movie!\\"" or something like that. what ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. all corners were cut, except the one that would have prevented this film\'s release. life\'s like that."'

In [9]:
df['review'][1]

"i saw this film about 20 years ago and remember it as being particularly nasty. i believe it is based on a true incident: a young man breaks into a nurses' home and rapes, tortures and kills various women. it is in black and white but saves the colour for one shocking shot. at the end the film seems to be trying to make some political statement but it just comes across as confused and obscene. avoid."

In [10]:
df['review'][8]

"a well made, gritty science fiction movie, it could be lost among hundreds of other similar movies, but it has several strong points to keep it near the top. for one, the writing and directing is very solid, and it manages for the most part to avoid many sci-fi cliches, though not all of them. it does a good job of keeping you in suspense, and the landscape and look of the movie will appeal to sci-fi fans. if you're looking for a masterpiece, this isn't it. but if you're looking for good old fashioned post-apoc, gritty future in space sci-fi, with good suspense and special effects, then this is the movie for you. thoroughly enjoyable, and a good ending."