In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

review_df = pd.read_csv("../../data/labeledTrainData.tsv", header = 0, sep = "\t", quoting = 3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [7]:
print(review_df["review"][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [8]:
import re

# <br> html 태그는 replace 함수로 공백으로 변환
review_df["review"] = review_df["review"].str.replace("<br />", " ")

# 파이썬의 정규 표현식 모듈인 re를 이용해 영어 문자열이 아닌 문자는 모두 공백으로 변환
review_df["review"] = review_df["review"].apply(lambda x: re.sub("[^a-zA-Z]", " ", x))

In [9]:
from sklearn.model_selection import train_test_split

class_df = review_df["sentiment"]
feature_df = review_df.drop(["id", "sentiment"], axis = 1, inplace = False)

x_train, x_test, y_train, y_test = train_test_split(feature_df, class_df, test_size = 0.3,
                                                   random_state = 156)

x_train.shape, x_test.shape

((17500, 1), (7500, 1))

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# 스톱 워드는 English, filtering, ngram은 (1, 2)로 설정해 CounterVectorizer 수행.
# LogisticRegressiond의 C는 10으로 설정.
pipeline = Pipeline([
    ("cnt_vect", CountVectorizer(stop_words = "english", ngram_range = (1,2))),
    ("lr_clf", LogisticRegression(C=10))
])

# Pipeline 객체를 이용해 fit(), predict()로 학습 / 예측 수행. predict_proba()는 roc_auc 때문에 수행.
pipeline.fit(x_train["review"], y_train)
pred = pipeline.predict(x_test["review"])
pred_probs = pipeline.predict_proba(x_test["review"])[:, 1]

print("예측정확도는 {0:.4f}, ROC-AUC는 {1:.4f}".format(accuracy_score(y_test, pred),
                                               roc_auc_score(y_test, pred_probs)))

예측정확도는 0.8860, ROC-AUC는 0.9503


In [13]:
# 스톱 워드는 english, filtering, ngram은 (1, 2)로 설정해 TF-IDF 벡터화 수행.
# LogisticRegressiond의 C는 10으로 설정.
pipeline = Pipeline([
    ("tfidf_vect", CountVectorizer(stop_words = "english", ngram_range = (1,2))),
    ("lr_clf", LogisticRegression(C=10))
])

pipeline.fit(x_train["review"], y_train)
pred = pipeline.predict(x_test["review"])
pred_probs = pipeline.predict_proba(x_test["review"])[:, 1]

print("예측정확도는 {0:.4f}, ROC-AUC는 {1:.4f}".format(accuracy_score(y_test, pred),
                                               roc_auc_score(y_test, pred_probs)))

예측정확도는 0.8860, ROC-AUC는 0.9503


In [14]:
from nltk.corpus import wordnet as wn

term = "present"

# "present"라는 단어로 wordnetd의 synsets 생성.
synsets = wn.synsets(term)
print("synsets() 반환 type :", type(synsets))
print("synsets() 반환 값 개수 :", len(synsets))
print("synsets() 반환 값 :", synsets)

synsets() 반환 type : <class 'list'>
synsets() 반환 값 개수 : 18
synsets() 반환 값 : [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [None]:
for synset in synsets:
    print("### synset name :", synset.name(), "###")
    print("POS :", synset.lexname())
    print("Definition :", synset.definition())
    print("Lemmas:", synset.lemma_names())

In [19]:
# synset 객체를 단어별로 생성
tree = wn.synset("tree.n.01")
lion = wn.synset("lion.n.01")
tiger = wn.synset("tiger.n.02")
cat = wn.synset("cat.n.01")
dog = wn.synset("dog.n.01")

entities = [tree, lion, tiger, cat, dog]
similarities = []
entity_names = [entity.name().split(".")[0] for entity in entities]

# 단어별 synset을 반복하면서 다른 단어의 synset과 유사도를 측정
for entity in entities:
    similarity = [round(entity.path_similarity(compared_entity),2)
                 for compared_entity in entities]
    similarities.append(similarity)
    
# 개별 단어별 synset과 다른 단어의 synset과의 유사도를 DataFrame gudxofh wjwkd
similarity_df = pd.DataFrame(similarities, columns = entity_names, index = entity_names)
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [None]:
import nltk
from nltk.corpus import sentiwordnet as swn

father