"""5가지 요구사항 : 

1. 웹크롤링을 통하여 관심주제에 대한 감정분석 데이터를 수집하고 수집한 내용과 코드 설명을 병행하여 제시하세요

   *참조 : 7주차 실습 코드 3. 종합실습 코드를 참조하여 새롭게 크롤링

2. Konlpy 또는 정규표현식(re) 라이브러리를 이용하여 위에서 수집한 감정분석에 대한 텍스트 데이터에 대하여 전처리를 하고 적용한 절차와 코드를 설명하세요. 

3. 2항의 전처리 결과를 문서별 코퍼스를 토큰화하여 DTM, TF-IDF에 의한 대략적인 키워드에 대한 빈도를 분석하고 결과를 설명하세요. 

4. 위에서 처리된 내용을 기초로 로지스틱 회귀분석에 의한 감정분석을 실시하고 계수들의 웨이트를 이용한 긍정과 부정의 키워드를 시각화하여 보여주고 코드와 결과를 설명하세요.

5. 위에서 분석한 결과를 기초로 빈도수와 긍정 부정 키워드의 관계, 자료의 불균형과 모형의 예측 정확도의 관계에 대하여 위에서 분석한 사례를 들어 논하세요.   

"""
문제의식: 영화 평가에 있어 시대에 따른 표현법의 차이에 대한 의문
절차 1: 2014 ~ 2023 각 연도별 흥행 상위 10위 영화 목록을 구한다.
절차 2: 2019 ~ 2023 개봉영화와 2014 ~ 2018 개봉영화의 별점과 평가를 수집한다.
절차 3: 각각의 코퍼스를 train set과 test set으로 구분한다.
절차 4: 2019 ~ 2023 개봉영화 train set으로 훈련한 모델을 두 개의 test set에 적용하여 정확도를 비교한다
절차 5: 2014 ~ 2018 개봉영화 train set으로 훈련한 모델을 두 개의 test set에 적용하여 정확도를 비교한다
예측 결과1: 시의적절한 훈련 데이터 수집의 필요성에 대해 논할 수 있다.
예측 결과2: 시대에 따른 영화 평가 방식의 표현법 차이를 수치화 할 수 있다.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import pandas as pd
import time
import requests
import json
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from sklearn.feature_extraction.text import TfidfTransformer
from collections import Counter

# 1. 데이터 크롤링

In [None]:
#기간 내 전체 영화 대상으로는 크롤링 시간이 장시간 소모되어 대상변경
#영진위 api로부터 각 년도간 영화정보 저장
# start_year = 2014
# end_year = 2023
# api_key = 'YOUR_KOBIS_API_KEY'  # do not keep a real API key in the notebook
# raw_df = pd.DataFrame(index=range(0,1), columns = ['moiveListResult','moiveList'])

# for page_num in range(1,1469):
#     api_url = f'http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json?key={api_key}&openStartDt={start_year}&openEndDt={end_year}&curPage={page_num}'
#     request = Request(api_url)
#     response = urlopen(request).read()
#     response_json = json.loads(response)
#     response_df = pd.DataFrame(response_json['movieListResult']['movieList'])
#     raw_df = pd.concat([raw_df, response_df], axis=0)
#     print(str(page_num) + '페이지 조회했습니다')

In [None]:
# Import the manually collected list of each year's top-10 box-office movies.
movie_ranking_df = pd.read_csv('movie_ranking.csv')

In [None]:
# Inspect the column dtypes and non-null counts of the ranking table.
movie_ranking_df.info()

In [None]:
# Reduce the release date to its 4-digit year.
# Casting to str first keeps the cell re-runnable: after the first run the column
# is already int and no longer has a `.str` accessor.
movie_ranking_df['개봉일'] = movie_ranking_df['개봉일'].astype(str).str[:4].astype(int)

In [None]:
# Preview the first rows after the release-year conversion.
movie_ranking_df.head()

In [None]:
# Split the ranking table into the two eras at the 2019 boundary.
released_before_2019 = movie_ranking_df['개봉일'] < 2019
released_since_2019 = movie_ranking_df['개봉일'] >= 2019
df_smaller_than_2019 = movie_ranking_df[released_before_2019]
df_greater_or_equal_to_2019 = movie_ranking_df[released_since_2019]

In [None]:
# Movie titles of each era as plain Python lists (used as search queries later).
older_movie_names, newer_movie_names = (
    era_df['영화명'].tolist()
    for era_df in (df_smaller_than_2019, df_greater_or_equal_to_2019)
)

In [None]:
# Show the first five titles of each era as a sanity check.
for era_names in (older_movie_names, newer_movie_names):
    print(era_names[:5])

In [None]:
# Crawl the Naver search result of every 2014-2018 movie and collect each
# review's comment text and star rating into older_movie_df.
data = []
times = 0  # running count of scroll rounds over all movies (progress log only)
for movie_name in older_movie_names:
    # Open a fresh Chrome session for each movie.
    driver = webdriver.Chrome()
    try:
        driver.get(f'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=영화+{movie_name}+평점')
        time.sleep(15)
        driver.execute_script("window.scrollTo(0, 800)")
        time.sleep(2)

        # Absolute XPath of the element that is scrolled to load more reviews.
        x_path = "/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div[2]/div[6]"
        try:
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which was removed in Selenium 4.
            to_scroll = driver.find_element(By.XPATH, x_path)
        except Exception as e:
            # Page layout differs for this movie: report it and move on.
            print(f"An error occurred: {str(e)} for {movie_name}")
            continue

        # Reset the per-movie comment and rating buffers.
        counts = 0
        movie_comments = []
        movie_ratings = []

        while True:
            all_contents = driver.find_elements(By.CSS_SELECTOR, "div.lego_review_list._scroller")

            for content in all_contents:
                li_comments = content.find_elements(By.CSS_SELECTOR, "span.desc._text")
                li_ratings = content.find_elements(By.CSS_SELECTOR, "div.area_text_box")
                # Only read the items that appeared since the previous round.
                for li_comment in li_comments[counts:]:
                    movie_comments.append(li_comment.text)

                for li_rating in li_ratings[counts:]:
                    rating_value = li_rating.get_attribute("textContent").replace("별점(10점 만점 중)", "").strip()
                    movie_ratings.append(int(rating_value))

            # Nothing new was loaded by the last scroll: the review list is exhausted.
            if len(movie_comments) == counts:
                print('종료')
                break

            counts = len(movie_comments)
            times += 1
            print(str(movie_name) +' ' + str(times) + ' 회 실행했습니다')

            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", to_scroll)
            time.sleep(3)
    finally:
        # Always release the browser, including on the skip paths and on errors.
        driver.quit()

    if not movie_comments:
        continue

    # One record per review for this movie.
    movie_data = {
        'movieName': [movie_name] * len(movie_comments),
        'comments': movie_comments,
        'ratings': movie_ratings
    }

    # Append this movie's records to the overall list.
    data.extend(pd.DataFrame(movie_data).to_dict('records'))

older_movie_df = pd.DataFrame(data)


In [None]:
# Crawl the Naver search result of every 2019-2023 movie and collect each
# review's comment text and star rating into newer_movie_df.
data = []
times = 0  # running count of scroll rounds over all movies (progress log only)
for movie_name in newer_movie_names:
    # Open a fresh Chrome session for each movie.
    # NOTE(review): the driver path is machine-specific and passed positionally;
    # confirm it matches the installed Selenium version.
    driver = webdriver.Chrome('chromedriver_mac_arm64/chromedriver')
    try:
        driver.get(f'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=영화+{movie_name}+평점')
        time.sleep(15)
        driver.execute_script("window.scrollTo(0, 800)")
        time.sleep(2)

        # Absolute XPath of the element that is scrolled to load more reviews.
        x_path = "/html/body/div[3]/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div[2]/div[6]"
        try:
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which was removed in Selenium 4.
            to_scroll = driver.find_element(By.XPATH, x_path)
        except Exception as e:
            # Page layout differs for this movie: report it and move on.
            print(f"An error occurred: {str(e)} for {movie_name}")
            continue

        # Reset the per-movie comment and rating buffers.
        counts = 0
        movie_comments = []
        movie_ratings = []

        while True:
            all_contents = driver.find_elements(By.CSS_SELECTOR, "div.lego_review_list._scroller")

            for content in all_contents:
                li_comments = content.find_elements(By.CSS_SELECTOR, "span.desc._text")
                li_ratings = content.find_elements(By.CSS_SELECTOR, "div.area_text_box")
                # Only read the items that appeared since the previous round.
                for li_comment in li_comments[counts:]:
                    movie_comments.append(li_comment.text)

                for li_rating in li_ratings[counts:]:
                    rating_value = li_rating.get_attribute("textContent").replace("별점(10점 만점 중)", "").strip()
                    movie_ratings.append(int(rating_value))

            # Nothing new was loaded by the last scroll: the review list is exhausted.
            if len(movie_comments) == counts:
                print('종료')
                break

            counts = len(movie_comments)
            times += 1
            print(str(movie_name) + ' ' + str(times) + ' 회 실행했습니다')

            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", to_scroll)
            time.sleep(3)
    finally:
        # Always release the browser, including on the skip paths and on errors.
        driver.quit()

    if not movie_comments:
        continue

    # One record per review for this movie.
    movie_data = {
        'movieName': [movie_name] * len(movie_comments),
        'comments': movie_comments,
        'ratings': movie_ratings
    }

    # Append this movie's records to the overall list.
    data.extend(pd.DataFrame(movie_data).to_dict('records'))


newer_movie_df = pd.DataFrame(data)


In [None]:
# Back up both crawled tables as CSV files.
for crawled_df, backup_path in (
    (newer_movie_df, 'newer_movie_df.csv'),
    (older_movie_df, 'older_movie_df.csv'),
):
    crawled_df.to_csv(backup_path)

In [None]:
# First-pass cleanup with a regular expression: drop every character that is not
# Hangul (jamo or syllable) or a space.
import re
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the comments unchanged.
older_movie_df['comments'] = older_movie_df['comments'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
newer_movie_df['comments'] = newer_movie_df['comments'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

In [None]:
# Derive the sentiment label from the star rating: 1 (positive) for 6 or more
# stars, otherwise 0 (negative).
older_movie_df['PN'] = older_movie_df['ratings'].apply(lambda stars: 0 if stars < 6 else 1)
newer_movie_df['PN'] = newer_movie_df['ratings'].apply(lambda stars: 0 if stars < 6 else 1)

In [None]:
# Inspect the newer-era table after cleanup and labelling.
newer_movie_df.info()

In [None]:
# Build the stop-word list (one word per line in the source file).
# Stop-word dictionary source: https://www.ranks.nl/stopwords/korean
# NOTE(review): `stop_words_txt` (path of the stop-word file) is not defined in the
# visible part of this notebook - confirm it is set before this cell runs.
# The file holds Hangul text, so read it as UTF-8 explicitly instead of relying on
# the platform default encoding (assumes the file was saved as UTF-8 - TODO confirm).
with open(stop_words_txt, encoding='utf-8') as f:
    lines = f.readlines()

stop_words = [line.rstrip('\n') for line in lines]
print(stop_words[:5])

In [None]:
from konlpy.tag import Mecab

In [None]:
# Morphological analyzer instance shared by the tokenization cells.
mecab = Mecab()

In [None]:
# Preview the older-era table before tokenization.
older_movie_df.head()

In [None]:
# Tokenize every 2014-2018 comment into morphemes and drop stop words.
# older_tokens keeps the token list per review; older_corpus keeps the same tokens
# joined by spaces (one string per review).
older_tokens, older_corpus = [], []
stop_word_set = set(stop_words)  # set lookup instead of scanning the list for every token
for comment in older_movie_df['comments']:
    kept_morphs = [morph for morph in mecab.morphs(comment) if morph not in stop_word_set]
    older_tokens.append(kept_morphs)
    older_corpus.append(' '.join(kept_morphs))

In [None]:
# Tokenize every 2019-2023 comment into morphemes and drop stop words.
# newer_tokens keeps the token list per review; newer_corpus keeps the same tokens
# joined by spaces (one string per review).
newer_tokens, newer_corpus = [], []
stop_word_set = set(stop_words)  # set lookup instead of scanning the list for every token
for comment in newer_movie_df['comments']:
    kept_morphs = [morph for morph in mecab.morphs(comment) if morph not in stop_word_set]
    newer_tokens.append(kept_morphs)
    newer_corpus.append(' '.join(kept_morphs))

In [None]:
# Separate the older-era token lists into positive and negative reviews.
p_older_tokens, n_older_tokens = [], []

older_labels = older_movie_df['PN'].values
for row_no in range(len(older_tokens)):
    bucket = p_older_tokens if older_labels[row_no] == 1 else n_older_tokens
    bucket.append(older_tokens[row_no])

print(n_older_tokens)

In [None]:
# Separate the newer-era token lists into positive and negative reviews.
p_newer_tokens, n_newer_tokens = [], []

newer_labels = newer_movie_df['PN'].values
for row_no in range(len(newer_tokens)):
    bucket = p_newer_tokens if newer_labels[row_no] == 1 else n_newer_tokens
    bucket.append(newer_tokens[row_no])

print(n_newer_tokens)
print(p_newer_tokens)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import gensim

In [None]:
# Flatten the per-review token lists into one long token list per sentiment.
p_older_corpus = [''.join(token) for review in p_older_tokens for token in review]
n_older_corpus = [''.join(token) for review in n_older_tokens for token in review]
# Join and re-split so any token containing whitespace is broken up as well.
p1_older_corpus = ' '.join(p_older_corpus).split()
n1_older_corpus = ' '.join(n_older_corpus).split()
print(n1_older_corpus)

In [None]:
# Flatten the per-review token lists into one long token list per sentiment.
p_newer_corpus = [''.join(token) for review in p_newer_tokens for token in review]
n_newer_corpus = [''.join(token) for review in n_newer_tokens for token in review]
# Join and re-split so any token containing whitespace is broken up as well.
p1_newer_corpus = ' '.join(p_newer_corpus).split()
n1_newer_corpus = ' '.join(n_newer_corpus).split()
print(n1_newer_corpus)

In [None]:
# Confirm the container type of the flattened token list.
type(p1_older_corpus)

In [None]:
# Token frequencies per era and sentiment, then the ten most frequent tokens of each.
older_counter_p, older_counter_n, newer_counter_p, newer_counter_n = map(
    Counter,
    (p1_older_corpus, n1_older_corpus, p1_newer_corpus, n1_newer_corpus),
)
for token_counter in (older_counter_p, older_counter_n, newer_counter_p, newer_counter_n):
    print(token_counter.most_common(10))

In [None]:
import matplotlib.font_manager as fm

# Names of the fonts registered with matplotlib's font manager.
font_list = []
for font_entry in fm.fontManager.ttflist:
    font_list.append(font_entry.name)
font_list

In [None]:
%matplotlib inline
import nltk
import matplotlib.pyplot as plt
# Wrap the flattened older-era token lists so nltk can plot their frequencies.
pos = nltk.Text(p1_older_corpus)
neg = nltk.Text(n1_older_corpus)
# Font for the Hangul tick labels.
# NOTE(review): 'AppleMyungjo' looks macOS-specific - confirm on other platforms.
plt.rcParams['font.family'] = 'AppleMyungjo'

# Figure 1: positive reviews (argument 30 limits the plot to 30 tokens).
plt.figure(1)
pos.plot(30)

# Figure 2: negative reviews.
plt.figure(2)
neg.plot(30)
plt.show()

In [None]:
# Wrap the flattened newer-era token lists so nltk can plot their frequencies.
pos = nltk.Text(p1_newer_corpus)
neg = nltk.Text(n1_newer_corpus)
# Font for the Hangul tick labels.
# NOTE(review): 'AppleMyungjo' looks macOS-specific - confirm on other platforms.
plt.rcParams['font.family'] = 'AppleMyungjo'

# Figure 1: positive reviews (argument 30 limits the plot to 30 tokens).
plt.figure(1)
pos.plot(30)

# Figure 2: negative reviews.
plt.figure(2)
neg.plot(30)
plt.show()

# 전체 텍스트 대상 tf-idf 구성

In [None]:
# Build the gensim vocabulary (token -> integer id) from the tokenized 2014-2018 reviews.
g_dictionary = gensim.corpora.Dictionary(older_tokens)
g_corpus = [g_dictionary.doc2bow(text) for text in older_tokens]
# Reuse that vocabulary for the document-term matrix so the columns follow the gensim ids.
# NOTE(review): CountVectorizer re-tokenizes the joined strings with its default
# token_pattern (two or more word characters), so one-character morphemes in the
# vocabulary would get all-zero columns - confirm this is intended.
vector = CountVectorizer(vocabulary=g_dictionary.token2id)
older_dtm = vector.fit_transform(older_corpus).toarray()
# Column labels; assumes token2id iterates in id order - TODO confirm.
col = g_dictionary.token2id.keys()
pd.DataFrame(older_dtm, columns=col)

In [None]:
# Re-weight the older-era term counts with TF-IDF (fitted on the full older DTM).
tfidf_vectorizer = TfidfTransformer()
older_tf_idf = tfidf_vectorizer.fit_transform(older_dtm).toarray()
col = g_dictionary.token2id.keys()
print(older_tf_idf.shape)
pd.DataFrame(older_tf_idf, columns=col)

In [None]:
# Raw view of the older-era TF-IDF array.
print(older_tf_idf)

In [None]:
# Sum the TF-IDF weight of each token over all older-era reviews and list the 40 heaviest.
pd_older_tf_idf = pd.DataFrame(older_tf_idf, columns=col)
older_token_weight = pd_older_tf_idf.sum()
older_token_weight.sort_values(ascending=False).head(40)

In [None]:
# Build the gensim vocabulary (token -> integer id) from the tokenized 2019-2023 reviews.
# This rebinds g_dictionary, g_corpus, vector and col, replacing the older-era objects.
g_dictionary = gensim.corpora.Dictionary(newer_tokens)
g_corpus = [g_dictionary.doc2bow(text) for text in newer_tokens]
# Reuse that vocabulary for the document-term matrix so the columns follow the gensim ids.
# NOTE(review): CountVectorizer re-tokenizes the joined strings with its default
# token_pattern (two or more word characters), so one-character morphemes in the
# vocabulary would get all-zero columns - confirm this is intended.
vector = CountVectorizer(vocabulary=g_dictionary.token2id)
newer_dtm = vector.fit_transform(newer_corpus).toarray()
# Column labels; assumes token2id iterates in id order - TODO confirm.
col = g_dictionary.token2id.keys()
pd.DataFrame(newer_dtm, columns=col)

In [None]:
# Re-weight the newer-era term counts with TF-IDF (fitted on the full newer DTM).
tfidf_vectorizer = TfidfTransformer()
newer_tf_idf = tfidf_vectorizer.fit_transform(newer_dtm).toarray()
col = g_dictionary.token2id.keys()
print(newer_tf_idf.shape)
pd.DataFrame(newer_tf_idf, columns=col)

In [None]:
# Sum the TF-IDF weight of each token over all newer-era reviews and list the 40 heaviest.
pd_newer_tf_idf = pd.DataFrame(newer_tf_idf, columns=col)
newer_token_weight = pd_newer_tf_idf.sum()
newer_token_weight.sort_values(ascending=False).head(40)

# 3.로지스틱 회귀에 의한 감정분석

In [None]:
# Class balance of the older-era labels (1 = positive, 0 = negative).
older_movie_df['PN'].value_counts()

In [None]:
# Class balance of the newer-era labels (1 = positive, 0 = negative).
newer_movie_df['PN'].value_counts()

In [None]:
# Put the label column next to the older-era TF-IDF features in one table.
X = older_tf_idf
y = older_movie_df['PN']
# concat aligns on the index; assumes older_movie_df has the default 0..n-1 index
# so the labels line up with the feature rows - TODO confirm.
older_data = pd.concat([y,pd.DataFrame(X)],axis = 1)

In [None]:
# Put the label column next to the newer-era TF-IDF features in one table.
X = newer_tf_idf
y = newer_movie_df['PN']
# concat aligns on the index; assumes newer_movie_df has the default 0..n-1 index
# so the labels line up with the feature rows - TODO confirm.
newer_data = pd.concat([y,pd.DataFrame(X)],axis = 1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of each era's labelled table as a test set, with the same seed for both.
older_train_df,older_test_df = train_test_split(older_data, test_size = 0.2, random_state=256)
newer_train_df,newer_test_df = train_test_split(newer_data, test_size = 0.2, random_state=256)
older_train_df.head()

In [None]:
# Separate features and label for the older-era train and test tables.
older_y_train, older_y_test = older_train_df['PN'], older_test_df['PN']
older_x_train = older_train_df.drop(columns=['PN'])
older_x_test = older_test_df.drop(columns=['PN'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Train logistic regression on the (imbalanced) older-era training set.
older_lr = LogisticRegression(random_state=0)
older_lr.fit(older_x_train, older_y_train)

# Evaluate on the held-out older-era test set.
older_y_pred = older_lr.predict(older_x_test)
older_a1, older_p1, older_r1, older_f1 = (
    metric(older_y_test, older_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('accuracy: %.2f' % older_a1 )
print('precision: %.2f' % older_p1)
print('recall: %.2f' % older_r1)
print('F1: %.2f' % older_f1)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the imbalanced older-era model, drawn as an annotated heatmap.
confu = confusion_matrix(older_y_test, older_y_pred)

heatmap_style = {'annot': True, 'annot_kws': {'size':15}, 'cmap': 'OrRd', 'fmt': '.10g'}
plt.figure(figsize=(4, 3))
sns.heatmap(confu, **heatmap_style)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Under-sampling: draw the same number of positive and negative older-era reviews.
# The target is 1000 per class, capped at the size of the smaller class so that
# sample() does not raise when a class has fewer than 1000 reviews.
older_positive_rows = older_movie_df[older_movie_df['PN']==1]
older_negative_rows = older_movie_df[older_movie_df['PN']==0]
older_sample_size = min(1000, len(older_positive_rows), len(older_negative_rows))
older_positive_random_idx = older_positive_rows.sample(older_sample_size, random_state=12).index.tolist()
older_negative_random_idx = older_negative_rows.sample(older_sample_size, random_state=12).index.tolist()

In [None]:
# Build the balanced older-era sample and split it 80/20.
older_random_idx = [*older_positive_random_idx, *older_negative_random_idx]
x = older_tf_idf[older_random_idx]
y = older_movie_df['PN'][older_random_idx]
older_balanced_parts = train_test_split(x, y, test_size=0.2, random_state=1)
older_x_train2, older_x_test2, older_y_train2, older_y_test2 = older_balanced_parts

In [None]:
# Train on the balanced older-era sample and predict its held-out part.
older_lr2 = LogisticRegression(random_state=0).fit(older_x_train2, older_y_train2)
older_y_pred2 = older_lr2.predict(older_x_test2)

In [None]:
# Scores of the balanced older-era model on its test set.
older_a2, older_p2, older_r2, older_f2 = (
    metric(older_y_test2, older_y_pred2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('accuracy: %.2f' % older_a2 )
print('precision: %.2f' % older_p2)
print('recall: %.2f' % older_r2)
print('F1: %.2f' % older_f2)

In [None]:
# Confusion matrix after under-sampling.

from sklearn.metrics import confusion_matrix

confu = confusion_matrix(older_y_test2, older_y_pred2)

heatmap_style = {'annot': True, 'annot_kws': {'size':15}, 'cmap': 'OrRd', 'fmt': '.10g'}
plt.figure(figsize=(4, 3))
sns.heatmap(confu, **heatmap_style)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Side-by-side score table: imbalanced model vs under-sampled model (older era).
pd.options.display.float_format = '{:.2f}'.format
score_columns = ['Accuracy(정확도)', 'Precision(정밀도)', 'Recall(재현율)', 'F1']
run_labels = ['1차 Imbalance Data', '2차 Under-sampling']

tr1 = pd.DataFrame([older_a1,older_p1,older_r1,older_f1])
tr2 = pd.DataFrame([older_a2,older_p2,older_r2,older_f2])

# Each frame is a single column of four scores; transpose to one row per run.
test_result = pd.concat([tr1.transpose(), tr2.transpose()], axis=0)
test_result.columns = score_columns
test_result.index = run_labels
test_result

In [None]:
# Coefficients of the under-sampled older-era model (one weight per vocabulary column).
older_lr2.coef_

In [None]:
# Bar chart of every coefficient of the imbalanced older-era model.

older_lr_weights = older_lr.coef_[0]
plt.figure(figsize=(10, 8))
plt.bar(range(len(older_lr_weights)), older_lr_weights)

In [None]:
# (coefficient, column index) pairs sorted from largest to smallest coefficient;
# print the five largest and the five smallest.
older_ranked_coefs = sorted(
    ((value, index) for index, value in enumerate(older_lr.coef_[0])),
    reverse=True,
)
print(older_ranked_coefs[:5])
print(older_ranked_coefs[-5:])

In [None]:
# Rank the under-sampled model's coefficients: descending for positive keywords,
# ascending for negative keywords. Each entry is (coefficient, column index).
older_coef_pairs = [(value, index) for index, value in enumerate(older_lr2.coef_[0])]
older_coef_pos_index = sorted(older_coef_pairs, reverse=True)
older_coef_neg_index = sorted(older_coef_pairs)

In [None]:
# Column index -> token lookup for the older-era vocabulary.
# `older_id_to_word` was never defined in this notebook, and `g_dictionary` has been
# rebound to the newer-era vocabulary by this point, so rebuild the mapping from
# older_tokens (the same input the older DTM vocabulary was built from).
older_id_to_word = {
    token_id: token
    for token, token_id in gensim.corpora.Dictionary(older_tokens).token2id.items()
}
invert_index_vectorizer = older_id_to_word

In [None]:
# Ten tokens with the largest (most positive) coefficients and their scores.
older_pos_top_word, older_pos_top_score = [], []
for score, column_no in older_coef_pos_index[:10]:
    word = invert_index_vectorizer[column_no]
    print(word, score)
    older_pos_top_word.append(word)
    older_pos_top_score.append(score)

In [None]:
# Ten tokens with the smallest (most negative) coefficients and their scores.
older_neg_top_word, older_neg_top_score = [], []
for score, column_no in older_coef_neg_index[:10]:
    word = invert_index_vectorizer[column_no]
    print(word, score)
    older_neg_top_word.append(word)
    older_neg_top_score.append(score)

In [None]:
# Reverse the positive lists in place so scores ascend, then append them after the
# negative ones: most negative first, most positive last.
older_pos_top_word[:] = older_pos_top_word[::-1]
older_pos_top_score[:] = older_pos_top_score[::-1]
older_top_word = [*older_neg_top_word, *older_pos_top_word]
older_top_score = [*older_neg_top_score, *older_pos_top_score]

In [None]:
# Vertical bar chart of the top negative (red) and positive (green) older-era keywords.
plt.figure(figsize=(10, 4))
plt.rcParams["axes.unicode_minus"] = False  # keep the minus sign from breaking with a Hangul font

# Draw each group once at explicit positions. The previous version drew a third,
# single-colour bar series over the same slots, which hid the red/green bars.
older_neg_count = len(older_neg_top_word)
plt.bar(range(older_neg_count), older_neg_top_score, label = "부정", color = 'r')
plt.bar(range(older_neg_count, len(older_top_word)), older_pos_top_score, label = "긍정", color = 'g')

plt.xticks(range(len(older_top_word)), older_top_word)
plt.legend()  # the labels above are only shown when a legend is drawn
plt.show()

In [None]:
# Horizontal bar chart of the top negative (red) and positive (green) older-era keywords.
plt.figure(figsize=[14, 10])
plt.barh(older_neg_top_word, older_neg_top_score, label = "부정", color = 'r')
plt.barh(older_pos_top_word, older_pos_top_score, label = "긍정", color = 'g')
plt.legend()
plt.xlabel('키워드별 Vectorized Score')
plt.ylabel('Top 10 키워드')
# Set the title of the plot
plt.title('2014~2018 개봉영화 후기 감정 분석')
# Save the plot as a PNG file
plt.savefig('2BarPlot.png')
# Display the bar plot
plt.show()

In [None]:
# Separate features and label for the newer-era train and test tables.
newer_y_train, newer_y_test = newer_train_df['PN'], newer_test_df['PN']
newer_x_train = newer_train_df.drop(columns=['PN'])
newer_x_test = newer_test_df.drop(columns=['PN'])

In [None]:
# Train logistic regression on the (imbalanced) newer-era training set.
newer_lr = LogisticRegression(random_state=0)
newer_lr.fit(newer_x_train, newer_y_train)

# Evaluate on the held-out newer-era test set.
newer_y_pred = newer_lr.predict(newer_x_test)
newer_a1, newer_p1, newer_r1, newer_f1 = (
    metric(newer_y_test, newer_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print('accuracy: %.2f' % newer_a1)
print('precision: %.2f' % newer_p1)
print('recall: %.2f' % newer_r1)
print('F1: %.2f' % newer_f1)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the imbalanced newer-era model, drawn as an annotated heatmap.
confu = confusion_matrix(newer_y_test, newer_y_pred)

heatmap_style = {'annot': True, 'annot_kws': {'size':15}, 'cmap': 'OrRd', 'fmt': '.10g'}
plt.figure(figsize=(4, 3))
sns.heatmap(confu, **heatmap_style)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Under-sampling: draw the same number of positive and negative newer-era reviews.
# The target is 1000 per class, capped at the size of the smaller class so that
# sample() does not raise when a class has fewer than 1000 reviews.
newer_positive_rows = newer_movie_df[newer_movie_df['PN']==1]
newer_negative_rows = newer_movie_df[newer_movie_df['PN']==0]
newer_sample_size = min(1000, len(newer_positive_rows), len(newer_negative_rows))
newer_positive_random_idx = newer_positive_rows.sample(newer_sample_size, random_state=12).index.tolist()
newer_negative_random_idx = newer_negative_rows.sample(newer_sample_size, random_state=12).index.tolist()

In [None]:
# Build the balanced newer-era sample and split it into train and test parts.
newer_random_idx = newer_positive_random_idx + newer_negative_random_idx
x = newer_tf_idf[newer_random_idx]
y = newer_movie_df['PN'][newer_random_idx]
# Use the same 80/20 ratio as the older-era balanced split so the two eras'
# scores are measured under the same conditions (this was 0.25 before).
newer_x_train2, newer_x_test2, newer_y_train2, newer_y_test2 = train_test_split(x, y, test_size=0.2, random_state=1)

In [None]:
# Train on the balanced newer-era sample and predict its held-out part.
newer_lr2 = LogisticRegression(random_state=0).fit(newer_x_train2, newer_y_train2)
newer_y_pred2 = newer_lr2.predict(newer_x_test2)

In [None]:
# Scores of the balanced newer-era model on its test set.
newer_a2, newer_p2, newer_r2, newer_f2 = (
    metric(newer_y_test2, newer_y_pred2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('accuracy: %.2f' % newer_a2)
print('precision: %.2f' % newer_p2)
print('recall: %.2f' % newer_r2)
print('F1: %.2f' % newer_f2)

In [None]:
# Confusion matrix after under-sampling.

from sklearn.metrics import confusion_matrix

confu = confusion_matrix(newer_y_test2, newer_y_pred2)

heatmap_style = {'annot': True, 'annot_kws': {'size':15}, 'cmap': 'OrRd', 'fmt': '.10g'}
plt.figure(figsize=(4, 3))
sns.heatmap(confu, **heatmap_style)
plt.title('Confusion Matrix')
plt.show()

# Side-by-side score table: imbalanced model vs under-sampled model (newer era).
pd.options.display.float_format = '{:.2f}'.format
score_columns = ['Accuracy(정확도)', 'Precision(정밀도)', 'Recall(재현율)', 'F1']
run_labels = ['1차 Imbalance Data', '2차 Under-sampling']

tr1 = pd.DataFrame([newer_a1,newer_p1,newer_r1,newer_f1])
tr2 = pd.DataFrame([newer_a2,newer_p2,newer_r2,newer_f2])

# Each frame is a single column of four scores; transpose to one row per run.
test_result = pd.concat([tr1.transpose(), tr2.transpose()], axis=0)
test_result.columns = score_columns
test_result.index = run_labels
test_result

In [None]:
# Coefficients of the imbalanced newer-era model.
# NOTE(review): the matching older-era cell shows older_lr2 (the under-sampled
# model) at this step - confirm which model is meant here.
newer_lr.coef_

In [None]:
# Bar chart of every coefficient of the imbalanced newer-era model.

plt.figure(figsize=(10, 8))
plt.bar(range(len(newer_lr.coef_[0])), newer_lr.coef_[0])

# Five largest and five smallest coefficients as (coefficient, column index) pairs.
print(sorted(((value, index) for index, value in enumerate(newer_lr.coef_[0])), reverse = True)[:5])
print(sorted(((value, index) for index, value in enumerate(newer_lr.coef_[0])), reverse = True)[-5:])
# enumerate yields (index, element) tuples for the collection

# Rank the keywords with the under-sampled model (newer_lr2), as the older-era
# section does with older_lr2, so the two eras' keyword charts are comparable.
# This previously ranked newer_lr, the model trained on imbalanced data.
newer_coef_pos_index = sorted(((value, index) for index, value in enumerate(newer_lr2.coef_[0])), reverse = True)
newer_coef_neg_index = sorted(((value, index) for index, value in enumerate(newer_lr2.coef_[0])), reverse = False)
newer_coef_pos_index[:10]
newer_coef_neg_index[:10]

In [None]:
# 일부 중요 원소들 단어사전 확인
print('긍정리뷰 키워드: ', newer_id_to_word[79],newer_id_to_word[92]) 
print('부정리뷰 키워드: ', newer_id_to_word[416],newer_id_to_word[513]) 

In [None]:
# Column index -> token lookup for the newer-era vocabulary.
invert_index_vectorizer = newer_id_to_word

# Ten tokens with the largest (most positive) coefficients and their scores.
newer_pos_top_word, newer_pos_top_score = [], []
for score, column_no in newer_coef_pos_index[:10]:
    word = invert_index_vectorizer[column_no]
    print(word, score)
    newer_pos_top_word.append(word)
    newer_pos_top_score.append(score)

# Ten tokens with the smallest (most negative) coefficients and their scores.
newer_neg_top_word, newer_neg_top_score = [], []
for score, column_no in newer_coef_neg_index[:10]:
    word = invert_index_vectorizer[column_no]
    print(word, score)
    newer_neg_top_word.append(word)
    newer_neg_top_score.append(score)


In [None]:
# Reverse the positive lists in place so scores ascend, then append them after the
# negative ones: most negative first, most positive last.
newer_pos_top_word[:] = newer_pos_top_word[::-1]
newer_pos_top_score[:] = newer_pos_top_score[::-1]
newer_top_word = [*newer_neg_top_word, *newer_pos_top_word]
newer_top_score = [*newer_neg_top_score, *newer_pos_top_score]

In [None]:
# Vertical bar chart of the top negative (red) and positive (green) newer-era keywords.
plt.figure(figsize=(10, 4))
plt.rcParams["axes.unicode_minus"] = False  # keep the minus sign from breaking with a Hangul font

# Draw each group once at explicit positions. The previous version drew a third,
# single-colour bar series over the same slots, which hid the red/green bars.
newer_neg_count = len(newer_neg_top_word)
plt.bar(range(newer_neg_count), newer_neg_top_score, label = "부정", color = 'r')
plt.bar(range(newer_neg_count, len(newer_top_word)), newer_pos_top_score, label = "긍정", color = 'g')

plt.xticks(range(len(newer_top_word)), newer_top_word)
plt.legend()  # the labels above are only shown when a legend is drawn
plt.show()

In [None]:
# Horizontal bar chart of the top negative (red) and positive (green) newer-era keywords.
plt.figure(figsize=[14, 10])
plt.barh(newer_neg_top_word, newer_neg_top_score, label = "부정", color = 'r')
plt.barh(newer_pos_top_word, newer_pos_top_score, label = "긍정", color = 'g')
plt.legend()
plt.xlabel('키워드별 Vectorized Score')
plt.ylabel('Top 10 키워드')
# Set the title of the plot
plt.title('2019~2023 개봉영화 후기 감정 분석')
# Save under its own file name: the 2014~2018 chart is saved as '2BarPlot.png',
# and writing to that same name here would overwrite it.
plt.savefig('2BarPlot_newer.png')
# Display the bar plot
plt.show()

# 모델별 크로스 검증
1. 구영화 훈련 > 신영화 검증
2. 신영화 훈련 > 구영화 검증
필요 작업: 각 test_set의 feature 개수 통일 

In [None]:
#cross체크용 회귀모델 재훈련
cross_older_lr = LogisticRegression(random_state = 0)
cross_older_lr.fit(older_x_train2, older_y_train2)
cross_older_y_pred = cross_older_lr.predict(newer_x_test2)

In [None]:
# Scores of the older-era model on the newer-era test reviews.
cross_older_a1, cross_older_p1, cross_older_r1, cross_older_f1 = (
    metric(newer_y_test2, cross_older_y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print('accuracy: %.2f' % cross_older_a1)
print('precision: %.2f' % cross_older_p1)
print('recall: %.2f' % cross_older_r1)
print('F1: %.2f' % cross_older_f1)

In [None]:
# Cross-check: train on the 2019-2023 training set and evaluate on the 2014-2018
# test reviews.
cross_newer_lr = LogisticRegression(random_state = 0)
cross_newer_lr.fit(newer_x_train, newer_y_train)

# older_x_test was built over the older vocabulary, so its columns do not match
# the features this model was trained on. Re-vectorize the older test reviews in
# the newer feature space instead: newer vocabulary, IDF weights fitted on newer_dtm.
newer_space_vocab = gensim.corpora.Dictionary(newer_tokens).token2id
newer_space_tfidf = TfidfTransformer().fit(newer_dtm)
# older_y_test keeps the row labels of older_movie_df; they are used as positions
# into older_corpus (assumes the default 0..n-1 index - TODO confirm).
older_test_docs = [older_corpus[row] for row in older_y_test.index]
cross_older_x_test = newer_space_tfidf.transform(
    CountVectorizer(vocabulary=newer_space_vocab).transform(older_test_docs)
).toarray()
# Predictions are in the same row order as older_y_test.
cross_newer_y_pred = cross_newer_lr.predict(cross_older_x_test)