In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the review CSV is read from under this
# mount point in a later cell. This call only works inside Google Colab.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE: this silences every warning for the rest of the session, including
# pandas and matplotlib diagnostics.
warnings.filterwarnings('ignore')

# Plot titles contain Hangul, so a Korean-capable font is required.
# 'AppleGothic' ships only with macOS; on other platforms it is not registered and
# matplotlib would fall back to a font without Hangul glyphs. Pick the first
# candidate that matplotlib has actually registered; if none is found, keep
# matplotlib's default instead of pointing at a missing family.
from matplotlib import font_manager

_registered_fonts = {entry.name for entry in font_manager.fontManager.ttflist}
_korean_font = next(
    (
        name
        for name in ('AppleGothic', 'NanumGothic', 'NanumBarunGothic', 'Malgun Gothic')
        if name in _registered_fonts
    ),
    None,
)
if _korean_font is not None:
    plt.rcParams['font.family'] = _korean_font

In [None]:
# Load the tab-separated review file from the mounted Drive folder and display it.
reviews = pd.read_csv(
    '/content/drive/MyDrive/archive/coupang_reviews.csv',
    sep='\t',
    encoding='utf-8',
)
reviews

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
reviews.info()

In [None]:
# Frequency of each distinct value found in the rating column.
reviews['rating'].value_counts()

In [None]:
# Show the rows whose rating equals the string '4'.
reviews.loc[reviews['rating'].eq('4')]

In [None]:
# Count the rows whose rating is not one of the five valid rating strings.
is_valid_rating = reviews['rating'].isin(['1', '2', '3', '4', '5'])
int((~is_valid_rating).sum())

7917

In [None]:
# Keep only rows whose rating is one of 1-5 and cast the column to int.
#
# The comparison is done on the string form of the column so that this cell is
# idempotent: once `rating` has been cast to int, comparing it against the
# string labels directly would match nothing and a second run would empty the
# frame. `.copy()` detaches the filtered rows from the original frame so the
# column assignment below is not a write into a slice.
valid_ratings = ['1', '2', '3', '4', '5']
is_valid_rating = reviews['rating'].astype(str).isin(valid_ratings)
reviews = reviews[is_valid_rating].copy()
reviews['rating'] = reviews['rating'].astype('int')

In [None]:
# Drop every row in which headline or review_content is missing
# (only rows with both fields present are kept).
reviews = reviews.dropna(subset=['headline', 'review_content'], how='any')

In [None]:
# Rows in which both text fields contain their "nothing registered" placeholder,
# i.e. the review carries no actual text.
headline_is_placeholder = reviews['headline'].str.contains('등록된 헤드라인이')
content_is_placeholder = reviews['review_content'].str.contains('등록된 리뷰내용이')
reviews[headline_is_placeholder & content_is_placeholder]

Unnamed: 0,rating,headline,review_content


In [None]:
# Review count per rating, listed from the highest rating down.
reviews['rating'].value_counts().sort_index(ascending=False)

In [None]:
# Rating distribution, highest rating first. The count is embedded in each bar's
# label, so the y-axis ticks are hidden.
rating_counts = reviews['rating'].value_counts().sort_index(ascending=False)
x = [f"{rating} ({count})" for rating, count in rating_counts.items()]

sns.barplot(x=x, y=rating_counts.values)
plt.title('평점 분포')
plt.yticks([])  # remove the y-axis tick marks
plt.show()


In [None]:
# Replace the "nothing registered" placeholder texts with an empty string:
#   headline       containing '등록된 헤드라인이' -> ''
#   review_content containing '등록된 리뷰내용이' -> ''
for column, marker in (
    ('headline', '등록된 헤드라인이'),
    ('review_content', '등록된 리뷰내용이'),
):
    is_placeholder = reviews[column].str.contains(marker)
    reviews[column] = np.where(is_placeholder, '', reviews[column])

In [None]:
# Join headline and review body with a single space into `content`, then keep
# only the rating and the combined text, renumbering the rows from 0.
reviews = (
    reviews
    .assign(content=reviews['headline'] + " " + reviews['review_content'])
    .loc[:, ['rating', 'content']]
    .reset_index(drop=True)
)

In [None]:
# Remove duplicated reviews (same rating and same content), keeping the first
# occurrence. Prints: number of duplicates, row count before, row count after.
dedup_keys = ['rating', 'content']
print(reviews.duplicated(subset=dedup_keys).sum())
print(len(reviews))
reviews = reviews.drop_duplicates(subset=dedup_keys, keep='first')
print(len(reviews))

In [None]:
!pip install konlpy
from konlpy.tag import Komoran, Okt, Kkma
import re

# Instantiate the three KoNLPy taggers. In the cells visible here only `okt` is
# called by live code; `kkma` and `komoran` are referenced solely from the
# commented-out tagger comparison further down.
kkma = Kkma()
komoran = Komoran()
okt = Okt()

def apply_regular_expression(text):
    """Return `text` with everything except spaces and Hangul removed.

    Characters kept: the space character, Hangul compatibility jamo in the
    range ㄱ-ㅣ, and Hangul syllables in the range 가-힣. Every other character
    (Latin letters, digits, punctuation, emoji, newlines, ...) is deleted.
    """
    return re.sub('[^ ㄱ-ㅣ 가-힣]', '', text)

In [None]:
# Clean one sample review; `s` is the text the tagger experiments below run on.
s = apply_regular_expression(reviews['content'].iloc[63])
print(s)

# morphs : morpheme extraction
# nouns : noun extraction

# kkma -> far too slow
# komoran -> fast, but splits nouns too finely (e.g. 배송 -> 배, 송)
# okt -> fast, and performs well at noun extraction

# Judged that extracting nouns only, without full morphemes, is enough to capture the meaning
# print(kkma.morphs(s))
# print(kkma.nouns(s))
# print()
# print(komoran.morphs(s))
# print(komoran.nouns(s))
# print()
# print(okt.morphs(s))
# print(okt.nouns(s))

In [None]:
# Single-character nouns carry no useful meaning here, so keep only nouns of
# two or more characters.
nouns = okt.nouns(s)
print([noun for noun in nouns if len(noun) >= 2])

In [None]:
# Frequency analysis: count how often each multi-character noun occurs in the sample.
from collections import Counter

counter = Counter(word for word in okt.nouns(s) if len(word) > 1)

counter

In [None]:
# Stopword list used to filter tokens.
#
# `stopwords` must be a flat list of str, because the tokenizer tests membership
# with `word in stopwords` where `word` is a str:
#   * DataFrame.values.tolist() would produce one-element lists ([['w1'], ['w2'], ...]),
#     which never compare equal to a str, so no downloaded stopword would be removed;
#     the first column is therefore taken as a Series before converting to a list.
#   * header=None keeps the first line of the file as data instead of consuming it
#     as a column name.
#   * keep_default_na=False keeps every entry a string (no token turned into NaN).
stopwords_url = "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt"
stopwords = (
    pd.read_csv(stopwords_url, header=None, keep_default_na=False)
    .iloc[:, 0]
    .astype(str)
    .str.strip()
    .tolist()
)
print(len(stopwords))
print(stopwords[:10])

# Add stopwords that appear very frequently in this review data.
stopwords.extend(['쿠팡', '리뷰'])


In [None]:
from sklearn.model_selection import train_test_split

def split_rating(rating):
    """Map a rating to a sentiment label.

    Returns 1 when the rating equals 5, 0 when it is 1, 2 or 3, and -1 for
    anything else (including 4).
    """
    return 1 if rating == 5 else (0 if rating in (1, 2, 3) else -1)

# Label every review, then draw an equally sized random sample of positive (1)
# and negative (0) reviews; rows labelled -1 are not sampled.
reviews['sentiment'] = reviews['rating'].map(split_rating)
print(reviews['sentiment'].value_counts())

is_positive = reviews['sentiment'].eq(1)
is_negative = reviews['sentiment'].eq(0)
reviews_sample_positive = reviews.loc[is_positive].sample(n=15000, random_state=1353)
reviews_sample_negative = reviews.loc[is_negative].sample(n=15000, random_state=1353)
reviews_sample = pd.concat(
    [reviews_sample_positive, reviews_sample_negative],
    ignore_index=True,
)
print(reviews_sample['sentiment'].value_counts())
print(reviews_sample.shape)

# Hold out 40% of the sample, then cut that hold-out in half, giving a
# 60% / 20% / 20% train / valid / test split.
train_x, temp_x, train_y, temp_y = train_test_split(
    reviews_sample['content'],
    reviews_sample['sentiment'],
    test_size=0.4,
    random_state=1353,
)
test_x, valid_x, test_y, valid_y = train_test_split(
    temp_x,
    temp_y,
    test_size=0.5,
    random_state=1353,
)

del temp_x, temp_y

# Shapes of each split, then the class balance of each split.
for _split_x, _split_y in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)):
    print(_split_x.shape, _split_y.shape)

for _split_y in (train_y, valid_y, test_y):
    print(_split_y.value_counts())



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def text_cleaning(text):
    """Tokenize one review into the nouns used as bag-of-words features.

    Steps:
      1. strip everything except spaces and Hangul (shared helper
         `apply_regular_expression`, so the cleaning rule lives in one place),
      2. extract nouns with the Okt tagger,
      3. drop single-character nouns and anything listed in `stopwords`.

    Returns a list of str tokens.
    """
    cleaned = apply_regular_expression(text)
    return [
        word
        for word in okt.nouns(cleaned)
        if len(word) > 1 and word not in stopwords
    ]

# The function is passed directly as the tokenizer (no lambda wrapper needed).
cv = CountVectorizer(tokenizer=text_cleaning)
# fit_transform learns the vocabulary and builds the training matrix in one pass,
# so the training reviews go through the tagger once instead of twice.
x_train_cv = cv.fit_transform(train_x)
x_valid_cv = cv.transform(valid_x)
x_test_cv = cv.transform(test_x)

In [None]:
# Inspect the bag-of-words matrix without converting all of it to a dense array:
# only a few rows are densified for the preview, and the per-word totals are
# summed directly on the sparse matrix.
print(x_train_cv[:5].toarray()) # per-review occurrence count of each word. row: review, column: word
print(x_train_cv.shape)
word_list = cv.get_feature_names_out() # vocabulary, aligned with the matrix columns
# Column sums = total frequency of each word; flattened to a 1-D ndarray.
count_list: np.ndarray = np.asarray(x_train_cv.sum(axis=0)).ravel()
print(len(word_list))
print(len(count_list))
print(word_list[count_list.argsort()[::-1]][:30]) # the 30 most frequent words
print(dict(zip(word_list, count_list))) # word -> frequency dictionary

In [None]:
# TF-IDF
# TfidfVectorizer: converts raw text into a TF-IDF matrix
# TfidfTransformer: converts an already computed term-frequency matrix into a TF-IDF matrix

from sklearn.feature_extraction.text import TfidfTransformer

# Fit on the training counts only, then apply the same weighting to every split.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_cv)
x_valid_tfidf, x_test_tfidf = (
    tfidf_transformer.transform(counts) for counts in (x_valid_cv, x_test_cv)
)

In [None]:
# Compare the first training review under both weightings.
first_bow_row = x_train_cv[0]
first_tfidf_row = x_train_tfidf[0]

print(x_train_tfidf.shape, end='\n\n')  # (number of reviews, vocabulary size)
print('BOW에서 단어의 중요도(0이 아닌 것만 출력)')
print(first_bow_row, end='\n\n')
print('TF-IDF에서 단어의 중요도(0이 아닌 것만 출력)')
print(first_tfidf_row)


In [None]:
# Logistic regression: try every (C, max_iter) combination in `params`, score each
# on the validation split, and remember the combination with the best accuracy.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

params = {'C': [1], 'max_iter': [100]}
best_params = {}
best_score = 0

candidates = [(C, max_iter) for C in params['C'] for max_iter in params['max_iter']]
separator = "=" * 30

for C, max_iter in candidates:
    lr = LogisticRegression(C=C, max_iter=max_iter, random_state=1353)
    lr.fit(x_train_tfidf, train_y)
    pred_y = lr.predict(x_valid_tfidf)
    score = accuracy_score(valid_y, pred_y)

    # Validation report for this candidate.
    print()
    print(separator)
    print(">>>> accuracy-score: {}".format(score))
    print(">>>> precision-score: {}".format(precision_score(valid_y, pred_y)))
    print(">>>> recall-score: {}".format(recall_score(valid_y, pred_y)))
    print(">>>> f1-score: {}".format(f1_score(valid_y, pred_y)))
    print()
    print(">>>> Params: {}".format({ 'C': C, 'max_iter': max_iter}))
    print(separator)
    print()

    # Only a strictly better accuracy replaces the current best.
    if score <= best_score:
        continue
    best_score = score
    best_params['C'] = C
    best_params['max_iter'] = max_iter
    print(">>>> Best Score Update: {}".format(best_score))
    print(">>>> Best Params Update: {}".format(best_params))


print(separator)
print(">>>> Best Score: {}".format(best_score))
print(">>>> Best Params: {}".format(best_params))




In [None]:
# Refit with the best parameters found on the validation split and report the
# metrics on the held-out test split; the cell's output is the confusion matrix.
from sklearn.metrics import confusion_matrix

lr = LogisticRegression(random_state=1353, **best_params)
lr.fit(x_train_tfidf, train_y)
pred_y = lr.predict(x_test_tfidf)

test_accuracy = accuracy_score(test_y, pred_y)
test_precision = precision_score(test_y, pred_y)
test_recall = recall_score(test_y, pred_y)
test_f1 = f1_score(test_y, pred_y)

print(">>>> accuracy-score: {}".format(test_accuracy))
print(">>>> precision-score: {}".format(test_precision))
print(">>>> recall-score: {}".format(test_recall))
print(">>>> f1-score: {}".format(test_f1))

confusion_matrix(test_y, pred_y)

In [None]:
# Classify one hand-written review with the fitted pipeline
# (count vectorizer -> TF-IDF -> logistic regression) and report the probability
# of the predicted class.
text = '내가 웬만하면 리뷰 안쓰려했는데 이건 좀 선넘은거 아닌가요? 진짜 열받네 ㄹㅇ'
text_cv = cv.transform([text])
text_tfidf = tfidf_transformer.transform(text_cv)
pred = lr.predict(text_tfidf)[0]
predict_proba = lr.predict_proba(text_tfidf)[0]

if pred == 0:
    message = f"{round(predict_proba[0], 2)} 확률로 부정 리뷰입니다."
else:
    message = f"{round(predict_proba[1], 2)} 확률로 긍정 리뷰입니다."
print(message)

In [None]:
!pip install seaborn matplotlib wordcloud


In [None]:

!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf


In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from wordcloud import WordCloud

# Render the minus sign with an ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False


# Basic statistics of the DataFrame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
reviews.info()

# Distribution of Ratings
plt.figure(figsize=(8, 5))
sns.countplot(x='rating', data=reviews)
plt.title('Distribution of Ratings')
plt.show()

# # Word Cloud for Positive and Negative Reviews
# positive_reviews = ' '.join(reviews[reviews['sentiment'] == 1]['content'])
# negative_reviews = ' '.join(reviews[reviews['sentiment'] == 0]['content'])

# # Word Cloud for Positive Reviews
# plt.figure(figsize=(12, 6))
# wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_reviews)
# plt.imshow(wordcloud_positive, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud for Positive Reviews')
# plt.show()

# # Word Cloud for Negative Reviews
# plt.figure(figsize=(12, 6))
# wordcloud_negative = WordCloud(width=800, height=400, background_color='white').generate(negative_reviews)
# plt.imshow(wordcloud_negative, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud for Negative Reviews')
# plt.show()

# Histogram of review lengths, measured as the number of characters in `content`.
reviews['review_length'] = reviews['content'].map(len)

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(reviews['review_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Lengths')
ax.set_xlabel('Review Length')
ax.set_ylabel('Count')
plt.show()
