In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

# 데이터 로드
df = pd.read_csv('wine_review.csv')

# 데이터 탐색
print(df.head())
print(df.info())
print(df.describe())

                     id asins            brand  \
0  AV13ClKCGV-KLJ3akN68   NaN            Gallo   
1  AV13CsvW-jtxr-f38AQO   NaN  Fresh Craft Co.   
2  AV13CVI_glJLPUi8O7Po   NaN     1000 Stories   
3  AV13CVI_glJLPUi8O7Po   NaN     1000 Stories   
4  AV13CYL4-jtxr-f37_-t   NaN     Wine Cube153   

                                          categories             dateAdded  \
0  Food & Beverage,Beverages,Wine, Beer & Liquor,...  2017-07-24T23:59:11Z   
1  Food & Beverage,Beverages,Wine, Beer & Liquor,...  2017-07-24T23:59:42Z   
2  Food & Beverage,Beverages,Wine, Beer & Liquor,...  2017-07-24T23:58:05Z   
3  Food & Beverage,Beverages,Wine, Beer & Liquor,...  2017-07-24T23:58:05Z   
4  Food & Beverage,Beverages,Wine, Beer & Liquor,...  2017-07-24T23:58:18Z   

            dateUpdated                                       descriptions  \
0  2018-01-10T18:06:28Z                                                NaN   
1  2018-01-10T05:38:33Z  [{"dateSeen":["2017-12-21T05:43:00.000Z","2017...

리뷰 텍스트와 평점이 있는 열만 사용하고, 결측값을 처리한다.
 데이터 정리 (노이즈 제거, 결측값 처리)

In [19]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# NLTK 데이터 다운로드
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 데이터 불러오기
df = pd.read_csv('wine_review.csv')

# 필요한 열만 선택
df = df[['reviews.text', 'reviews.rating']]

# 결측치 제거
df.dropna(subset=['reviews.text', 'reviews.rating'], inplace=True)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


텍스트 토큰화 및 불용어 제거, 텍스트 정규화 (스테밍 또는 표제어 추출)

In [22]:
# 불용어 설정 및 표제어 추출기 초기화
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # 소문자 변환
    text = text.lower()
    
    # 숫자 제거
    text = re.sub(r'[0-9]', '', text)
    
    # 구두점 제거
    text = re.sub(r'[^\w\s]', '', text)
    
    # 단어 토큰화
    tokens = word_tokenize(text)
    
    # 불용어 제거 및 표제어 추출
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # 공백으로 구분된 문자열로 재결합
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# 'reviews.text' 열에 대해 전처리 함수 적용하여 'processed_text' 열 생성
df['processed_text'] = df['reviews.text'].apply(preprocess_text)

# 전처리된 데이터를 cleaned_wine_review.csv 파일로 저장
df.to_csv('cleaned_wine_review.csv', index=False)

# 전처리된 데이터 확인
print(df[['reviews.text', 'processed_text']].head())

                                        reviews.text  \
0      This a fantastic white wine for any occasion!   
1   Tart, not sweet...very refreshing and delicious!   
2  I was given this wine so it was a delightful s...   
3  This is a phenomenal wine and my new favorite ...   
4  4 750ml bottles for the price of two With way ...   

                                      processed_text  
0                      fantastic white wine occasion  
1                tart sweetvery refreshing delicious  
2  given wine delightful surprise find flavorful ...  
3                   phenomenal wine new favorite red  
4  ml bottle price two way le packaging yes pleas...  
