In [None]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import string

# 모델 1

In [None]:
# Load the wine table for model 1; the path is relative to the working directory.
df = pd.read_csv("preprocessing_data/wine_newname_final.csv")

In [None]:
# Preview the first rows of the loaded table.
df.head()

### 불용어 제거

In [None]:
nltk.download('stopwords')  # Fetch the NLTK stopword corpus
# Rebinds the name `stopwords` (imported above as the nltk.corpus module) to the
# plain list of English stopwords; later cells use it as that list.
stopwords = nltk.corpus.stopwords.words('english')

### 와인 이름 받아서 index 추출 => 제일 앞에 있는 와인 추출

In [None]:
# Ask the user for the name of the reference wine.
wine_name = input("와인 이름을 입력해주세요: ")

In [None]:
# Capitalize every word of the query before matching it against df['name'].
new_name = string.capwords(wine_name)
new_name

In [None]:
# Show every wine whose name contains the capitalized query.
# regex=False matches the user's text literally (characters such as "(" or "+"
# would otherwise be parsed as a regular expression); na=False treats missing
# names as non-matches.
df[df['name'].str.contains(new_name, regex=False, na=False)]

In [None]:
# Find the index of the first wine whose name contains the query.
# The query is matched literally (regex=False) and missing names never match.
matches = df[df['name'].str.contains(new_name, regex=False, na=False)]
if matches.empty:
    # Fail with a readable message instead of an IndexError from index[0].
    raise ValueError(f"No wine name contains {new_name!r}")
wine_index = matches.index[0]
wine_index

In [None]:
# Same lookup using the raw, un-capitalized input.
# regex=False matches the text literally; na=False treats missing names as
# non-matches.
df[df['name'].str.contains(wine_name, regex=False, na=False)]

### 코사인유사도 계산 (리뷰, 음식)

In [None]:
def cosine_func(col_name,wine_index):
    """Add a TF-IDF cosine-similarity column to the global ``df``.

    Every entry of ``df[col_name]`` is reduced to lowercase English letters,
    tokenized with NLTK, stripped of the words in the global ``stopwords``
    list and vectorized with TF-IDF. The similarity between the row at
    ``wine_index`` and every row is stored in ``df[f'{col_name}_result']``.

    Args:
        col_name: Name of a text column of ``df``.
        wine_index: Row of the reference wine; it is used as a row position
            in the TF-IDF matrix.

    Returns:
        The global ``df``, which now carries the new ``*_result`` column.
    """
    # Keep English letters only; missing values are treated as empty text
    # instead of crashing re.sub.
    only_english = [
        re.sub('[^a-zA-Z]', ' ', '' if pd.isna(sentence) else str(sentence)).lower()
        for sentence in df[col_name]
    ]
    # Tokenize each document.
    col_tokenized = [nltk.word_tokenize(item) for item in only_english]
    # Remove stopwords token by token. Comparing a whole token list against
    # the stopword list (as before) never matched, so nothing was removed.
    no_stopwords = [
        [token for token in tokens if token not in stopwords]
        for tokens in col_tokenized
    ]
    # Join the remaining tokens back into one string per document.
    final_review = [' '.join(tokens) for tokens in no_stopwords]

    # TF-IDF matrix with one row per document.
    tfidf_vect = TfidfVectorizer()
    feature_vect = tfidf_vect.fit_transform(final_review)

    # Cosine similarity between the selected wine and every wine.
    similarity_simple_pair = cosine_similarity(feature_vect[wine_index], feature_vect)
    result_list = similarity_simple_pair.tolist()[0]
    df[f'{col_name}_result'] = result_list

    return df

In [None]:
# Add a similarity column to the dataframe for each text column,
# measured against the wine selected above.
my_col = ['foods','re']
for target_col in my_col:
    cosine_func(target_col,wine_index)

In [None]:
# Preview the table, which now includes the *_result similarity columns.
df.head()

In [None]:
# Names of all float64/int64 columns, leaving out the wine_id column.
numeric_kinds = ('float64', 'int64')
result_arr = [
    column
    for column in df.columns.values
    if df[column].dtype in numeric_kinds and column != 'wine_id'
]

In [None]:
# Drop the first three numeric column names from the list.
# NOTE(review): result_arr is rebuilt from itself, so running this cell a second
# time drops three more names; re-run the previous cell first.
result_arr = result_arr[3:]
result_arr

In [None]:
# Independent copy of the selected numeric columns, used for the weighted score.
df_result_weight = df[result_arr].copy()

In [None]:
# Preview the columns that feed the weighted score.
df_result_weight.head()

#### 가중치 받기

In [None]:
# Read one weight per criterion: price, rating, alcohol, food pairing, review.
# The prompt asks for weights that sum to 1, but nothing here checks that.
price_w = float(input("가중치를 입력해주세요.(합 1) \n가격 : "))
score_w = float(input("평점 : "))
alcohol_w = float(input("도수 : "))
food_w = float(input("어울리는 음식: "))
review_w = float(input("리뷰: "))

In [None]:
# high == 1, low == 2

# Preference direction for alcohol and price, entered as the integer code above.
how_alcohol = int(input("도수 선호도(high:1 low:2) : "))
how_price = int(input("가격 선호도(high:1 low:2) : "))

#### 받은 가중치로 계산

In [None]:
# Weighted score: pick the price/alcohol column that matches the user's
# preference (code 1 -> regular column, anything else -> the rev_ column),
# then accumulate column * weight in a fixed order.
price_col = 's_price' if how_price == 1 else 'rev_s_price'
alcohol_col = 's_alcohol' if how_alcohol == 1 else 'rev_s_alcohol'

weighted_terms = [
    ('s_score', score_w),
    (alcohol_col, alcohol_w),
    ('foods_result', food_w),
    ('re_result', review_w),
]

df['weight'] = df_result_weight[price_col]*price_w
for term_col, term_w in weighted_terms:
    df['weight'] += df_result_weight[term_col]*term_w

In [None]:
# Show the combined score of every wine.
df['weight']

#### 상위 6개 추천

In [None]:
# Rank the wines from highest to lowest combined score.
df_sorted_by_values = df.sort_values(by='weight' ,ascending=False)

In [None]:
# Keep the six best-ranked rows as the recommendation.
result = df_sorted_by_values[:6]

In [None]:
# Display the recommended wines.
result

# 모델 2

In [None]:
# Load the table for model 2, using the first CSV column as the index.
# This rebinds df, so the model 1 frame is no longer reachable under that name.
df = pd.read_csv("minmaxscaling_wine.csv",index_col=[0])

In [None]:
# The 15 most frequent values of the country column, most common first.
country_counts = df['country'].value_counts()
country_select = country_counts.head(15).index.tolist()
country_select

In [None]:
# Start from the 15 most frequent values of the grapes column.
grape_select = df['grapes'].value_counts().head(15).keys().to_list()

vivino_grapes = ['Cabernet Sauvignon', 'Merlot', 'Chardonnay', 'Pinot Noir', 'Malbec',
'Sauvignon Blanc', 'Shiraz/Syrah', 'Zinfandel', 'Nebbiolo', 'Sangiovese',
'Pinot Grigio', 'Riesling', 'Chenin Blanc', 'Moscato', 'Albarino']
# Append each listed variety that is not in the selection yet, keeping list order.
for grape in vivino_grapes:
    if grape not in grape_select:
        grape_select.append(grape)
grape_select

In [None]:
# Number of grape choices after merging the two lists.
len(grape_select)

In [None]:
# Lower bound of the alcohol filter, read as a float.
alcohol_min = float(input("알코올 도수 최솟값을 입력해주세요!! : "))

In [None]:
# Upper bound of the alcohol filter, read as a float.
alcohol_max = float(input("알코올 도수 최댓값을 입력해주세요!! : "))

In [None]:
# Lower bound of the price filter, read as a float.
price_min = float(input("가격 최솟값을 입력해주세요!! : "))

In [None]:
# Upper bound of the price filter, read as a float.
price_max = float(input("가격 최댓값을 입력해주세요!! : "))

In [None]:
# Free-text food query; compared with the 'foods' column further down.
food_name = input("같이 먹을 음식을 입력해주세요!! : ")

In [None]:
# Free-text keyword query; compared with the 're' column further down.
review_tagname = input("이번 파티의 핵심 단어를 입력해주세요!! : ")

In [None]:
# Grape variety; used below for an exact match on df['grapes'].
grape_name = input("포도품종을 입력해주세요!! : ")

In [None]:
# Preferred country; used below for an exact match on df['country'].
country_name = input("선호하는 나라가 있나요 : ")

In [None]:
# Keep wines whose alcohol and price fall inside the requested ranges
# (both ends inclusive).
df = df[df['alcohol'].between(alcohol_min, alcohol_max)]
df = df[df['price'].between(price_min, price_max)]

In [None]:
# Keep only wines whose grape and country exactly equal the user's choices.
# .copy() makes the result an independent frame, so the columns added to df
# later (similarity scores, weight) are not written to a slice of another
# frame and do not raise SettingWithCopyWarning.
df = df[(df['grapes'] == grape_name) & (df['country'] == country_name)].copy()

In [None]:
# Preview the wines that passed every filter.
df.head()

### 불용어 제거

In [None]:
nltk.download('stopwords') # Fetch the NLTK stopword corpus
# Same stopword setup as in the model 1 section: `stopwords` becomes the plain
# list of English stopwords.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def cosine_func(col_name,target_data):
    """Score every row of the global ``df`` against a free-text query.

    The entries of ``df[col_name]`` and the query ``target_data`` are reduced
    to lowercase English letters, tokenized with NLTK and stripped of the
    words in the global ``stopwords`` list. A TF-IDF model is fitted on the
    column, the query is projected into the same space as one document, and
    the cosine similarity to each row is stored in
    ``df[f'{col_name}_result']``.

    Args:
        col_name: Name of a text column of ``df``.
        target_data: The user's query string.

    Returns:
        The global ``df``, which now carries the new ``*_result`` column.
    """
    # Keep English letters only; missing values are treated as empty text
    # instead of crashing re.sub.
    only_english1 = [
        re.sub('[^a-zA-Z]', ' ', '' if pd.isna(sentence) else str(sentence)).lower()
        for sentence in df[col_name]
    ]
    only_english2 = re.sub('[^a-zA-Z]', ' ', target_data).lower()
    # Tokenize the documents and the query.
    col_tokenized1 = [nltk.word_tokenize(item) for item in only_english1]
    col_tokenized2 = nltk.word_tokenize(only_english2)
    # Remove stopwords token by token. Comparing a whole token list against
    # the stopword list (as before) never matched, so nothing was removed.
    no_stopwords1 = [
        [token for token in tokens if token not in stopwords]
        for tokens in col_tokenized1
    ]
    no_stopwords2 = [token for token in col_tokenized2 if token not in stopwords]
    # Join tokens back into strings. The query must become ONE document:
    # joining each token on its own splits it into single letters, which the
    # vectorizer's default token pattern discards, leaving every score at 0.
    final_review1 = [' '.join(tokens) for tokens in no_stopwords1]
    final_review2 = [' '.join(no_stopwords2)]

    # Fit TF-IDF on the column and project the query into the same space.
    tfidf_vect = TfidfVectorizer()
    feature_vect = tfidf_vect.fit_transform(final_review1)
    my_vect = tfidf_vect.transform(final_review2)

    # Cosine similarity between the query (one row) and every wine.
    similarity_simple_pair = cosine_similarity(my_vect, feature_vect)
    result_list = similarity_simple_pair.tolist()[0]
    df[f'{col_name}_result'] = result_list

    return df

In [None]:
# Add a similarity column to the dataframe for each text column,
# pairing 'foods' with the food query and 're' with the keyword query.
my_col = ['foods','re']
my_stan = [food_name, review_tagname]
for target_col, target_data in zip(my_col, my_stan):
    cosine_func(target_col,target_data)

In [None]:
# Names of all float64/int64 columns, leaving out the wine_id column.
wanted_dtypes = ('float64', 'int64')
result_arr = [
    column
    for column in df.columns.values
    if df[column].dtype in wanted_dtypes and column != 'wine_id'
]

In [None]:
# Show the selected column names; the weighting cell below addresses them by position.
result_arr

In [None]:
# Work on an independent copy of the numeric columns and preview it.
df_result_weight = df[result_arr].copy()
df_result_weight.head()

#### 가중치 받기

In [None]:
# Integer priority codes for rating, food pairing and review.
# weight_cal turns 1 into 2.0, 2 into 1.5 and any other value into 1.
num1 = int(input("평점 : "))
num2 = int(input("같이 먹을 음식 : "))
num3 = int(input("리뷰 : "))

In [None]:
def weight_cal(num):
    """Translate a priority code into its multiplier.

    Code 1 maps to 2.0, code 2 maps to 1.5, and every other value maps to 1.
    """
    if num == 1:
        return 2.0
    if num == 2:
        return 1.5
    return 1

#### 받은 가중치로 계산

In [None]:
# Add weights => the weights of features without a user-supplied priority
# have not been decided yet!
# Columns are addressed by position in df_result_weight, so the result depends
# on the column order produced when result_arr was built.
# NOTE(review): presumably position 1 is the rating and positions 4/5 are the
# foods/re similarity columns (they receive num1, num2, num3) — confirm against
# the displayed result_arr.

# Position 0 is multiplied by 0; it only creates the column.
df['weight'] = df_result_weight.iloc[:,0]*0
df['weight'] += df_result_weight.iloc[:,1]*weight_cal(num1)
# Positions 2 and 3 get a fixed weight of 0.5.
df['weight'] += df_result_weight.iloc[:,2]*0.5
df['weight'] += df_result_weight.iloc[:,3]*0.5
df['weight'] += df_result_weight.iloc[:,4]*weight_cal(num2)
df['weight'] += df_result_weight.iloc[:,5]*weight_cal(num3)

In [None]:
# Show the combined score of every remaining wine.
df['weight']

In [None]:
# Rank the remaining wines from highest to lowest combined score.
df_sorted_by_values = df.sort_values(by='weight' ,ascending=False)

In [None]:
# Display the five best-ranked wines.
df_sorted_by_values[:5]