In [1]:
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from konlpy.tag import Okt
from typing import List
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

In [63]:
okt = Okt()
df = pd.read_csv('./travel_spot_v1.csv')
df['tf'] = df['tagName'] + df['treatMenu']

In [64]:
spot_df = df[df['contentType'] != 39]
food_df = df[df['contentType'] == 39]

In [65]:
len(spot_df), len(food_df), len(df)

(4634, 5883, 10517)

In [4]:
vectorizer_spot = TfidfVectorizer()

tag_list = [' '.join([j for j in okt.morphs(i) if len(j) > 1]) for i in spot_df['tagName'].tolist()]
matrix_spot = vectorizer_spot.fit_transform(tag_list)

# 각 지역에 대한 TF-IDF 행렬 계산
city_matrices_spot = {}    
for city in spot_df['city'].unique():
    city_df = spot_df[spot_df['city'] == city]
    city_tag_list = [' '.join([j for j in okt.morphs(i) if len(j) > 1]) for i in city_df['tagName'].tolist()]
    city_matrices_spot[city] = vectorizer_spot.transform(city_tag_list)

In [5]:
vectorizer_food = TfidfVectorizer()

tag_list = [' '.join([j for j in okt.morphs(str(i)) if len(j) > 1]) for i in food_df['tf'].tolist()]
matrix_food = vectorizer_food.fit_transform(tag_list)

# 각 지역에 대한 TF-IDF 행렬 계산
city_matrices_food = {}    
for city in food_df['city'].unique():
    city_df = food_df[food_df['city'] == city]
    city_tag_list = [' '.join([j for j in okt.morphs(str(i)) if len(j) > 1]) for i in city_df['tf'].tolist()]
    city_matrices_food[city] = vectorizer_food.transform(city_tag_list)

In [6]:

# 저장
with open('travel_model_v2.pkl', 'wb') as file:
        pickle.dump({"spot":{
                "vectorizer": vectorizer_spot,
                "matrix": matrix_spot,
                "city_matrices": city_matrices_spot
        },"food":{
                "vectorizer": vectorizer_food,
                "matrix": matrix_food,
                "city_matrices": city_matrices_food
        }}, file)

In [7]:
with open('./travel_model_v2.pkl', 'rb') as file:
    loaded_data = pickle.load(file)
    
    vectorizer_spot = loaded_data["spot"]["vectorizer"]
    matrix_spot = loaded_data["spot"]["matrix"]
    city_matrices_spot = loaded_data["spot"]["city_matrices"]

    vectorizer_food = loaded_data["food"]["vectorizer"]
    matrix_food = loaded_data["food"]["matrix"]
    city_matrices_food = loaded_data["food"]["city_matrices"]

In [None]:
#     # 미리 계산된 해당 지역의 TF-IDF 행렬 사용
# if question.area == '전체':
#     city_tfidf_matrix = tfidf_matrix
# else:
#     city_tfidf_matrix = city_tfidf_matrices.get(question.area)
#     df = df[df['city']== f'{question.area}']

In [35]:
city_matrices_spot['충남']

<351x5323 sparse matrix of type '<class 'numpy.float64'>'
	with 5212 stored elements in Compressed Sparse Row format>

In [61]:
area = '인천'
question = '캠핑'

df = spot_df[spot_df['city']== f'{area}']

question_tfidf = vectorizer_spot.transform(okt.morphs(f'{question}'))

cos_similarities = cosine_similarity(question_tfidf, city_matrices_spot[f'{area}'])
sorted_indices = np.argsort(cos_similarities[0])[::-1][:5]
print('indices= ',len(sorted_indices))
print('df= ', len(df))
# 식당인 것, 아닌 

index = sorted_indices[0]



print(
str(df.iloc[index]['id']),
str(df.iloc[index]['city']),
str(df.iloc[index]['title']),
float(cos_similarities[0][index]),
str(df.iloc[index]['catchtitle']))



indices=  5
df=  156
2768200 인천 노가리해수욕장 0.20604354572803404 캠핑하며 차박하기 좋은 해변


In [70]:
area = '대구'
question = '피자'

df = food_df[food_df['city']== f'{area}']

question_tfidf = vectorizer_food.transform(okt.morphs(f'{question}'))

cos_similarities = cosine_similarity(question_tfidf, city_matrices_food[f'{area}'])
sorted_indices = np.argsort(cos_similarities[0])[::-1][:5]
print('indices= ',len(sorted_indices))
print('df= ', len(df))
# 식당인 것, 아닌 

index = sorted_indices[0]



print(
str(df.iloc[index]['id']),
str(df.iloc[index]['city']),
str(df.iloc[index]['title']),
float(cos_similarities[0][index]),
str(df.iloc[index]['catchtitle']))



indices=  5
df=  192
2866244 대구 루미너스 0.42204924949455175 모던한 분위기의 한우 안심 스테이크 맛집


In [14]:
np.argsort(cos_similarities[0])[::-1][0]

63

In [15]:
cos_similarities[0][63]

0.30805653331899097

In [27]:
df = pd.read_csv('./travel_spot_v1.csv', index_col=0)

In [29]:
okt = Okt()

In [35]:
question_area = '인천'
question_question = '효도여행'

# 미리 계산된 해당 지역의 TF-IDF 행렬 사용
if question_area == '전체':
    city_tfidf_matrix = tfidf_matrix
else:
    city_tfidf_matrix = city_tfidf_matrices.get(question_area)
    df = df[df['city']== f'{question_area}']

print('여기여기')

if city_tfidf_matrix is None:
    print(JSONResponse(content={"error": f"No data found for city: {question_area}"}, status_code=404))

# 질문과 선택된 지역의 TF-IDF로 유사도 계산
question_tfidf = tfidf_vectorizer.transform(okt.morphs(question_question))
cos_similarities = cosine_similarity(question_tfidf, city_tfidf_matrix)
sorted_indices = np.argsort(cos_similarities[0])[::-1]
len(sorted_indices)

여기여기


352

In [81]:
restaurant = sorted_indices[df.iloc[sorted_indices]['contentType'].eq(39)].tolist()[:5]
non_restaurant = sorted_indices[~df.iloc[sorted_indices]['contentType'].eq(39)].tolist()[:5]

restaurant,non_restaurant

([111, 112, 113, 114, 115], [351, 350, 296, 288, 289])

In [62]:
len(df), len(sorted_indices)

(352, 352)

In [60]:
len(sorted_indices)

352

In [52]:

print('여기여기2')

from itertools import islice
# 식당인 것, 아닌 것
restaurant = islice((index for index in sorted_indices if df.iloc[index]['contentType'] == 39 ), 5)
non_restaurant = islice((index for index in sorted_indices if df.iloc[index]['contentType'] != 39 ), 5)

list(restaurant), list(non_restaurant)

여기여기2


([111, 112, 113, 114, 115], [351, 350, 296, 288, 289])

In [51]:
str(df.iloc[111]['city'])

'인천'

In [55]:
str(df.iloc[111]['id'])

'2758144'

In [53]:
def add_to_similar_tags(sorted_indices,similar_tags):
    bag = []
    for index in sorted_indices:
        print("Current index:", index)
        bag.append({
            "id": str(df.iloc[index]['id']),
            "area": str(df.iloc[index]['city'])})
        similar_tags.append(bag)

In [78]:
restaurant


<itertools.islice at 0x7f1b75d94950>

In [57]:

# 결과를 저장할 리스트 초기화
similar_tags = []
print('여기여기 3')
# 리스트로 추가
def add_to_similar_tags(sorted_indices,similar_tags):
    bag = []
    for index in sorted_indices:
        print("Current index:", index)
        bag.append({
            "id": str(df.iloc[index]['id']),
            "area": str(df.iloc[index]['city']),
            "title": str(df.iloc[index]['title']),
            "similarity": float(cos_similarities[0][index]),
            "catchtitle": str(df.iloc[index]['catchtitle']),
            # "detail": str(df.iloc[index]['detail']),
            "treatMenu": str(df.iloc[index]['treatMenu']),
            "tagName": str(df.iloc[index]['tagName']),
            "addr": str(df.iloc[index]['addr']),
            "info": str(df.iloc[index]['info']),
            # "lat": str(df.iloc[index]['parking']),
            "useTime": str(df.iloc[index]['useTime']),
            "conLike": str(df.iloc[index]['conLike']),
            "conRead": str(df.iloc[index]['conRead']),
            "conShare": str(df.iloc[index]['conShare']),
            # "overView": str(df.iloc[index]['overView']),
            "lat": str(df.iloc[index]['lat']),
            "lon": str(df.iloc[index]['lon'])
            
            })
    similar_tags.append(bag)
print('여기여기4')

# 함수를 사용
add_to_similar_tags(non_restaurant,similar_tags)
add_to_similar_tags(restaurant,similar_tags)
similar_tags

여기여기 3
여기여기4


[[], []]

In [76]:
sorted_indices.__len__()

352

In [73]:
# 결과를 저장할 리스트 초기화
similar_tags = []

def add_to_similar_tags(sorted_indices):
    bag = []
    sorted_indices = list(sorted_indices)[:len(df)]
    print("sorted_indices length:", len(sorted_indices))
    for index in sorted_indices:
        print("Processing index:", index)
        if 0 <= index < len(df):
            print("Adding to bag:", index)
            bag.append({
                "id": str(df.iloc[index]['id']),
                "area": str(df.iloc[index]['city']),
                "title": str(df.iloc[index]['title']),
                "similarity": float(cos_similarities[0][index]),
                "catchtitle": str(df.iloc[index]['catchtitle']),
                "treatMenu": str(df.iloc[index]['treatMenu']),
                "tagName": str(df.iloc[index]['tagName']),
                "addr": str(df.iloc[index]['addr']),
                "info": str(df.iloc[index]['info']),
                "useTime": str(df.iloc[index]['useTime']),
                "conLike": str(df.iloc[index]['conLike']),
                "conRead": str(df.iloc[index]['conRead']),
                "conShare": str(df.iloc[index]['conShare']),
                "lat": str(df.iloc[index]['lat']),
                "lon": str(df.iloc[index]['lon'])
            })
        else:
            print("Index out of bounds:", index)
    print("Adding bag to similar_tags:", bag)
    similar_tags.append(bag)

# 함수를 사용
add_to_similar_tags(non_restaurant)
add_to_similar_tags(restaurant)

print("Final similar_tags:", similar_tags)


sorted_indices length: 0
Adding bag to similar_tags: []
sorted_indices length: 0
Adding bag to similar_tags: []
Final similar_tags: [[], []]
