In [1]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import BertModel, BertTokenizer
import numpy as np
from keybert import KeyBERT
from transformers import BertModel
import torch
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from kobert_transformers import get_tokenizer
import re
from collections import Counter
from ITglossary import ITGlossary, ITGlossaryUpdater
import pymysql
import pandas as pd
from keybert import KeyBERT
from kiwipiepy import Kiwi
import urllib.request
from soyspacing.countbase import RuleDict, CountSpace
import json
from decimal import Decimal
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware


app = FastAPI()



  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'keybert'

In [None]:
# CORS 설정
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000", "https://ohmystack.co"],  # 클라이언트의 도메인 주소를 여기에 추가
    allow_methods=["POST"],  # 요청 허용 메서드
    allow_headers=["*"],  # 요청 허용 헤더
)

# Load the job data
conn = pymysql.connect(host="database-1.cb6dvhektjhd.ap-northeast-2.rds.amazonaws.com", user='gihun', password='EM7E7e', db='production')



In [None]:
@app.on_event("startup")
def load_data():
    global df
    global glossary
    global kw_model

    query = "SELECT * FROM production.cosine"
    with conn.cursor() as cursor:
        df = pd.read_sql_query(query, conn)

    glossary = ITGlossary()
    updater = ITGlossaryUpdater(glossary)
    updater.update_glossary()
    glossary.print_glossary()

    model = BertModel.from_pretrained('skt/kobert-base-v1')
    kw_model = KeyBERT(model)



In [None]:
@app.on_event("shutdown")
def close_connection():
    conn.close()



In [None]:
# Define request/response models
class JobRecommendationRequest(BaseModel):
    self_intr: str

class JobRecommendationResponse(BaseModel):
    job_recommendations: list

# Define utility functions
def clean_text(input_text):
    if not isinstance(input_text, str):
        return input_text
    input_text = input_text.lower()
    # Remove special characters
    cleaned_text = re.sub(r'[^\w\s]', '', input_text)
    cleaned_text = re.sub(r'\([^)]*\)', '', cleaned_text)
    cleaned_text = re.sub(r'\r', '', cleaned_text)
    # Remove newline characters
    cleaned_text = cleaned_text.replace('\n', '')
    # Remove HTML tags
    cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)
    # Remove "www" or "www~" from the text
    cleaned_text = re.sub(r'www~?.(\w+.)+\w+', '', cleaned_text)
    # Remove "http~" from the text
    cleaned_text = re.sub(r'http~?', '', cleaned_text)
    return cleaned_text

def remove_english(text):
    pattern = re.compile(r'[a-zA-Z]+')
    text_without_english = re.sub(pattern, '', text)
    return text_without_english

def translate_IT(input_text, it_glossary):
    input_text = str(input_text)
    words = re.findall(r'\b[a-zA-Z]+\b', input_text)
    for word in words:
        try:
            input_text = input_text.replace(word, it_glossary[word])
        except:
            pass
        input_text = input_text.replace('[', '')
        input_text = input_text.replace(']', '')
    return input_text

def get_embedding(kw_model, keywords, self_intr):
    tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
    embeddings = []  # 임베딩 리스트
    weights = []  # 가중치 리스트
    for keyword in keywords:
        if len(keyword) > 1:  # 키워드의 길이가 1보다 큰지 확인
            weights.append(keyword[1])  # 가중치를 리스트에서 가져옴
            with torch.no_grad():
                model = BertModel.from_pretrained('monologg/kobert')
                input_ids = tokenizer.encode(keyword[0], add_special_tokens=True)
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                last_hidden_states = model(input_ids)[0]
                embedding = last_hidden_states[0].mean(dim=0).numpy()
                embeddings.append(embedding)
    if not weights or all(weight == 0 for weight in weights):
        print("가중치 Null값이 발생하였습니다.")
        return None
    return np.average(embeddings, axis=0, weights=weights)

class NumpyJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.float16, np.float32, np.float64)):
            if np.isnan(obj) or np.isinf(obj):
                return str(obj)
            return float(obj)
        if isinstance(obj, (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)):
            return int(obj)
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)




In [None]:
@app.post("/job_recommendation", response_model=JobRecommendationResponse)
def job_recommendation(request: JobRecommendationRequest):
    self_intr = request.self_intr
    self_intr = translate_IT(clean_text(self_intr), glossary)
    self_intr = remove_english(self_intr)
    self_intr = clean_text(self_intr)
    
    keywords = kw_model.extract_keywords(self_intr, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)
    embedding = get_embedding(kw_model, keywords, self_intr)
    embedding2 = []

    for s in df["embedding"]:
        if s is None:
            embedding2.append(np.zeros((768,), dtype=float).tolist())
            continue

        s = s.replace('\n', '').replace('[', '').replace(']', '')
        s = re.split(r'\s+', s.strip())

        if len(s) == 0:
            embedding2.append(np.zeros((768,), dtype=float).tolist())
            continue

        arr = np.array(s, dtype=float)
        arr[np.isnan(arr)] = 0  # NaN 값을 0으로 대체
        arr[np.isinf(arr)] = 0  # Infinity 값을 0으로 대체
        arr = arr.reshape((768,))

        embedding2.append(arr.tolist())


    cos_sim = [cosine_similarity([embedding], [emb])[0][0] for emb in embedding2]
    cos_sim = np.array(cos_sim)
    cos_sim[np.isnan(cos_sim)] = 0

    top_indices = np.argpartition(cos_sim, -10)[-10:]
    job_recommendations = df.iloc[top_indices].to_dict(orient='records')

    response_content = json.dumps({"job_recommendations": job_recommendations}, cls=NumpyJSONEncoder)
    return JSONResponse(content=response_content, media_type="application/json")
