## Import

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
assert gensim.models.word2vec.FAST_VERSION > -1
from gensim.models import Word2Vec
import datetime
import requests
import re
from bs4 import BeautifulSoup

## Data 준비하기

In [50]:
## < Data 준비하기 >
# 데이터 읽기
df_scholarship = pd.read_csv('scholarship.csv') # 장학금 데이터
df_contest = pd.read_csv('contest.csv') # 공모전 데이터

df_scholarship.head(3)

Unnamed: 0.1,Unnamed: 0,인덱스,재단,장학종류,근로장학,기숙사/주거지원,기타장학,대출지원,연수/연구장학,장학금,...,대학원생,전문대생,중학생,청년,초등학생,선발 인원(명),선발인원규모,장학혜택(만원),장학혜택규모,조회 수
0,0,5858,충청남도인재육성재단,"장학금, 기타장학",0,0,1,0,0,1,...,0,0,1,0,1,14,소,100,소,187
1,1,5859,당진시,대출지원,0,0,0,1,0,0,...,0,1,0,0,0,0,인원미정,0,금액미정,82
2,2,5860,안산시,장학금,0,0,0,0,0,1,...,0,1,0,0,0,0,인원미정,100,소,213


In [57]:
## < train set, test set 만들기 >
df_scholarship_train, df_scholarship_test = train_test_split(df_scholarship, random_state = 15688, test_size=0.30)
# df_contest_train, df_contest_test = train_test_split(df_contest, random_state = 15688, test_size=0.30)

# model input 형태: 리스트 안의 리스트 ([[], []] 형태)
scholarship_ls = df_scholarship_train.values.tolist()

## Model 생성 및 학습

In [60]:
# 모델 생성
model = Word2Vec(sentences = scholarship_ls, # 전처리된 리스트를 파라미터로 사용
                 epochs = 5, # epoch
                 min_count = 2, # 2회 이상 나타나야 유지
                 vector_size = 100, # hidden layer 사이즈
                 sg = 1, # skip-gram을 사용하면 1
                 hs = 0, # negative sampling을 사용하면 0
                 negative = 5, # 0 이상일 경우 negative sampling을 사용함.
                 window = 200)

model.save('item2vec_20230407')


# < 모델 불러오기 >
# warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

# model = Word2Vec.load('item2vec_20230407')
# word_vectors = model.wv

In [75]:
print(list(model.wv.index_to_key))

[0, 1, '소', '인원미정', '장학금', '금액미정', '중', '대학생', 100, '기타장학', '대', '대학생, 전문대생', '기타/일반', '기숙사/주거지원', '대학생, 대학원생', 10, '대출지원', 150, 2, 200, 20, 300, 400, 500, 50, '고등학생, 대학생', 49, '대학생, 전문대생, 대학원생', '고양시', '장학금, 기타장학', '청년, 기타/일반', 35, 86, '대학원생', 14, '충청남도인재육성재단', 187, 70, '경기도', 5, '한국장학재단', 74, 85, 90, '초등학생, 중학생, 고등학생', 82, 45, 56, 250, '농어촌희망재단', 4, 8, 18, 3, '근로장학', 15, 800, '한국고등교육재단', '경희대학교 국제캠퍼스', 80, 43, 55, 73, 24, '한국외국어대학교', '청년', 46]


## 추천

In [61]:
def recommender(positive_list=None, negative_list=None, topn=20):
    recommend_scholarship_ls = []
    
    for scholarshipId, prob in model.wv.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn):
        recommend_scholarship_ls.append(scholarshipId)
    return recommend_scholarship_ls

In [78]:
test_ls = df_scholarship_train.values.tolist()
print(test_ls[23])

[35, 5893, '경희대학교 국제캠퍼스', '장학금', 0, 0, 0, 0, 0, 1, 0, '대학생', 0, 0, 1, 0, 0, 0, 0, 0, 0, '인원미정', 0, '금액미정', 192]


In [83]:
# recommender
# user로부터 'UP'이 검색되면 해당 검색어를 크롤링을 통해 dataset의 title값을 받아온다
# 이후 해당 movie title 리스트를 앞에서 인코딩한 name_to_movieId 사전을 통해 movieId로 바꿔준다
# 그리고 앞에서 정의한 모델을 통해 top-n 개의 similar한 데이터를 반환한다

ls = recommender(positive_list=['대출지원'], topn=5)
print(ls)

[0, '중', '대학생', '근로장학', 20]


## 모델 성능 평가

In [None]:
# def user_liked_movies_builder(model, df, for_prediction=False):
#     df['liked'] = np.where(df['rating'] >= 4, 1, 0)
#     df['movieId'] = df['movieId'].astype('str')
#     df_liked = df[df['liked'] == 1]
#     if for_prediction:
#         df_liked = df[df['movieId'].isin(model.wv.vocab.keys())]

#     user_liked_movies = df_liked.groupby('userId').agg({'movieId': lambda x: x.tolist()})['movieId'].to_dict()

#     return user_liked_movies


# def scores_at_m(model, user_liked_movies_test, user_liked_movies_training, topn=10):
#     sum_liked = 0
#     sum_correct = 0
#     sum_total = 0
#     common_users = set(user_liked_movies_test.keys()).intersection(set(user_liked_movies_training.keys()))

#     for userid in common_users:
#         current_test_set = set(user_liked_movies_test[userid])
#         pred = [pred_result[0] for pred_result in
#                 model.wv.most_similar_cosmul(positive=user_liked_movies_training[userid], topn=topn)]
#         sum_correct += len(set(pred).intersection(current_test_set))
#         sum_liked += len(current_test_set)
#     precision_at_m = sum_correct / (topn * len(common_users))
#     recall_at_m = sum_correct / sum_liked
#     f1 = 2 / ((1 / precision_at_m) + (1 / recall_at_m))
#     return [precision_at_m, recall_at_m, f1]

# pd.options.mode.chained_assignment = None
# user_liked_movies_train = user_liked_movies_builder(model, df_ratings_train, for_prediction=True)
# user_liked_movies_test = user_liked_movies_builder(model, df_ratings_test)

# model = Word2Vec.load('item2vec_20230407')
# model_score_sg1 = scores_at_m(model, user_liked_movies_test, user_liked_movies_train)
# del model

# print("Respectively, the [precision, recall, F-1 score] at 10 for our model are:")
# print(model_score_sg1)