# import

In [100]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
assert gensim.models.word2vec.FAST_VERSION > -1
from gensim.models import Word2Vec
import datetime
import requests
import re
from bs4 import BeautifulSoup

# Data 준비하기

In [101]:
df_activity = pd.read_csv('activity_process.csv') # 대외활동 데이터
id_to_name = pd.Series(df_activity.company_name.values, index = df_activity.idx.values)
name_to_id = pd.Series(df_activity.idx.values, index = df_activity.company_name).to_dict()


df_activity.head(5)

Unnamed: 0.1,Unnamed: 0,idx,company_name,interests,경영컨설팅마케팅,경제금융,과학공학기술IT,교육,문화역사,뷰티미용화장품,...,우대역량_기타,취재/인터뷰/기사 경험,컴퓨터 활용능력,콘텐츠 기획/제작 경험,타 대외활동 경험,파워블로거/SNS,행사 기획 경험,휴학 중/시간 투자 가능,모집 인원,모집인원규모
0,0,53987,스카이(SKY),과학공학기술IT,0,0,1,0,0,0,...,0,1,0,1,1,1,0,0,99,중
1,1,53989,서주제과(주),요리식품,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,99,중
2,2,53990,이석영뉴미디어도서관,문화역사,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,9,소
3,3,53991,관세청,경제금융,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,99,중
4,4,53993,독산동우시장일대 도시재생지원센터,문화역사,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,99,중


# 필요없는 열제거

In [102]:
df_activity=df_activity.drop(["모집 인원"],axis='columns')
df_activity=df_activity.drop(["Unnamed: 0"],axis='columns')
df_activity=df_activity.drop(["interests"],axis='columns')
df_activity=df_activity.drop(["prefer"],axis='columns')
df_activity.head(3)

Unnamed: 0,idx,company_name,경영컨설팅마케팅,경제금융,과학공학기술IT,교육,문화역사,뷰티미용화장품,사회공헌교류,언론미디어,...,외국어,우대역량_기타,취재/인터뷰/기사 경험,컴퓨터 활용능력,콘텐츠 기획/제작 경험,타 대외활동 경험,파워블로거/SNS,행사 기획 경험,휴학 중/시간 투자 가능,모집인원규모
0,53987,스카이(SKY),0,0,1,0,0,0,0,0,...,0,0,1,0,1,1,1,0,0,중
1,53989,서주제과(주),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,중
2,53990,이석영뉴미디어도서관,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,소


# train set, test set 분리

In [103]:
df_activity_train, df_activity_test = train_test_split(df_activity, random_state = 15688, test_size=0.30)



In [104]:
df_activity_train.head(3)

Unnamed: 0,idx,company_name,경영컨설팅마케팅,경제금융,과학공학기술IT,교육,문화역사,뷰티미용화장품,사회공헌교류,언론미디어,...,외국어,우대역량_기타,취재/인터뷰/기사 경험,컴퓨터 활용능력,콘텐츠 기획/제작 경험,타 대외활동 경험,파워블로거/SNS,행사 기획 경험,휴학 중/시간 투자 가능,모집인원규모
2184,60711,충청남도,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,대
2357,61245,미래에셋금융서비스,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,중
7122,83478,서울ICT이노베이션스퀘어,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,소


# 같은 대외활동끼리 묶기

In [105]:
# 비교할 때 제외할 열들의 이름 리스트
def data_splitter(df):
    exclude_cols = ['idx', 'company_name']

    grouped_cols = [col for col in df_activity.columns if col not in exclude_cols]  # 제외할 열을 제외한 열들의 리스트

    df_grouped = df_activity.groupby(grouped_cols).apply(lambda x: x.idx.tolist())
    
    # 결과값을 str 타입으로 저장한 이중 배열
    array_of_groups = df_grouped.apply(lambda x: [str(val) for val in x]).tolist()
    return array_of_groups

In [106]:
pd.options.mode.chained_assignment = None
activity_ls = data_splitter(df_activity_train)

# 데이터 랜덤으로 섞고, 유효한 데이터 골라내기

In [107]:
for activity in activity_ls:
    random.shuffle(activity)

In [108]:
filtered_array = [group for group in activity_ls if len(group) > 1]

In [109]:
len(filtered_array)

963

# Model 생성 및 학습

In [218]:
model = Word2Vec(
    sentences = filtered_array, # 전처리된 리스트를 파라미터로 사용
    epochs = 70, # epoch
    min_count = 1, # 
    vector_size = 100, # hidden layer 사이즈
    sg = 1, # skip-gram을 사용하면 1
    hs = 0, # negative sampling을 사용하면 0
    negative = 5, # 0 이상일 경우 negative sampling을 사용함.
    window=5,  # 주변 단어 창 크기
    workers=4  # 사용할 CPU 스레드 수
)

model.save('item2vec_20230516')

In [219]:
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
model = Word2Vec.load('item2vec_20230516')
word_vectors = model.wv

# 추천

In [220]:
def recommender(positive_list=None, negative_list=None, topn=20):
    recommend_activity_ls = []
    
    for activityId, prob in model.wv.most_similar_cosmul(positive=positive_list, negative=negative_list, topn=topn):
        recommend_activity_ls.append(activityId)
    return recommend_activity_ls

In [221]:
reco = [64165,61367]
reco_str = [str(num) for num in reco]
recommandations = recommender(positive_list=reco_str, topn=5)
reco_int = [int(value) for value in recommandations] # str -> int
print(reco_int)

[55145, 64478, 80186, 64636, 61323]


In [222]:
# 입력 데이터
filtered_activity_names = df_activity.loc[df_activity['idx'].isin(reco), 'company_name'].tolist()
print(filtered_activity_names)

['(재)대전일자리경제진흥원', '신세계아이앤씨   ']


In [223]:
# 출력 데이터
filtered_activity_names = df_activity.loc[df_activity['idx'].isin(reco_int), 'company_name'].tolist()
print(filtered_activity_names)

['이티에듀   ', '웨시   ', '텐덤   ', '한국마이크로소프트', '사단법인 점프']


In [224]:
print(model.wv.most_similar(["64165","61367"]))

[('55145', 0.995672881603241), ('64478', 0.9955165982246399), ('80186', 0.9951454401016235), ('64636', 0.994850754737854), ('61323', 0.994835376739502), ('91688', 0.994789719581604), ('55543', 0.9947173595428467), ('64816', 0.9946165680885315), ('78690', 0.994437038898468), ('57506', 0.9942106604576111)]
