# Connect to Google Drive

In [106]:
# Mount Google Drive at /content/data (Colab only)
from google.colab import drive
drive.mount("/content/data")

Drive already mounted at /content/data; to attempt to forcibly remount, call drive.mount("/content/data", force_remount=True).


In [107]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    The first column of ``answer_df`` is treated as the key and the second
    column as the item to predict. ``submission_df`` must contain columns
    with the same two names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value, averaged over the keys that appear in both
      frames. Returns 0.0 when no key appears in both frames.

    Raises:
    - ValueError: if any key in the submission does not have exactly 5
      predictions, if a predicted value is null, or if a key has the same
      predicted value more than once.
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Every key in the submission must come with exactly 5 predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if (prediction_counts != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Predicted values must not contain nulls
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # A (key, predicted value) pair must not repeat. Checking the pair on the
    # whole frame avoids a per-group apply and also works on an empty frame.
    if submission_df.duplicated(subset=[primary_col, secondary_col]).any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Keep only the predictions whose key also exists in the ground truth
    matched = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]
    if matched.empty:
        # Nothing to score; avoid np.mean([]) which yields NaN plus a warning
        return 0.0

    # Map each key to its predicted values (exactly 5 per key, validated above)
    top_5_preds = matched.groupby(primary_col)[secondary_col].apply(list).to_dict()

    # Map each key to its ground-truth values for easier lookup
    true_dict = answer_df.groupby(primary_col)[secondary_col].apply(list).to_dict()

    # The denominator is min(number of true items, 5) so that a key with
    # fewer than 5 true items can still reach a recall of 1.
    individual_recalls = [
        len(set(true_items) & set(top_5_preds[key])) / min(len(true_items), 5)
        for key, true_items in true_dict.items()
        if key in top_5_preds
    ]
    if not individual_recalls:
        return 0.0

    recall = np.mean(individual_recalls)
    return recall

In [108]:
# 모듈 Import
import numpy as np # 행렬 계산에 사용하는 모듈
import pandas as pd # 데이터 처리와 분석을 위한 모듈
import matplotlib.pyplot as plt # 데이터 시각화를 위한 모듈. 2D, 3D 그릴 때 사용
import seaborn as sns # 데이터 시각화를 위한 모듈. 두 데이터의 관계를 볼때 사용
from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

In [109]:
# EasyDict lets dictionary entries be read and written as attributes
# (dot notation), so the settings below behave like fields on an object.
import easydict
args = easydict.EasyDict()

# Input locations: every CSV is addressed relative to the base directory
args.default_path = "/content/data/MyDrive/Playdata/Competitions/ML/Dacon/"  # base directory
args.apply_train_path = f"{args.default_path}apply_train.csv"  # training data
args.company_path = f"{args.default_path}company.csv"
args.recruitment_path = f"{args.default_path}recruitment.csv"
args.resume_certificate_path = f"{args.default_path}resume_certificate.csv"
args.resume_education_path = f"{args.default_path}resume_education.csv"
args.resume_language_path = f"{args.default_path}resume_language.csv"
args.resume_path = f"{args.default_path}resume.csv"

args.default_submission_path = f"{args.default_path}sample_submission.csv"  # sample submission file

# Analysis settings
# Fixed seed so that re-running the same code gives the same result;
# intended for data splitting and model initialisation.
args.random_state = 42
args.results = []  # list that collects results

In [110]:
# Output locations, both under the "result/" folder of the base directory
args.submission_path = f"{args.default_path}result/submission_Model_2.csv"  # submission CSV
args.save_results = f"{args.default_path}result/model_results_Model_2.json"  # results JSON

# File Load

In [111]:
# Read each CSV into its own DataFrame; names and paths are paired by position.
(
    apply_train_df,
    company_df,
    recruitment_df,
    resume_certificate_df,
    resume_education_df,
    resume_language_df,
    resume_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        args.apply_train_path,
        args.company_path,
        args.recruitment_path,
        args.resume_certificate_path,
        args.resume_education_path,
        args.resume_language_path,
        args.resume_path,
    )
)


In [112]:
# Take an independent copy of every loaded frame for the processing below;
# names and source frames are paired by position.
(
    apply_train,
    company,
    recruitment,
    resume_certificate,
    resume_education,
    resume_language,
    resume,
) = (
    loaded.copy()
    for loaded in (
        apply_train_df,
        company_df,
        recruitment_df,
        resume_certificate_df,
        resume_education_df,
        resume_language_df,
        resume_df,
    )
)

# 탐색

### recruitment_seq 기준 합치기

In [113]:
# Order both tables by recruitment_seq
company, recruitment = (
    table.sort_values(by="recruitment_seq") for table in (company, recruitment)
)

In [114]:
# Left-join the company columns onto every recruitment row, then use
# recruitment_seq as the index of the combined table.
merged_recruitment = (
    recruitment
    .merge(company, how='left', on='recruitment_seq')
    .set_index('recruitment_seq')
)
merged_recruitment

Unnamed: 0_level_0,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword,company_type_seq,supply_kind,employee
recruitment_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,,5.0,201.0,631.0
R00002,3.0,,,0,0,2507;2703;2707,3,2,1,,2.0,201.0,160.0
R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,,,,
R00004,3.0,,,0,0,2507;2707,3,2,1,,2.0,402.0,500.0
R00005,3.0,,,0,0,2507;2707,3,2,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
R06691,3.0,,,0,0,2501;2507;2707,3,2,1,,,,
R06692,3.0,,,0,0,2201;2507,3,2,2,,4.0,402.0,150.0
R06693,5.0,,,0,0,2102;2707,4,2,1,,,,
R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,,,,


### resume_seq 기준 합치기

In [115]:
# Order the resume table by resume_seq
resume = resume.sort_values(by='resume_seq')

# Certificates: drop rows without content, then collapse to one
# ';'-separated string per resume_seq
resume_certificate = resume_certificate.sort_values(by='resume_seq')
resume_certificate = resume_certificate.dropna(subset=['certificate_contents'])
resume_certificate = resume_certificate.groupby('resume_seq')['certificate_contents'].apply(';'.join).reset_index()

resume_education = resume_education.sort_values(by='resume_seq')

# Language exams: build "language;exam_name;score" per row, then collapse to
# one '&'-separated string per resume_seq.
resume_language = resume_language.sort_values(by='resume_seq')
lang_parts = ['language', 'exam_name', 'score']
# Incomplete rows must be dropped BEFORE the string is built: astype(str)
# turns NaN into the text 'nan', so a dropna on the combined column can never
# remove anything and would leave entries such as 'nan;nan;nan'.
resume_language = resume_language.dropna(subset=lang_parts)
resume_language = resume_language.assign(
    lang_exam_score=(
        resume_language['language'].astype(str)
        + ';' + resume_language['exam_name'].astype(str)
        + ';' + resume_language['score'].astype(str)
    )
)
resume_language = resume_language.drop(lang_parts, axis=1)
resume_language = resume_language.groupby('resume_seq')['lang_exam_score'].apply('&'.join).reset_index()

In [116]:
# Left-join the certificate, education and language tables onto the resume
# table (in that order), then use resume_seq as the index.
merged_resume = (
    resume
    .merge(resume_certificate, how='left', on='resume_seq')
    .merge(resume_education, how='left', on='resume_seq')
    .merge(resume_language, how='left', on='resume_seq')
    .set_index('resume_seq')
)
merged_resume

Unnamed: 0_level_0,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,job_code_seq3,...,hischool_location_seq,univ_type_seq1,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score,lang_exam_score
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,,...,4,5,5,0,17,,,9,20.0,
U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,,...,0,5,5,0,3,,,8,90.0,
U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,,...,6,5,5,0,17,,,4,90.0,
U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,,...,5,5,5,0,5,,,3,70.0,2;11;661.62
U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,,...,5,0,0,0,0,,,10,50.0,2;4;873.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,,...,3,5,5,0,5,,,19,0.0,
U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,,...,3,3,6,0,3,가정과,,9,0.0,
U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,,...,0,5,5,0,10,미술대학섬유패션코디네이션학과,,9,80.0,
U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,,...,5,5,5,0,5,,,9,60.0,


In [117]:
apply_train.shape

(57946, 2)

- merged_recruitment 내용 확인

In [87]:
merged_recruitment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6695 entries, R00001 to R06695
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   address_seq1       6694 non-null   float64
 1   address_seq2       100 non-null    float64
 2   address_seq3       9 non-null      float64
 3   career_end         6695 non-null   int64  
 4   career_start       6695 non-null   int64  
 5   check_box_keyword  6695 non-null   object 
 6   education          6695 non-null   int64  
 7   major_task         6695 non-null   int64  
 8   qualifications     6695 non-null   int64  
 9   text_keyword       707 non-null    object 
 10  company_type_seq   2377 non-null   float64
 11  supply_kind        2377 non-null   float64
 12  employee           2377 non-null   float64
dtypes: float64(6), int64(5), object(2)
memory usage: 732.3+ KB


In [118]:
merged_recruitment.describe()

Unnamed: 0,address_seq1,address_seq2,address_seq3,career_end,career_start,education,major_task,qualifications,company_type_seq,supply_kind,employee
count,6694.0,100.0,9.0,6695.0,6695.0,6695.0,6695.0,6695.0,2377.0,2377.0,2377.0
mean,3.103675,7.9,16.555556,0.0,0.0,3.081703,2.277521,1.317401,2.944468,344.090029,217.696676
std,0.955459,7.399836,6.424778,0.0,0.0,0.753074,1.242267,0.477852,1.739406,131.232212,331.153073
min,1.0,1.0,3.0,0.0,0.0,2.0,1.0,0.0,2.0,100.0,0.0
25%,3.0,1.0,18.0,0.0,0.0,3.0,2.0,1.0,2.0,402.0,41.0
50%,3.0,5.0,20.0,0.0,0.0,3.0,2.0,1.0,2.0,402.0,145.0
75%,3.0,14.0,20.0,0.0,0.0,4.0,2.0,2.0,4.0,402.0,300.0
max,20.0,20.0,20.0,0.0,0.0,6.0,10.0,2.0,9.0,605.0,12223.0


- address : 범주
- career : 다 0?
- education : 범주
- major_task : 범주
- qualifications : 범주
- company_type_seq : 범주
- supply_kind : 수치
- employee : 수치

In [119]:
(merged_recruitment.isnull().sum() / len(merged_recruitment)).sort_values()

career_end           0.000000
career_start         0.000000
check_box_keyword    0.000000
education            0.000000
major_task           0.000000
qualifications       0.000000
address_seq1         0.000149
company_type_seq     0.644959
supply_kind          0.644959
employee             0.644959
text_keyword         0.894399
address_seq2         0.985063
address_seq3         0.998656
dtype: float64

- address_seq1 : mode
- address_seq 2 ~ 3 : drop
- text_keyword : drop?
- company_type_seq, supply_kind, employee : mode?
- career_end/start : drop?

In [120]:
# Remove the listed columns from merged_recruitment in place
drop_col = [
    "address_seq2",
    "address_seq3",
    "text_keyword",
    "career_end",
    "career_start",
]
merged_recruitment.drop(drop_col, axis=1, inplace=True)

In [121]:
addr_cnt = merged_recruitment["address_seq1"].value_counts()
addr_cnt

3.0     6490
5.0      174
20.0      17
11.0       5
1.0        4
13.0       2
2.0        1
9.0        1
Name: address_seq1, dtype: int64

In [122]:
com_type_cnt = merged_recruitment["company_type_seq"].value_counts()
com_type_cnt

2.0    1658
4.0     410
5.0     161
9.0     109
7.0      35
3.0       4
Name: company_type_seq, dtype: int64

In [123]:
supply_kind_cnt = merged_recruitment["supply_kind"].value_counts()
supply_kind_cnt

402.0    1606
100.0     463
514.0     126
201.0     101
508.0      33
501.0      20
515.0       6
601.0       5
409.0       5
507.0       4
413.0       2
502.0       1
513.0       1
605.0       1
603.0       1
512.0       1
412.0       1
Name: supply_kind, dtype: int64

In [124]:
employee_cnt = merged_recruitment["employee"].value_counts(normalize=True)
employee_cnt

90.0     0.080774
20.0     0.074884
250.0    0.068153
150.0    0.062684
590.0    0.059739
           ...   
280.0    0.000421
23.0     0.000421
18.0     0.000421
45.0     0.000421
11.0     0.000421
Name: employee, Length: 81, dtype: float64

In [125]:
# Replacement values for missing data: the most frequent value for
# address_seq1, company_type_seq and supply_kind, and the median for employee.
# mode() returns a Series; its first entry is taken.
addr_mode = merged_recruitment["address_seq1"].mode().iloc[0]
company_type_mode = merged_recruitment["company_type_seq"].mode().iloc[0]
supply_kind_mode = merged_recruitment["supply_kind"].mode().iloc[0]
employee_median = merged_recruitment["employee"].median()

In [126]:
# Fill the missing values with one fillna call on the frame itself.
# Calling fillna(..., inplace=True) on a column selection is chained
# assignment: pandas 2.x warns about it and, under Copy-on-Write, the frame
# is left unchanged.
merged_recruitment = merged_recruitment.fillna({
    "address_seq1": addr_mode,
    "company_type_seq": company_type_mode,
    "supply_kind": supply_kind_mode,
    "employee": employee_median,
})

In [127]:
(merged_recruitment.isnull().sum() / len(merged_recruitment)).sort_values()

address_seq1         0.0
check_box_keyword    0.0
education            0.0
major_task           0.0
qualifications       0.0
company_type_seq     0.0
supply_kind          0.0
employee             0.0
dtype: float64

- merged_resume 내용 확인

In [128]:
merged_resume.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8482 entries, U00001 to U08482
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   reg_date               8482 non-null   object 
 1   updated_date           8482 non-null   object 
 2   degree                 8482 non-null   int64  
 3   graduate_date          8482 non-null   int64  
 4   hope_salary            8482 non-null   float64
 5   last_salary            8482 non-null   float64
 6   text_keyword           8287 non-null   object 
 7   job_code_seq1          8482 non-null   object 
 8   job_code_seq2          500 non-null    object 
 9   job_code_seq3          187 non-null    object 
 10  career_month           8482 non-null   int64  
 11  career_job_code        7724 non-null   object 
 12  certificate_contents   5976 non-null   object 
 13  hischool_type_seq      8482 non-null   int64  
 14  hischool_special_type  8482 non-null   object 
 15  hi

In [129]:
merged_resume.describe()

Unnamed: 0,degree,graduate_date,hope_salary,last_salary,career_month,hischool_type_seq,hischool_location_seq,univ_type_seq1,univ_type_seq2,univ_transfer,univ_location,univ_major_type,univ_score
count,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0,8482.0
mean,3.938222,1819.586065,1395.095496,2505.706201,69.408984,16.402499,5.738387,4.204551,4.684626,0.00448,6.651733,8.095143,71.38035
std,0.440764,584.009224,1865.951253,2010.082733,63.547948,8.485975,4.886454,1.608187,1.89288,0.066787,5.272027,5.152574,20.418688
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,2001.0,0.0,0.0,18.0,20.0,3.0,5.0,5.0,0.0,3.0,4.0,70.0
50%,4.0,2007.0,0.0,2700.0,53.0,21.0,4.0,5.0,5.0,0.0,5.0,9.0,80.0
75%,4.0,2011.0,2700.0,3700.0,108.0,21.0,9.0,5.0,5.0,0.0,10.0,9.0,80.0
max,6.0,2022.0,10000.0,10000.0,432.0,23.0,18.0,6.0,12.0,1.0,19.0,23.0,100.0


In [130]:
merged_resume.columns

Index(['reg_date', 'updated_date', 'degree', 'graduate_date', 'hope_salary',
       'last_salary', 'text_keyword', 'job_code_seq1', 'job_code_seq2',
       'job_code_seq3', 'career_month', 'career_job_code',
       'certificate_contents', 'hischool_type_seq', 'hischool_special_type',
       'hischool_nation', 'hischool_gender', 'hischool_location_seq',
       'univ_type_seq1', 'univ_type_seq2', 'univ_transfer', 'univ_location',
       'univ_major', 'univ_sub_major', 'univ_major_type', 'univ_score',
       'lang_exam_score'],
      dtype='object')

- degree : 범주
- updated_date : 날짜
- hope_salary : 수치
- last_salary : 수치
- career_month : 수치
- hischool_type_seq : 범주?
- hischool_location_seq : 범주
- univ_type_seq1 : 범주
- univ_type_seq2 : 범주
- univ_transfer : ?
- univ_location :범주
- univ_major_type : 범주
- univ_score : 수치

In [131]:
(merged_resume.isnull().sum() / len(merged_resume)).sort_values()

reg_date                 0.000000
univ_major_type          0.000000
univ_location            0.000000
univ_transfer            0.000000
univ_type_seq2           0.000000
univ_type_seq1           0.000000
hischool_location_seq    0.000000
hischool_gender          0.000000
hischool_nation          0.000000
hischool_special_type    0.000000
univ_score               0.000000
hischool_type_seq        0.000000
career_month             0.000000
job_code_seq1            0.000000
last_salary              0.000000
hope_salary              0.000000
graduate_date            0.000000
degree                   0.000000
updated_date             0.000000
text_keyword             0.022990
career_job_code          0.089366
certificate_contents     0.295449
univ_major               0.788375
lang_exam_score          0.903325
job_code_seq2            0.941052
univ_sub_major           0.950955
job_code_seq3            0.977953
dtype: float64

- job_code_seq3, univ_sub_major, job_code_seq2, lang_exam_score = drop
- text_keyword, career_job_code, certificate_contents, univ_major : 확률 확인

In [134]:
# Remove the listed columns from merged_resume in place
drop_col = [
    "job_code_seq3",
    "univ_sub_major",
    "job_code_seq2",
    "lang_exam_score",
]
merged_resume.drop(drop_col, axis=1, inplace=True)

In [135]:
(merged_resume.isnull().sum() / len(merged_resume)).sort_values()

reg_date                 0.000000
univ_location            0.000000
univ_transfer            0.000000
univ_type_seq2           0.000000
univ_type_seq1           0.000000
hischool_location_seq    0.000000
hischool_gender          0.000000
hischool_nation          0.000000
hischool_special_type    0.000000
univ_major_type          0.000000
hischool_type_seq        0.000000
career_month             0.000000
job_code_seq1            0.000000
last_salary              0.000000
hope_salary              0.000000
graduate_date            0.000000
degree                   0.000000
updated_date             0.000000
univ_score               0.000000
text_keyword             0.022990
career_job_code          0.089366
certificate_contents     0.295449
univ_major               0.788375
dtype: float64

In [136]:
merged_resume["text_keyword"]

resume_seq
U00001                              디자이너
U00002                              디자이너
U00003                     남성복디자이너;TD캐주얼
U00004                         상품기획;영업기획
U00005                  인사;총무;경영;MD;상품기획
                       ...              
U08478                              상품기획
U08479                              디자이너
U08480    VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인
U08481                   상품기획;머천다이저;기획MD
U08482       상품기획;영업MD;머천다이저;기획MD;마케팅;PR
Name: text_keyword, Length: 8482, dtype: object

In [137]:
merged_resume["career_job_code"]

resume_seq
U00001            NaN
U00002    기타 공공;개인서비스
U00003    섬유;봉제;가방;의류
U00004    섬유;봉제;가방;의류
U00005            NaN
             ...     
U08478    섬유;봉제;가방;의류
U08479     디자인;CAD;설계
U08480            NaN
U08481    섬유;봉제;가방;의류
U08482            NaN
Name: career_job_code, Length: 8482, dtype: object

In [138]:
merged_resume["certificate_contents"]

resume_seq
U00001                                               NaN
U00002                                  자동차 운전면허 (2종 보통)
U00003                       운전면허2종보통;컬러리스트 기사;워드프로세서 1급
U00004            정보처리기능사;유통관리사 2급;MOS EXCEL 2003 EXPERT
U00005                                               NaN
                               ...                      
U08478                                               NaN
U08479                                      컴퓨터그래픽스운용기능사
U08480    2종 보통 운전면허증;컴퓨터그래픽스운용기능사;컬러리스트기사;컴퓨터패션디자인운용마스터
U08481                                        컴퓨터활용능력 2급
U08482                        컴퓨터그래픽스운용기능사;패션 머천다이징산업 기사
Name: certificate_contents, Length: 8482, dtype: object

In [139]:
merged_resume["univ_major"]

resume_seq
U00001                NaN
U00002                NaN
U00003                NaN
U00004                NaN
U00005                NaN
               ...       
U08478                NaN
U08479                가정과
U08480    미술대학섬유패션코디네이션학과
U08481                NaN
U08482                NaN
Name: univ_major, Length: 8482, dtype: object

In [None]:
# Train/validation split: for every resume_seq, the last listed
# recruitment_seq is held out and all earlier ones go to the training pairs.
resume_train, resume_test = [], []
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in apply_train_groupby.items():
    resume_train.extend([uid, iid] for iid in iids[:-1])
    resume_test.append([uid, iids[-1]])

In [None]:
# Turn both pair lists into two-column DataFrames
apply_tr, apply_te = (
    pd.DataFrame(pairs, columns=['resume_seq', 'recruitment_seq'])
    for pairs in (resume_train, resume_test)
)

In [None]:
# Train/validation split on the originally loaded frame: per resume_seq the
# last listed recruitment_seq becomes the validation pair, the rest training.
apply_train_groupby = apply_train_df.groupby('resume_seq')['recruitment_seq'].apply(list)
train = [
    [uid, iid]
    for uid, iids in apply_train_groupby.items()
    for iid in iids[:-1]
]
val = [[uid, iids[-1]] for uid, iids in apply_train_groupby.items()]

In [None]:
# Turn the pair lists into two-column DataFrames; the full application
# table (copied) is used for the final predictions.
train, val = (
    pd.DataFrame(pairs, columns=['resume_seq', 'recruitment_seq'])
    for pairs in (train, val)
)
pred = apply_train_df.copy()

In [None]:
# Build one matrix per data set: rows are resume_seq, columns are
# recruitment_seq, each cell is the number of rows for that pair (0 if none).
train_user_item_matrix, pred_user_item_matrix = (
    pairs.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
    for pairs in (train, pred)
)

In [None]:
# Cosine similarity between matrix rows (resume vs. resume) and, via the
# transposed matrix, between matrix columns (recruitment vs. recruitment).
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.transpose())

# Same two similarities for the full data used for the final predictions
pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.transpose())

In [None]:
# Score every (resume, recruitment) pair in two ways:
#   user-based: user similarity  x  user-item matrix
#   item-based: user-item matrix x  item similarity
# NOTE(review): the operand order matters. The user-based product starts from
# the similarity array and is later indexed by row position, while the
# item-based product starts from the DataFrame and is later indexed with
# .loc[user]; presumably the former is a plain array and the latter keeps
# the resume_seq index. Confirm before changing the form.
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

# Same scores computed on the full data for the final predictions
pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [None]:
alpha = 0.98  # weight applied to the item-based scores before adding the user-based scores
train_recommendations = []
# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
# the length of the index and show a total in the progress bar.
for idx, user in enumerate(tqdm(train_user_item_matrix.index)):
    user_row = train_user_item_matrix.loc[user]
    # Cells hold application counts, so any positive count (not only exactly
    # 1) marks a job this user has already applied to.
    applied_jobs = set(user_row[user_row > 0].index)

    # Combined score per job for this user, ordered from highest to lowest
    combined_scores = train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]
    sorted_job_indices = combined_scores.argsort()[::-1]
    # Keep the 5 best-scoring jobs the user has not applied to yet
    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    train_recommendations.extend([user, job] for job in recommended_jobs)

In [None]:
# Collect the validation recommendations into a two-column DataFrame
val_prediction = pd.DataFrame(
    data=train_recommendations,
    columns=['resume_seq', 'recruitment_seq'],
)

In [None]:
recall5(val,val_prediction)

In [None]:
recall5(val,val_prediction)

In [None]:
alpha = 0.98  # weight applied to the item-based scores before adding the user-based scores
pred_recommendations = []
# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
# the length of the index and show a total in the progress bar.
for idx, user in enumerate(tqdm(pred_user_item_matrix.index)):
    user_row = pred_user_item_matrix.loc[user]
    # Cells hold application counts, so any positive count (not only exactly
    # 1) marks a job this user has already applied to.
    applied_jobs = set(user_row[user_row > 0].index)

    # Combined score per job for this user, ordered from highest to lowest
    combined_scores = pred_item_predicted_scores.loc[user].values * alpha + pred_user_predicted_scores[idx]
    sorted_job_indices = combined_scores.argsort()[::-1]
    # Keep the 5 best-scoring jobs the user has not applied to yet
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    pred_recommendations.extend([user, job] for job in recommended_jobs)

# Submission

In [None]:
import os

# Collect the final recommendations and write them to the submission CSV
# (without the index column).
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
# to_csv does not create missing folders; make sure the target folder exists
# so the write cannot fail after the long computation above.
os.makedirs(os.path.dirname(args.submission_path), exist_ok=True)
top_recommendations.to_csv(args.submission_path, index=False)

# 데이터 전처리 (train data, test data)

## EDA (train data)

## 결측치 제거

## 신규컬럼 생성

## Encoding

## Scaling

# 앙상블 Modeling (train_f, train_t)
- shape, 결측치 확인 필수

## 데이터 분리 (train data)

## Experiment

- 필요한 라이브러리 import

### Cross Validation

### Model

### HPO

## Best Model 해석

### Cross Validation

### Best Model 생성 및 학습

### Confusion Matrix

### HeatMap by Confusion Matrix

# 예측

# 평가

# Result 저장 (Submission)
- Dictionary List로 sort해서 best model select