In [224]:
import lightgbm
import glob, os
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

------
## 1. 전체 석사 졸업 예측

In [226]:
# Collect the master's-program CSVs under ./original_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_ms_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*ms.csv')))
print(total_ms_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_grad_ms.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_ms.csv']


In [227]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_ms_list[0] is not reliably the graduates file.
_ms_grad_paths = [p for p in total_ms_list if '_grad_' in os.path.basename(p)]
_ms_enrolled_paths = [p for p in total_ms_list if '_grad_' not in os.path.basename(p)]
if len(_ms_grad_paths) != 1 or len(_ms_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_ms_list}"
    )

ms_grad = pd.read_csv(_ms_grad_paths[0])           # graduates: used for training
ms_undergrad = pd.read_csv(_ms_enrolled_paths[0])  # enrolled students: to be predicted


In [228]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
ms_grad = ms_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                  'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in ms_grad.columns]
if _missing:
    raise KeyError(f"ms_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
ms_grad = ms_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [229]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the graduates,
# followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = ms_grad[grad_cat_features + grad_num_features]
y_grad = ms_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

print("masters cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)

Fold 0:
               precision    recall  f1-score   support

           2       0.00      0.00      0.00         4
           3       1.00      0.75      0.86         8
           4       0.97      1.00      0.98       925
           5       0.50      0.05      0.10        19
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.97       961
   macro avg       0.41      0.30      0.32       961
weighted avg       0.95      0.97      0.96       961

Fold 1:
               precision    recall  f1-score   support

           2       1.00      0.67      0.80         3
           3       1.00      0.88      0.93         8
           4       0.97      1.00      0.99       925
           5       0.00      0.00      0.00        19
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1

    accuracy                           0.97       960
   ma

### 재학자에 대한 예측

In [230]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '인건비총액', '서류점수',
#   '면접점수', '학부출신', '대학원출신'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
ms_undergrad = ms_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                            'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in ms_undergrad.columns]
if _missing:
    raise KeyError(f"ms_undergrad is missing expected columns: {_missing}")

ms_undergrad = ms_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = ms_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with ms_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [231]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
ms_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(_out_dir, exist_ok=True)
ms_undergrad.to_csv(os.path.join(_out_dir, 'std_info_ms_predicted.csv'))

------
## 2. 전체 박사 졸업 예측

In [232]:
# Collect the PhD-program CSVs under ./original_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_phd_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*phd.csv')))
print(total_phd_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_grad_phd.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_phd.csv']


In [233]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_phd_list[0] is not reliably the graduates file.
_phd_grad_paths = [p for p in total_phd_list if '_grad_' in os.path.basename(p)]
_phd_enrolled_paths = [p for p in total_phd_list if '_grad_' not in os.path.basename(p)]
if len(_phd_grad_paths) != 1 or len(_phd_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_phd_list}"
    )

phd_grad = pd.read_csv(_phd_grad_paths[0])           # graduates: used for training
phd_undergrad = pd.read_csv(_phd_enrolled_paths[0])  # enrolled students: to be predicted


In [234]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
phd_grad = phd_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                    'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in phd_grad.columns]
if _missing:
    raise KeyError(f"phd_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
phd_grad = phd_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [235]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the PhD graduates,
# followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = phd_grad[grad_cat_features + grad_num_features]
y_grad = phd_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

# Label corrected: this section evaluates the PhD data, not the master's data.
print("phd cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)

Fold 0:
               precision    recall  f1-score   support

           4       0.93      0.99      0.96       112
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         1

    accuracy                           0.93       120
   macro avg       0.31      0.33      0.32       120
weighted avg       0.87      0.93      0.90       120

Fold 1:
               precision    recall  f1-score   support

           4       0.93      1.00      0.97       112
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         1

    accuracy                           0.93       120
   macro avg       0.31      0.33      0.32       120
weighted avg       0.87      0.93      0.90       120

Fold 2:
               precision    recall  f1-score   support

           4       0.94      1.00      0.97       112
           5       1.00      0.14      0.25         7
           6       0.00      0.00      0.00     

### 재학자에 대한 예측

In [236]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '서류점수', '면접점수',
#   '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
phd_undergrad = phd_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                              'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in phd_undergrad.columns]
if _missing:
    raise KeyError(f"phd_undergrad is missing expected columns: {_missing}")

phd_undergrad = phd_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = phd_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with phd_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [237]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
phd_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(_out_dir, exist_ok=True)
phd_undergrad.to_csv(os.path.join(_out_dir, 'std_info_phd_predicted.csv'))

------
## 3. 전체 석박통합 졸업 예측

In [238]:
# Collect the integrated MS/PhD-program CSVs under ./original_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_combined_list = sorted(glob.glob(os.path.join(os.getcwd(), 'original_data', '*combined.csv')))
print(total_combined_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_combined.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\original_data\\std_info_grad_combined.csv']


In [239]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_combined_list[1] is not reliably the graduates file.
_combined_grad_paths = [p for p in total_combined_list if '_grad_' in os.path.basename(p)]
_combined_enrolled_paths = [p for p in total_combined_list if '_grad_' not in os.path.basename(p)]
if len(_combined_grad_paths) != 1 or len(_combined_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_combined_list}"
    )

combined_grad = pd.read_csv(_combined_grad_paths[0])           # graduates: used for training
combined_undergrad = pd.read_csv(_combined_enrolled_paths[0])  # enrolled students: to be predicted


In [240]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
combined_grad = combined_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                              'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in combined_grad.columns]
if _missing:
    raise KeyError(f"combined_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
combined_grad = combined_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [241]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the integrated MS/PhD
# graduates, followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = combined_grad[grad_cat_features + grad_num_features]
y_grad = combined_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

# Label corrected: this section evaluates the integrated MS/PhD data, not the master's data.
print("combined cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)



Fold 0:
               precision    recall  f1-score   support

           6       0.79      0.94      0.86        32
           7       0.33      0.20      0.25         5
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         1

    accuracy                           0.76        41
   macro avg       0.28      0.28      0.28        41
weighted avg       0.66      0.76      0.70        41

Fold 1:
               precision    recall  f1-score   support

           6       0.89      1.00      0.94        32
           7       1.00      0.20      0.33         5
           8       1.00      1.00      1.00         3

    accuracy                           0.90        40
   macro avg       0.96      0.73      0.76        40
weighted avg       0.91      0.90      0.87        40

Fold 2:
               precision    recall  f1-score   support

           6       0.88      0.91      0.89        32
           7       0.25      0.20      0.22     

### 재학자에 대한 예측

In [243]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '인건비총액', '서류점수',
#   '면접점수', '학부출신', '대학원출신'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
combined_undergrad = combined_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                                        'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in combined_undergrad.columns]
if _missing:
    raise KeyError(f"combined_undergrad is missing expected columns: {_missing}")

combined_undergrad = combined_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = combined_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with combined_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [244]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
combined_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'original_data')
os.makedirs(_out_dir, exist_ok=True)
combined_undergrad.to_csv(os.path.join(_out_dir, 'std_info_combined_predicted.csv'))

------
------
## 1. 공학계열 석사 졸업 예측

In [245]:
# Collect the engineering master's-program CSVs under ./eng_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_ms_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*ms_eng.csv')))
print(total_ms_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_grad_ms_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_ms_eng.csv']


In [246]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_ms_list[0] is not reliably the graduates file.
_ms_grad_paths = [p for p in total_ms_list if '_grad_' in os.path.basename(p)]
_ms_enrolled_paths = [p for p in total_ms_list if '_grad_' not in os.path.basename(p)]
if len(_ms_grad_paths) != 1 or len(_ms_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_ms_list}"
    )

ms_grad = pd.read_csv(_ms_grad_paths[0])           # graduates: used for training
ms_undergrad = pd.read_csv(_ms_enrolled_paths[0])  # enrolled students: to be predicted


In [247]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
ms_grad = ms_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                  'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in ms_grad.columns]
if _missing:
    raise KeyError(f"ms_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
ms_grad = ms_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [248]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the engineering master's
# graduates, followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = ms_grad[grad_cat_features + grad_num_features]
y_grad = ms_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

print("masters cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)



Fold 0:
               precision    recall  f1-score   support

           4       0.98      0.99      0.98       289
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.97       297
   macro avg       0.24      0.25      0.25       297
weighted avg       0.95      0.97      0.96       297

Fold 1:
               precision    recall  f1-score   support

           4       0.98      1.00      0.99       289
           5       0.00      0.00      0.00         6
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         1

    accuracy                           0.98       297
   macro avg       0.49      0.50      0.50       297
weighted avg       0.95      0.98      0.96       297

Fold 2:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00     

### 재학자에 대한 예측

In [249]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '인건비총액', '서류점수',
#   '면접점수', '학부출신', '대학원출신'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
ms_undergrad = ms_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                            'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in ms_undergrad.columns]
if _missing:
    raise KeyError(f"ms_undergrad is missing expected columns: {_missing}")

ms_undergrad = ms_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = ms_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with ms_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [250]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
ms_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(_out_dir, exist_ok=True)
ms_undergrad.to_csv(os.path.join(_out_dir, 'std_info_ms_eng_predicted.csv'))

------
## 2. 공학계열 박사 졸업 예측

In [251]:
# Collect the engineering PhD-program CSVs under ./eng_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_phd_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*phd_eng.csv')))
print(total_phd_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_grad_phd_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_phd_eng.csv']


In [252]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_phd_list[0] is not reliably the graduates file.
_phd_grad_paths = [p for p in total_phd_list if '_grad_' in os.path.basename(p)]
_phd_enrolled_paths = [p for p in total_phd_list if '_grad_' not in os.path.basename(p)]
if len(_phd_grad_paths) != 1 or len(_phd_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_phd_list}"
    )

phd_grad = pd.read_csv(_phd_grad_paths[0])           # graduates: used for training
phd_undergrad = pd.read_csv(_phd_enrolled_paths[0])  # enrolled students: to be predicted


In [253]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '서류점수', '면접점수', '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
phd_grad = phd_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                    'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in phd_grad.columns]
if _missing:
    raise KeyError(f"phd_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
phd_grad = phd_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [254]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the engineering PhD
# graduates, followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = phd_grad[grad_cat_features + grad_num_features]
y_grad = phd_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

# Label corrected: this section evaluates the PhD data, not the master's data.
print("phd cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)

Fold 0:
               precision    recall  f1-score   support

           4       0.90      1.00      0.95        18
           5       0.00      0.00      0.00         2

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20

Fold 1:
               precision    recall  f1-score   support

           4       0.90      1.00      0.95        18
           5       0.00      0.00      0.00         2

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20

Fold 2:
               precision    recall  f1-score   support

           4       0.95      1.00      0.97        18
           5       0.00      0.00      0.00         1

    accuracy                           0.95        19
   macro avg       0.47      0.50      0.49        19
weighted avg       0.90      0.95      0.92    

### 재학자에 대한 예측

In [255]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '서류점수', '면접점수',
#   '학부출신', '대학원출신', '인건비총액'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
phd_undergrad = phd_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                              'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in phd_undergrad.columns]
if _missing:
    raise KeyError(f"phd_undergrad is missing expected columns: {_missing}")

phd_undergrad = phd_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = phd_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with phd_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [256]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
phd_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(_out_dir, exist_ok=True)
phd_undergrad.to_csv(os.path.join(_out_dir, 'std_info_phd_eng_predicted.csv'))

------
## 3. 공학계열 석박통합 졸업 예측

In [257]:
# Collect the engineering integrated MS/PhD-program CSVs under ./eng_data.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the positional indexing in the following cell deterministic.
total_combined_list = sorted(glob.glob(os.path.join(os.getcwd(), 'eng_data', '*combined_eng.csv')))
print(total_combined_list)

['f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_combined_eng.csv', 'f:\\OneDrive - 고려대학교\\내 드라이브\\고려대학교\\대학원\\장학금\\BK21 4단계 대학원 혁신 펠로우쉽 연구과제\\내외국인_공대\\eng_data\\std_info_grad_combined_eng.csv']


In [258]:
# Pick each CSV by file name rather than by list position: glob.glob() gives no ordering
# guarantee, so total_combined_list[1] is not reliably the graduates file.
_combined_grad_paths = [p for p in total_combined_list if '_grad_' in os.path.basename(p)]
_combined_enrolled_paths = [p for p in total_combined_list if '_grad_' not in os.path.basename(p)]
if len(_combined_grad_paths) != 1 or len(_combined_enrolled_paths) != 1:
    raise ValueError(
        "expected exactly one '*_grad_*' file and one enrolled-student file, "
        f"got: {total_combined_list}"
    )

combined_grad = pd.read_csv(_combined_grad_paths[0])           # graduates: used for training
combined_undergrad = pd.read_csv(_combined_enrolled_paths[0])  # enrolled students: to be predicted


In [259]:
# Raw columns of the graduates file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec014_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec014_birth_dt', '졸업년도', '졸업학기', 'gpa', 'rec014_school_term',
#   '인건비총액', '서류점수', '면접점수', '학부출신', '대학원출신'
#
# Strip the rec014_ prefix from the two columns used for training. Renaming by name
# (instead of overwriting every column label positionally) cannot mislabel data when the
# CSV column order changes, and re-running this cell is a no-op.
combined_grad = combined_grad.rename(columns={'rec014_nation_cd': 'nation_cd',
                                              'rec014_school_term': 'school_term'})

grad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
grad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
grad_term = ['school_term']                                             # target column

# Fail early, with a readable message, if the file does not have the expected schema.
_missing = [c for c in grad_cat_features + grad_num_features + grad_term
            if c not in combined_grad.columns]
if _missing:
    raise KeyError(f"combined_grad is missing expected columns: {_missing}")

# Cast the categorical features to pandas 'category' dtype in one call.
combined_grad = combined_grad.astype({col: 'category' for col in grad_cat_features})

### 졸업자 정보에 대하여 Stratified 5-Fold 학습


In [260]:
# Stratified 5-fold cross-validation of a LightGBM classifier on the engineering
# integrated MS/PhD graduates, followed by a final fit on all graduates.
model = lightgbm.LGBMClassifier()
skfold = StratifiedKFold(n_splits=5)

cv_accuracy = []

X_grad = combined_grad[grad_cat_features + grad_num_features]
y_grad = combined_grad[grad_term]
y_grad_values = y_grad.values.ravel()   # 1-D label array for sklearn / LightGBM

# enumerate() replaces the manual counter that was named `iter` and shadowed the builtin.
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_grad, y_grad_values)):
    model.fit(X_grad.iloc[train_idx], y_grad_values[train_idx])
    pred = model.predict(X_grad.iloc[test_idx])
    cv_accuracy.append(accuracy_score(y_grad_values[test_idx], pred))
    print(f"Fold {fold}:\n",classification_report(y_grad_values[test_idx], pred, zero_division=0))

# Label corrected: this section evaluates the integrated MS/PhD data, not the master's data.
print("combined cross validation result : ", cv_accuracy)

# The loop leaves `model` fitted on the last fold's training split only (4/5 of the rows).
# Refit on every graduate so the predictions made later in the notebook use all the data.
model.fit(X_grad, y_grad_values)



Fold 0:
               precision    recall  f1-score   support

           6       0.89      1.00      0.94        16
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1

    accuracy                           0.89        18
   macro avg       0.30      0.33      0.31        18
weighted avg       0.79      0.89      0.84        18

Fold 1:
               precision    recall  f1-score   support

           6       0.89      1.00      0.94        16
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1

    accuracy                           0.89        18
   macro avg       0.30      0.33      0.31        18
weighted avg       0.79      0.89      0.84        18

Fold 2:
               precision    recall  f1-score   support

           6       0.94      1.00      0.97        16
           7       0.00      0.00      0.00         1
           8       1.00      1.00      1.00     

### 재학자에 대한 예측

In [261]:
# Raw columns of the enrolled-students file, as recorded when this notebook was written:
#   'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', '학번', '학생소속대학명', '학생교육부계열',
#   '성별', '입학과정', '학적', '학과', '전공', '전공교수', 'rec012_nation_cd', '과정', '수료년도',
#   '수료학기', 'rec012_birth_dt', 'gpa', 'rec012_school_term', '인건비총액', '서류점수',
#   '면접점수', '학부출신', '대학원출신'
#
# Strip the rec012_ prefix by name so the feature names match the training frame.
# A positional overwrite of all labels would silently mislabel columns if the CSV
# column order ever changed (it already differs between the input files).
combined_undergrad = combined_undergrad.rename(columns={'rec012_nation_cd': 'nation_cd',
                                                        'rec012_school_term': 'school_term'})

undergrad_cat_features = ['학생소속대학명', '학생교육부계열', '성별', '입학과정', '학과', 'nation_cd', '학부출신']    # categorical features
undergrad_num_features = ['gpa', '서류점수', '면접점수', '인건비총액']       # continuous features
undergrad_term = ['school_term']

# Fail early, with a readable message, if a model input column is absent.
_missing = [c for c in undergrad_cat_features + undergrad_num_features
            if c not in combined_undergrad.columns]
if _missing:
    raise KeyError(f"combined_undergrad is missing expected columns: {_missing}")

combined_undergrad = combined_undergrad.astype({col: 'category' for col in undergrad_cat_features})

X_undergrad = combined_undergrad[undergrad_cat_features + undergrad_num_features]

# `model` is the classifier fitted in the preceding cross-validation cell.
# The predictions carry the frame's index so they stay row-aligned with combined_undergrad.
y_undergrad = pd.DataFrame(model.predict(X_undergrad), index=X_undergrad.index)

In [262]:
# Attach the predictions as a plain 1-D array: a length mismatch (e.g. a stale y_undergrad
# left over from another section) then raises instead of being index-aligned into NaNs.
combined_undergrad['predicted_school_term'] = y_undergrad.to_numpy().ravel()

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
_out_dir = os.path.join(os.getcwd(), 'prediction_result', 'eng_data')
os.makedirs(_out_dir, exist_ok=True)
combined_undergrad.to_csv(os.path.join(_out_dir, 'std_info_combined_eng_predicted.csv'))