## 초등학교 merge

In [None]:
import pandas as pd

# Load the three elementary-school CSV files (file names: student base info,
# expert labelling, counselling records), all read as UTF-8.
element_student = pd.read_csv('학생기초정보데이터_초등학교.csv', encoding='utf-8')
element_professional = pd.read_csv('전문가라벨링데이터_초등학교.csv', encoding='utf-8')
element_text = pd.read_csv('상담기록데이터_초등학교.csv', encoding='utf-8')

# Preview the first rows of each frame: expert labels, students, then records.
for preview_frame in (element_professional, element_student, element_text):
    display(preview_frame.head())


In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
element_temp = element_professional.merge(
    element_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
merge_element_data = element_temp.merge(element_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(merge_element_data.iloc[:2])
print(merge_element_data.shape)

In [None]:
# Columns removed from the merged elementary-school frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
merge_element_data = merge_element_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the elementary-school frame after the column drop:
# first two rows, then (rows, columns).
display(merge_element_data.iloc[:2])
print(merge_element_data.shape)

## 중학교 merge

In [None]:
import pandas as pd

# Load the three middle-school CSV files (file names: student base info,
# expert labelling, counselling records), all read as UTF-8.
middle_student = pd.read_csv(
    '학생기초정보데이터_중학교.csv',
    encoding='utf-8',
)
middle_professional = pd.read_csv(
    '전문가라벨링데이터_중학교.csv',
    encoding='utf-8',
)
middle_text = pd.read_csv(
    '상담기록데이터_중학교.csv',
    encoding='utf-8',
)

In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
middle_temp = middle_professional.merge(
    middle_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
merge_middle_data = middle_temp.merge(middle_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(merge_middle_data.iloc[:2])
print(merge_middle_data.shape)

In [None]:
# Columns removed from the merged middle-school frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
merge_middle_data = merge_middle_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the middle-school frame after the column drop:
# first two rows, then (rows, columns).
display(merge_middle_data.iloc[:2])
print(merge_middle_data.shape)

## 고등학교 merge

In [None]:
import pandas as pd

# Load the three high-school CSV files (file names: student base info,
# expert labelling, counselling records), all read as UTF-8.
high_student = pd.read_csv(
    '학생기초정보데이터_고등학교.csv',
    encoding='utf-8',
)
high_professional = pd.read_csv(
    '전문가라벨링데이터_고등학교.csv',
    encoding='utf-8',
)
high_text = pd.read_csv(
    '상담기록데이터_고등학교.csv',
    encoding='utf-8',
)

In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
high_temp = high_professional.merge(
    high_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
merge_high_data = high_temp.merge(high_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(merge_high_data.iloc[:2])
print(merge_high_data.shape)

In [None]:
# Columns removed from the merged high-school frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
merge_high_data = merge_high_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the high-school frame after the column drop:
# first two rows, then (rows, columns).
display(merge_high_data.iloc[:2])
print(merge_high_data.shape)

In [None]:
# Print (rows, columns) for the elementary, middle and high school frames, in that order.
for merged_frame in (merge_element_data, merge_middle_data, merge_high_data):
    print(merged_frame.shape)

In [None]:
# Stack the three school-level frames row-wise; ignore_index=True renumbers
# the rows of the result from 0.
school_level_frames = [merge_element_data, merge_middle_data, merge_high_data]
merged_all_data = pd.concat(school_level_frames, ignore_index=True)

# Size of the combined frame (the original notebook noted (5200, 14) here).
print(merged_all_data.shape)

# Render the combined frame.
display(merged_all_data)

In [None]:
# Persist the combined frame without the row index. 'utf-8-sig' prepends a
# BOM (presumably so spreadsheet tools detect the Korean text as UTF-8).
merged_all_data.to_csv(
    'merge_모든데이터.csv',
    encoding='utf-8-sig',
    index=False,
)

## 데이터 전처리

In [None]:
# Count missing values per column (isna is the same check as isnull).
merged_all_data.isna().sum()

In [None]:
# Keep only the conversation text and the three priority labels.
# .copy() makes `data` an independent frame, so the column assignment below
# does not write into a slice of merged_all_data (SettingWithCopyWarning).
data = merged_all_data[['combined_conversation', 'priority1', 'priority2', 'priority3']].copy()

# Text preprocessing: strip every character that is neither a word character
# nor whitespace. regex=True is required: since pandas 2.0, Series.str.replace
# treats the pattern as a literal string by default, which would leave the
# text unchanged. The evaluation cell later in this notebook already passes
# regex=True, so this keeps training and test text cleaned the same way.
# The raw string avoids invalid-escape warnings for \w and \s.
data['combined_conversation'] = data['combined_conversation'].str.replace(r'[^\w\s]', '', regex=True)

# Target columns.
target_columns = ['priority1', 'priority2', 'priority3']

## 텍스트 벡터화(KoNLPy의 Okt 형태소 분석기)

In [None]:
!pip install konlpy

In [None]:
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer

# One shared analyzer instance, reused by every tokenizer call.
okt = Okt()


def okt_tokenizer(text):
    """Return the result of ``okt.morphs(text)`` (morpheme-level tokens)."""
    morphemes = okt.morphs(text)
    return morphemes


# TF-IDF vectorization: tokenize with Okt and cap the vocabulary at 10000
# features, then fit on the cleaned conversation text.
vectorizer = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    max_features=10000,
)
X_tfidf = vectorizer.fit_transform(data['combined_conversation'])


## 데이터 분할 및 학습

In [None]:
# Class distribution of the priority1 label.
priority1_labels = data['priority1']
priority1_labels.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified on priority1 — confirm that is
# acceptable given the class counts.
X = X_tfidf
y = data['priority1']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression with default hyperparameters (fit returns the estimator).
# NOTE(review): if a ConvergenceWarning appears, consider raising max_iter.
model = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("Classification Report for priority1:")
print(classification_report(y_test, y_pred))


## --------------TEST--------------

## 초등학교 merge

In [None]:
import pandas as pd

# Load the three elementary-school test CSV files (file names: student base
# info, expert labelling, counselling records), all read as UTF-8.
test_element_student = pd.read_csv('test_학생기초정보데이터_초등학교.csv', encoding='utf-8')
test_element_professional = pd.read_csv('test_전문가라벨링데이터_초등학교.csv', encoding='utf-8')
test_element_text = pd.read_csv('test_상담기록데이터_초등학교.csv', encoding='utf-8')

# Preview the first rows of each frame: students, expert labels, then records.
for preview_frame in (test_element_student, test_element_professional, test_element_text):
    display(preview_frame.head())


In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
test_element_temp = test_element_professional.merge(
    test_element_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
test_merge_element_data = test_element_temp.merge(test_element_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(test_merge_element_data.iloc[:2])
print(test_merge_element_data.shape)

In [None]:
# Columns removed from the merged elementary-school test frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
test_merge_element_data = test_merge_element_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the elementary-school test frame after the column drop:
# first two rows, then (rows, columns).
display(test_merge_element_data.iloc[:2])
print(test_merge_element_data.shape)

## 중학교 merge

In [None]:
import pandas as pd

# Load the three middle-school test CSV files (file names: student base info,
# expert labelling, counselling records), all read as UTF-8.
test_middle_student = pd.read_csv(
    'test_학생기초정보데이터_중학교.csv',
    encoding='utf-8',
)
test_middle_professional = pd.read_csv(
    'test_전문가라벨링데이터_중학교.csv',
    encoding='utf-8',
)
test_middle_text = pd.read_csv(
    'test_상담기록데이터_중학교.csv',
    encoding='utf-8',
)

In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
test_middle_temp = test_middle_professional.merge(
    test_middle_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
test_merge_middle_data = test_middle_temp.merge(test_middle_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(test_merge_middle_data.iloc[:2])
print(test_merge_middle_data.shape)

In [None]:
# Columns removed from the merged middle-school test frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
test_merge_middle_data = test_merge_middle_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the middle-school test frame after the column drop:
# first two rows, then (rows, columns).
display(test_merge_middle_data.iloc[:2])
print(test_merge_middle_data.shape)

## 고등학교 merge

In [None]:
import pandas as pd

# Load the three high-school test CSV files (file names: student base info,
# expert labelling, counselling records), all read as UTF-8.
test_high_student = pd.read_csv(
    'test_학생기초정보데이터_고등학교.csv',
    encoding='utf-8',
)
test_high_professional = pd.read_csv(
    'test_전문가라벨링데이터_고등학교.csv',
    encoding='utf-8',
)
test_high_text = pd.read_csv(
    'test_상담기록데이터_고등학교.csv',
    encoding='utf-8',
)

In [None]:
# Attach student base info to the expert labels: the label table's
# 'student_idx' is matched against the student table's 'index' column
# (pandas' default inner join).
test_high_temp = test_high_professional.merge(
    test_high_student,
    left_on='student_idx',
    right_on='index',
)
# Join the counselling records on the shared 'student_idx' key.
test_merge_high_data = test_high_temp.merge(test_high_text, on='student_idx')

# Show every column when rendering frames, then preview the merged result.
pd.set_option('display.max_columns', None)
display(test_merge_high_data.iloc[:2])
print(test_merge_high_data.shape)

In [None]:
# Columns removed from the merged high-school test frame.
# The *_x / *_y names are pandas' default merge suffixes for a column that
# existed on both sides of a merge.
# NOTE(review): 'counseling_idx_1' is spelled with a single 'l', unlike the
# other counselling_* names; drop() raises KeyError for a missing label, so
# confirm this matches the CSV header.
drop_cols = [
    'job_label',
    'expert_comment_ko',
    'index',
    'school_type',
    'region',
    'gender',
    'grade',
    'counselling_purpose_x',
    'counseling_idx_1',
    'counselling_purpose_y',
    'counselling_satisfaction',
    'counselling_date',
]
test_merge_high_data = test_merge_high_data.drop(columns=drop_cols)

In [None]:
# Re-inspect the high-school test frame after the column drop:
# first two rows, then (rows, columns).
display(test_merge_high_data.iloc[:2])
print(test_merge_high_data.shape)

In [None]:
# Print (rows, columns) for the elementary, middle and high school test frames, in that order.
for merged_frame in (test_merge_element_data, test_merge_middle_data, test_merge_high_data):
    print(merged_frame.shape)

In [None]:
# Stack the three school-level test frames row-wise; ignore_index=True
# renumbers the rows of the result from 0.
test_school_level_frames = [test_merge_element_data, test_merge_middle_data, test_merge_high_data]
test_merged_all_data = pd.concat(test_school_level_frames, ignore_index=True)

# Size of the combined test frame.
# NOTE(review): the original comment here read (5200, 14), identical to the
# training cell — possibly copied over; verify against the actual output.
print(test_merged_all_data.shape)

# Render the combined test frame.
display(test_merged_all_data)

In [None]:
# Persist the combined test frame without the row index. 'utf-8-sig' prepends
# a BOM (presumably so spreadsheet tools detect the Korean text as UTF-8).
test_merged_all_data.to_csv(
    'test_merge_모든데이터.csv',
    encoding='utf-8-sig',
    index=False,
)

## test 데이터 예측 및 평가

In [None]:
# Select only the columns needed for evaluation.
# .copy() makes `test_data` an independent frame, so the column assignment
# below does not write into a slice of test_merged_all_data
# (SettingWithCopyWarning).
test_data = test_merged_all_data[['combined_conversation', 'priority1']].copy()

# Text preprocessing: strip every character that is neither a word character
# nor whitespace. The raw string avoids invalid-escape warnings for \w and \s.
test_data['combined_conversation'] = test_data['combined_conversation'].str.replace(r'[^\w\s]', '', regex=True)

# TF-IDF vectorization with the already-fitted vectorizer (transform only,
# no refit, so the vocabulary stays the one seen during training).
X_test_tfidf = vectorizer.transform(test_data['combined_conversation'])

# True labels and model predictions for the test set.
y_test_true = test_data['priority1']
y_test_pred = model.predict(X_test_tfidf)

# Evaluation.
print("Classification Report for priority1 on actual test data:")
print(classification_report(y_test_true, y_test_pred))