In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import re
from datetime import datetime
from dateutil import parser

def parse_num_of_people(s):
    # '내국인 :0명외국인 :0명 내국인 :0명 외국인 :0명'와 같은 문자열을 처리하기 위해
    # 정규표현식을 이용하여 '내국인 :숫자'와 '외국인 :숫자'를 추출합니다.
    regex = r'(내국인|외국인)\s*:\s*(\d+)명'
    matches = re.findall(regex, s)
    
    # 내국인과 외국인의 총 인원 수를 계산합니다.
    num_internal = 0
    num_foreign = 0
    for match in matches:
        if match[0] == '내국인':
            num_internal += int(match[1])
        elif match[0] == '외국인':
            num_foreign += int(match[1])
    result = (num_internal + num_foreign) / 2
    return result

# 발생 일시에서 월만 뽑아오는 함수
def extract_month(s):
    return int(s.split('-')[1])

# 시설물 종류 대분류 추출 함수
def extract_facility(s):
    return s.split('-')[0]

def extract_ratio(percent_str):
    # 문자열에서 % 값 추출
    percent = re.findall(r'\d+\.?\d*%', percent_str)[0]

    # % 값을 비율 값으로 변환
    if '~' in percent:
        print(percent.split('~')[1].replace('%', ''))
        # 30~39% 와 같은 경우
        ratio = float(percent.split('~')[1].replace('%', '')) / 100
    else:
        # 10% 미만 과 같은 경우
        ratio = float(percent.replace('%', '')) / 100

    return ratio

def extract_population(population_str):
    # 문자열에서 사람 값 추출
    population = re.findall(r'\d+\.?\d*인', population_str)[0]

    if '~' in population:
        # 30~39인 과 같은 경우
        num = int(population.split('~')[1].replace('인', ''))
    else:
        # 10인 미만 과 같은 경우
        num = int(population.replace('인', ''))

    return num


# 날짜 수 세기 함수
def count_days(date_string):
    date_str = date_string.split('(해당공종')[0].strip()

    start_date = date_str.split(' ~ ')[0]
    end_date = date_str.split(' ~ ')[-1]
    start_date = parser.parse(start_date)
    end_date = parser.parse(end_date)

    return (end_date - start_date).days



#CSV 파일을 DataFrame으로 읽어오기


df = pd.read_csv('output.csv')
#필요 없는 특성 제거

allColumns = df.columns
params = ['발생일시','공공/민간 구분', '기상상태', '시설물 종류', '사망자수(명)', '부상자수(명)', '피해금액', '공사비', '공사기간', '공정률', '작업자수', '설계안전성검토']

df = df.drop(allColumns.drop(params), axis=1)

#범주형 데이터를 수치형 데이터로 인코딩

df['사망자수(명)'] = df['사망자수(명)'].apply(parse_num_of_people)
df['부상자수(명)'] = df['부상자수(명)'].apply(parse_num_of_people)
df['발생일시'] = df['발생일시'].apply(extract_month)
df['시설물 종류'] = df['시설물 종류'].apply(extract_facility)
df['공사기간'] = df['공사기간'].apply(count_days)
df['공정률'] = df['공정률'].apply(extract_ratio)
df['작업자수'] = df['작업자수'].apply(extract_population)

# 기상 상태 컬럼 분리
df[['날씨', '기온', '습도']] = df['기상상태'].str.extract('날씨 : (\S+)기온 : (\d+)℃습도 : (\d+)%')
df = df.drop(['기상상태'], axis=1)

# print(df.head())

le = LabelEncoder()
df['공공/민간 구분'] = le.fit_transform(df['공공/민간 구분'])
df['설계안전성검토'] = le.fit_transform(df['설계안전성검토'])
# df['시설물 종류'] = le.fit_transform(df['시설물 종류'])
# df['공종'] = le.fit_transform(df['공종'])
# df['사고객체'] = le.fit_transform(df['사고객체'])
# df['공사종류'] = le.fit_transform(df['공사종류'])

#피해 금액 특성을 원-핫 인코딩

ct = ColumnTransformer([
('ohe', OneHotEncoder(), ['피해금액', '공정률'])
], remainder='passthrough')
X = ct.fit_transform(df)


#컬럼 이름 리스트 생성

num_cols = df.columns.tolist()
ohe = ct.named_transformers_['ohe']
ohe_cols = ohe.get_feature_names_out(['피해금액', '공정률']).tolist()
new_cols = ohe_cols + num_cols
new_cols.remove('공정률')
#DataFrame으로 변환

df = pd.DataFrame(X, columns=new_cols)

X = df.drop(['사망자수(명)'], axis=1)
y = df[['사망자수(명)']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(X_test.shape)

TypeError: expected string or bytes-like object

In [119]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# LightGBM 데이터셋으로 변환
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# LightGBM 모델 설정
params = {
    'objective': 'regression',
    'metric': {'rmse', 'mae'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': 6,
    'min_child_weight': 0.1,
    'verbosity': -1
}

# 모델 학습
num_round = 100
bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_eval], early_stopping_rounds=10)

# 모델 예측
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# 예측값과 실제값의 MSE 계산
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)




[1]	training's l1: 26.3878	training's rmse: 29.702	valid_1's l1: 26.5247	valid_1's rmse: 29.8141
Training until validation scores don't improve for 10 rounds
[2]	training's l1: 26.2646	training's rmse: 29.5451	valid_1's l1: 26.4269	valid_1's rmse: 29.6865
[3]	training's l1: 26.1467	training's rmse: 29.4033	valid_1's l1: 26.3373	valid_1's rmse: 29.5781
[4]	training's l1: 26.0345	training's rmse: 29.2737	valid_1's l1: 26.2545	valid_1's rmse: 29.4835
[5]	training's l1: 25.9287	training's rmse: 29.1556	valid_1's l1: 26.1803	valid_1's rmse: 29.399
[6]	training's l1: 25.8277	training's rmse: 29.0503	valid_1's l1: 26.0916	valid_1's rmse: 29.3088
[7]	training's l1: 25.7289	training's rmse: 28.9513	valid_1's l1: 26.0202	valid_1's rmse: 29.2393
[8]	training's l1: 25.6331	training's rmse: 28.8608	valid_1's l1: 25.9525	valid_1's rmse: 29.1805
[9]	training's l1: 25.546	training's rmse: 28.7804	valid_1's l1: 25.8931	valid_1's rmse: 29.131
[10]	training's l1: 25.4623	training's rmse: 28.7043	valid_1'