In [None]:
import pandas as pd
import numpy as np

## 열 순서 정렬

In [None]:
# Load the source CSV from the working directory; the bare name on the
# last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="arcr_goutallier.csv")
df

In [None]:
# Plain Python list of the column labels, so it can be sliced and concatenated with `+`.
columns = df.columns.tolist()

In [None]:
# The leading 16 columns keep their original order.
static_columns = columns[0:16]
static_columns

In [None]:
# Everything between the 16 leading and the 8 trailing columns, sorted by label.
seq_columns = sorted(columns[16:-8])

In [None]:
# Print the sorted labels in five consecutive chunks; a stop of None means "to the end".
for start, stop in ((0, 12), (12, 19), (19, 31), (31, 43), (43, None)):
    print(seq_columns[start:stop])

In [None]:
# The trailing 8 columns of the original layout.
# NOTE(review): the name suggests these hold the Goutallier values — confirm against the CSV header.
goutallier_columns = columns[-8:]
goutallier_columns

In [None]:
# Final column layout: leading block, then the sorted middle block, then the trailing block.
target_columns = [*static_columns, *seq_columns, *goutallier_columns]
ordered_df = df.loc[:, target_columns]
ordered_df.columns = target_columns
ordered_df

## NaN 포함 행 제거

In [None]:
# Rebind `df` to the column-reordered frame for the cells of this section.
df = ordered_df

In [None]:
# Column labels of the reordered frame (kept as a pandas Index here, not a list).
columns = df.columns

In [None]:
# Columns in which NaN is acceptable: positions 10-12 plus the last 8 columns.
nan_columns = [*columns[10:13], *columns[-8:]]
nan_columns

In [None]:
# Columns that must not contain NaN: every column not listed in `nan_columns`.
# Built as a list in the frame's column order rather than a set: a set has no
# defined order and pandas does not accept sets as column indexers, whereas a
# list works both for `dropna(subset=...)` and for `df[...]` selection.
not_nan_columns = [column for column in columns if column not in nan_columns]

In [None]:
# Discard every row that has a NaN in any of the columns that must be complete.
dropped_df = ordered_df.dropna(axis="index", how="any", subset=not_nan_columns)
dropped_df

## 타입 불일치 값을 NaN으로 변경

In [None]:
# Rebind `df` to the frame with incomplete rows removed for the cells of this section.
df = dropped_df

In [None]:
# Column labels as a plain list so the slices below can be joined with list operations.
columns = df.columns.tolist()

In [None]:
# Columns left as object dtype: positions 0-2 and 10-11.
object_columns = [*columns[:3], *columns[10:12]]
object_columns

In [None]:
# Columns meant to hold integers: positions 3-7 and 12.
integer_columns = [*columns[3:8], *columns[12:13]]
integer_columns

In [None]:
# Columns meant to hold floats: positions 8-9 and everything from 13 onward.
float_columns = [*columns[8:10], *columns[13:]]
float_columns

In [None]:
# Sanity check: the three dtype groups together contain as many labels as the frame has columns.
len(columns) == sum(len(group) for group in (object_columns, integer_columns, float_columns))

### Integer Columns

In [None]:
# Work on an independent copy so `df` keeps the un-cast values for inspection.
casted_df = df.copy(deep=True)

In [None]:
# Dtypes and non-null counts of the integer columns before casting.
df.loc[:, integer_columns].info()

In [None]:
# Original note: the values are recorded as floats 1.0 and 2.0.
df["Rt:1,Lt:2"].value_counts()

In [None]:
# Preview of the int8 cast (nothing is assigned in this cell).
pd.to_numeric(df["Rt:1,Lt:2"]).astype("int8")

In [None]:
# Store the int8 version of the column in the working copy.
casted_df["Rt:1,Lt:2"] = pd.to_numeric(df["Rt:1,Lt:2"]).astype("int8")

In [None]:
# Original note: the values are recorded as strings "0" and "1".
# Count the Python type of each cell to see which types occur in the column.
df["POD 2M retear (no:0,retear:1)"].map(type).value_counts()

In [None]:
# Frequency of each distinct value in the column.
df["POD 2M retear (no:0,retear:1)"].value_counts()

In [None]:
# Preview of the int8 cast (nothing is assigned in this cell).
pd.to_numeric(df["POD 2M retear (no:0,retear:1)"]).astype("int8")

In [None]:
# Store the int8 version of the column in the working copy.
casted_df["POD 2M retear (no:0,retear:1)"] = (
    pd.to_numeric(df["POD 2M retear (no:0,retear:1)"]).astype("int8")
)

In [None]:
# Original note: the values are recorded as floats 0.0 and 1.0.
df["POD 6M retear"].value_counts()

In [None]:
# Preview of the int8 cast (nothing is assigned in this cell).
pd.to_numeric(df["POD 6M retear"]).astype("int8")

In [None]:
# Store the int8 version of the column in the working copy.
casted_df["POD 6M retear"] = pd.to_numeric(df["POD 6M retear"]).astype("int8")

In [None]:
# Original note says the values are recorded as floats 0.0/1.0.
# NOTE(review): the column name itself codes non-smoker:1, smoker:2 — verify which is right.
# `mask` selects the rows where this column is present; only those rows are counted.
mask = df["흡연여부 (비흡연:1,흡연:2)"].notna()
df.loc[mask, "흡연여부 (비흡연:1,흡연:2)"].value_counts()

In [None]:
# Preview of the int8 cast on the rows where the value is present (nothing is assigned in this cell).
pd.to_numeric(df.loc[mask, "흡연여부 (비흡연:1,흡연:2)"]).astype("int8")

In [None]:
# This column may contain NaN, so it cannot be stored as plain int8. Writing
# int8 values through `.loc[mask, ...]` only replaces the masked cells; the
# column still holds NaN in the other rows, so its dtype stays as it was and
# the cast has no effect. Casting the whole column to pandas' nullable "Int8"
# dtype gives real integers and keeps the missing entries as <NA>.
casted_df["흡연여부 (비흡연:1,흡연:2)"] = (
    pd.to_numeric(df["흡연여부 (비흡연:1,흡연:2)"]).astype("Int8")
)

In [None]:
# Dtypes and non-null counts of the integer columns after casting.
casted_df.loc[:, integer_columns].info()

### Float Columns

In [None]:
# Dtypes and non-null counts of the float columns before coercion.
df.loc[:, float_columns].info()

In [None]:
# Preview: convert each float column to numeric, turning unparseable entries into NaN,
# and show the resulting dtypes and non-null counts (nothing is assigned in this cell).
df[float_columns].apply(lambda series: pd.to_numeric(series, errors="coerce")).info()

In [None]:
# Store the numeric version of every float column; entries that cannot be parsed become NaN.
casted_df[float_columns] = df[float_columns].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)

In [None]:
# Coercion may have introduced new NaNs, so drop again any row that is
# incomplete in a column that must not contain NaN.
dropped_casted_df = casted_df.dropna(axis="index", how="any", subset=not_nan_columns)
dropped_casted_df

## 저장

In [None]:
# Write the cleaned frame to the working directory without the row index.
dropped_casted_df.to_csv(path_or_buf="arcr_cleaned.csv", index=False)