In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Source CSV locations (relative to the notebook's working directory).
death_data_path, elderly_data_path, cancer_data_path = (
    '사망자수_ADD_full.csv',
    '노인 인구_full.csv',
    '암 등록환자현황_full.csv',
)

# Read the three files in the same order as their paths are listed.
death_data, elderly_data, cancer_data = (
    pd.read_csv(csv_path)
    for csv_path in (death_data_path, elderly_data_path, cancer_data_path)
)

# Show the first rows of each frame to get a feel for the layout.
tuple(frame.head() for frame in (death_data, elderly_data, cancer_data))

In [None]:
# Inspect the per-column dtypes of both feature tables to spot columns that
# were not parsed as numbers.
tuple(frame.dtypes for frame in (elderly_data, cancer_data))

In [None]:
# Remove thousands separators and convert the cancer value columns to float.
#
# The columns are cast to str first so this cell can be re-run safely: once the
# values are already numeric, the `.str` accessor would otherwise raise.
# The frame is rebuilt (instead of assigning through `iloc[:, 1:] = ...`)
# because in recent pandas versions (2.0+) that assignment sets values in
# place and leaves the columns with their original `object` dtype.
cancer_values = cancer_data.iloc[:, 1:].apply(
    lambda column: column.astype(str).str.replace(',', '', regex=False).astype(float)
)
cancer_data = pd.concat([cancer_data.iloc[:, :1], cancer_values], axis=1)

# Scale the data.
# Each table gets its own scaler so that both fitted transforms remain
# available afterwards (a single shared scaler would be overwritten by the
# second fit).
elderly_scaler = StandardScaler()
cancer_scaler = StandardScaler()
# `scaler` is kept for backward compatibility: as before, it refers to the
# scaler that was fitted last (on the cancer data).
scaler = cancer_scaler

# The first column is carried over untouched; every remaining column is
# standardized. New float frames are built rather than writing floats into the
# existing (possibly integer/object) columns.
elderly_scaled = pd.concat(
    [
        elderly_data.iloc[:, :1],
        pd.DataFrame(
            elderly_scaler.fit_transform(elderly_data.iloc[:, 1:]),
            columns=elderly_data.columns[1:],
            index=elderly_data.index,
        ),
    ],
    axis=1,
)

cancer_scaled = pd.concat(
    [
        cancer_data.iloc[:, :1],
        pd.DataFrame(
            cancer_scaler.fit_transform(cancer_data.iloc[:, 1:]),
            columns=cancer_data.columns[1:],
            index=cancer_data.index,
        ),
    ],
    axis=1,
)


In [None]:
# Look for AREA labels that appear in only one of the two tables.
independent_areas, dependent_areas = elderly_scaled["AREA"], death_data["AREA"]

# Labels present in the elderly table but absent from the death table ...
missing_in_dependent = set(independent_areas).difference(dependent_areas)
# ... and the other way round.
missing_in_independent = set(dependent_areas).difference(independent_areas)

(missing_in_dependent, missing_in_independent)


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Build the inputs for the Ridge regression model.
# Features: every column except the first of the scaled elderly and cancer
# tables, placed side by side (rows are matched by index).
independent_data = pd.concat(
    [scaled_frame.iloc[:, 1:] for scaled_frame in (elderly_scaled, cancer_scaled)],
    axis=1,
)
# Targets: every column except the first of the death table, with commas
# stripped and values parsed as float.
dependent_data = death_data.iloc[:, 1:].apply(
    lambda column: column.astype(str).str.replace(',', '').astype(float)
)

In [None]:
# Record (rows, columns) of the feature and target frames so they can be compared.
independent_data_shape, dependent_data_shape = (
    independent_data.shape,
    dependent_data.shape,
)

(independent_data_shape, dependent_data_shape)

In [None]:
# Show the row index of the feature frame and of the target frame to see
# whether they line up.
tuple(frame.index for frame in (independent_data, dependent_data))

In [None]:
# Re-check which AREA labels are exclusive to one of the two tables.
independent_areas = elderly_scaled.loc[:, "AREA"]
dependent_areas = death_data.loc[:, "AREA"]

# First result: labels only in the elderly table; second: only in the death table.
missing_in_dependent, missing_in_independent = (
    set(left_areas) - set(right_areas)
    for left_areas, right_areas in (
        (independent_areas, dependent_areas),
        (dependent_areas, independent_areas),
    )
)

(missing_in_dependent, missing_in_independent)

In [None]:
# Give both frames a fresh 0..n-1 row index so they can be compared by position.
independent_data = independent_data.reset_index(drop=True)
dependent_data = dependent_data.reset_index(drop=True)

# Side-by-side concat keeping only the row labels that exist in both frames.
aligned_check = pd.concat(objs=[independent_data, dependent_data], join="inner", axis=1)

# Shapes of the combined frame and of each input frame.
tuple(frame.shape for frame in (aligned_check, independent_data, dependent_data))

In [None]:
# Align the feature rows with the target rows by 'AREA'.
#
# `independent_data` was assembled from `iloc[:, 1:]` slices, so it no longer
# carries the AREA column. Its leading rows are assumed to still follow the
# row order of `elderly_scaled` (default positional index from `read_csv`), so
# the labels are recovered from there. Filtering on the positional index alone
# would only truncate the frame and could silently pair an area's features
# with another area's death counts.
feature_areas = elderly_scaled["AREA"].to_numpy()
target_areas = death_data["AREA"].to_numpy()

# Label each feature row with its AREA (rows beyond the elderly table, if any,
# have no AREA label and are dropped).
features_by_area = independent_data.iloc[: len(feature_areas)].copy()
features_by_area.index = feature_areas[: len(features_by_area)]

if not features_by_area.index.is_unique:
    raise ValueError(
        "AREA labels in the elderly data are not unique; cannot align rows by AREA."
    )

# Fail loudly instead of training on rows that do not belong together.
unmatched_areas = [area for area in target_areas if area not in features_by_area.index]
if unmatched_areas:
    raise ValueError(
        f"AREA values present in death_data but missing from the features: {unmatched_areas}"
    )

# Reorder/select the feature rows so that row i matches row i of death_data.
aligned_independent_data = features_by_area.loc[target_areas].reset_index(drop=True)

# Verify the new shapes of the datasets
aligned_independent_data.shape, dependent_data.shape


In [None]:
# Convert the target columns to float by stripping thousands separators.
# Done column-wise with the `.str` accessor (same approach as when
# `dependent_data` was first built) instead of `DataFrame.applymap`, which is
# deprecated in recent pandas releases.
dependent_data = death_data.iloc[:, 1:].apply(
    lambda column: column.astype(str).str.replace(',', '', regex=False).astype(float)
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the features were standardized on all rows before this split,
# so the held-out rows influenced the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    aligned_independent_data, dependent_data, test_size=0.2, random_state=42
)

# Fit the Ridge regression on the training rows.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = ridge_model.predict(X_test)

# Evaluate the model: mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)

mse

In [None]:
# R^2 score of the fitted Ridge model on the held-out rows, as reported by the
# estimator's own `score` method.
r2_score = ridge_model.score(X=X_test, y=y_test)

r2_score

In [None]:
# Predict the death counts for every row of the aligned feature set.
# Note that this includes the rows the model was trained on.
predicted_death_population = ridge_model.predict(aligned_independent_data)

# Wrap the predictions in a DataFrame that reuses the target column names of
# `death_data` and is labelled by its AREA column.
predicted_death_df = pd.DataFrame(
    predicted_death_population,
    columns=death_data.columns[1:],
    index=death_data["AREA"],
)

# Bare last expression so the notebook renders the frame as a table instead
# of plain printed text.
predicted_death_df

In [None]:
# Display the death data frame as loaded from the CSV
# (presumably for comparison with the predictions above).
death_data