In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the six raw CSV inputs (file names are resolved relative to the
# notebook's working directory). The order of the names on the left matches
# the order of the file names on the right.
(
    birth_data,
    fertile_women_data,
    education_cost_data,
    income_data,
    housing_price_data,
    marriage_data,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        '시군구_출생아수_full.csv',
        '가임 여성수_full_2.csv',
        '사교육비 총액_full.csv',
        'feat_sal.csv',
        '집값(평균)_full.csv',
        '혼인건수_full.csv',
    )
)

# Preview the first rows of every frame to see how each file is laid out.
tuple(
    frame.head()
    for frame in (
        birth_data,
        fertile_women_data,
        education_cost_data,
        income_data,
        housing_price_data,
        marriage_data,
    )
)


In [None]:
# Long-format frames in the order they are compared. The labels are the
# building blocks of the "<left> vs <right>" keys used in both reports.
long_frames_by_label = {
    "Birth": birth_long,
    "Fertile Women": fertile_women_long,
    "Education Cost": education_cost_long,
    "Income": income_long,
    "Housing Price": housing_price_long,
    "Marriage": marriage_long,
}


def _distinct_values(column):
    """Return ``{label: set of values in `column`}`` for every long-format frame."""
    return {
        label: set(frame[column])
        for label, frame in long_frames_by_label.items()
    }


def _adjacent_differences(values_by_label):
    """Map "<left> vs <right>" to the values present in left but absent from right.

    Only neighbouring datasets (in dictionary order) are compared, and the
    difference is one-directional: values that exist only in the right-hand
    dataset are not reported.
    """
    labels = list(values_by_label)
    return {
        f"{left} vs {right}": values_by_label[left] - values_by_label[right]
        for left, right in zip(labels, labels[1:])
    }


# Distinct AREA values per dataset, plus the pairwise report.
area_sets = _distinct_values("AREA")
(
    birth_areas,
    fertile_women_areas,
    education_cost_areas,
    income_areas,
    housing_price_areas,
    marriage_areas,
) = area_sets.values()
area_mismatches = _adjacent_differences(area_sets)

# Distinct Year values per dataset, plus the pairwise report.
year_sets = _distinct_values("Year")
(
    birth_years,
    fertile_women_years,
    education_cost_years,
    income_years,
    housing_price_years,
    marriage_years,
) = year_sets.values()
year_mismatches = _adjacent_differences(year_sets)

area_mismatches, year_mismatches


In [None]:
# Ensuring Year is uniformly integer in all datasets
def ensure_year_as_int(df, year_column):
    """
    Return a copy of ``df`` whose ``year_column`` has been cast to integer.

    The conversion is done on a copy so the frame passed in is left
    untouched; re-running a cell that calls this function therefore never
    changes data that an earlier cell already displayed.

    Args:
        df: DataFrame containing ``year_column``.
        year_column: Name of the column holding the year values.

    Returns:
        A new DataFrame with the same columns (in the same order) as ``df``,
        where ``year_column`` has been converted with ``astype(int)``.

    Raises:
        KeyError: If ``year_column`` is not a column of ``df``.
        ValueError: If a value in the column cannot be converted to int
            (for example a missing value or a non-numeric string).
    """
    converted = df.copy()
    converted[year_column] = converted[year_column].astype(int)
    return converted

# Cast the Year key to int in every long-format frame so the merge keys
# below have the same dtype in all datasets.
birth_long = ensure_year_as_int(birth_long, "Year")
fertile_women_long = ensure_year_as_int(fertile_women_long, "Year")
education_cost_long = ensure_year_as_int(education_cost_long, "Year")
income_long = ensure_year_as_int(income_long, "Year")
housing_price_long = ensure_year_as_int(housing_price_long, "Year")
marriage_long = ensure_year_as_int(marriage_long, "Year")

# Inner-join all six frames on (AREA, Year): only area/year pairs that are
# present in every dataset survive.
key_columns = ["AREA", "Year"]
merged_data = birth_long
for other_long in (
    fertile_women_long,
    education_cost_long,
    income_long,
    housing_price_long,
    marriage_long,
):
    merged_data = merged_data.merge(other_long, on=key_columns, how="inner")

# Select the columns to scale by name rather than by position, so the result
# does not depend on AREA and Year being the first two columns.
value_columns = [col for col in merged_data.columns if col not in key_columns]

# Instantiate the scaler here so this cell runs on a fresh kernel instead of
# relying on a `scaler` variable left over from some other cell.
# NOTE(review): the scaler is fit on every merged row, before the train/test
# split performed later in the notebook, and it scales all non-key columns
# (the regression target included) — confirm this is intended.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merged_data[value_columns])

# Rebuild a DataFrame from the scaled array and put the key columns back in
# front. The index is copied from `merged_data` so the inserts align row by row.
scaled_data = pd.DataFrame(
    scaled_features, columns=value_columns, index=merged_data.index
)
scaled_data.insert(0, "AREA", merged_data["AREA"])
scaled_data.insert(1, "Year", merged_data["Year"])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predictors are every scaled column except the two keys and the target.
non_feature_columns = ["AREA", "Year", "Births"]
X = scaled_data.drop(columns=non_feature_columns)
y = scaled_data["Births"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit an ordinary least-squares model on the training rows
# (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Show both metrics as the cell output.
mse, r2


In [None]:
import matplotlib.pyplot as plt

# Scatter of held-out targets against model predictions, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.7, edgecolors='k')

# Dashed y = x reference line spanning the range of the true values; points
# on this line are perfect predictions.
lower, upper = y_test.min(), y_test.max()
ax.plot([lower, upper], [lower, upper], color='red', linestyle='--', linewidth=2)

ax.set_title("True vs Predicted Births")
ax.set_xlabel("True Births")
ax.set_ylabel("Predicted Births")
ax.grid(alpha=0.5)
plt.show()

In [None]:
# `display` renders the DataFrame below as notebook output
from IPython.display import display

# Requires `model`, `X` and `scaled_data` from the cells above.

# Run the fitted model over every row of X.
predicted_births = model.predict(X)

# Attach the predictions as a new column on a fresh frame; `scaled_data`
# itself is left unchanged.
scaled_data_with_predictions = scaled_data.assign(Predicted_Births=predicted_births)

# Render the combined table as the cell output.
display(scaled_data_with_predictions)