In [1]:
# import libraries
import pandas as pd
import researchpy as rp
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

Data source: e-Stat, the Portal Site of Official Statistics of Japan (https://www.e-stat.go.jp/) - System of Social and Demographic Statistics (municipality data).

In [2]:
# data description
#data = pd.read_csv("/Users/Chloe/Desktop/Uni (2023-2024)/UN Datathon 2023/Processed Data/datav2.csv")
#data.info()  # data information
#rp.codebook(data)  # descriptive statistics for each variable

In [3]:
# collecting inputs from the user
def input_ivs():
    """
    Interactively collect the column names of the independent variables (IVs).

    First asks how many IVs there are, asking again until a non-negative whole
    number is entered, then asks for each column name in turn.

    Returns:
        list of str - the entered column names, in the order they were given,
        with leading/trailing whitespace removed.
    """
    # Keep asking instead of letting int() raise: a typo here would otherwise
    # abort the whole cell and force the user to start over.
    while True:
        how_many = input("How many IVs will you have? ")
        try:
            count = int(how_many)
        except ValueError:
            count = -1
        if count >= 0:
            break
        print("Please enter a non-negative whole number.")

    input_list = []
    for i in range(count):
        iv = input(f"What is the column name for your IV{i + 1}? ")
        # Strip stray spaces so the name matches the DataFrame column exactly.
        input_list.append(iv.strip())
    return input_list
# Runs when the cell executes and blocks until the user has answered every prompt.
# The functions defined below use this list as the default value of their
# `iv_list` parameter; defaults are evaluated when each `def` runs, so this
# assignment must come before those definitions.
iv_list = input_ivs()

# fit one linear regression model per city
def marginal_village_model(data_path, year_column, city_column, dv, iv_list = iv_list):
    """
    Fit a separate linear regression of the DV on the IVs for every city in the data.

    Before fitting, the Pearson correlation between each IV and the DV is computed
    over the whole dataset, and a warning is printed for each IV whose absolute
    correlation is below 0.3.

    data_path (str) - local path to the data file
    year_column (str) - column within the data for year (kept for interface
        compatibility; this function does not read it - include the year column
        in iv_list to use it as a predictor)
    city_column (str) - column within the data for city
    dv (str) - column within the data for DV
    iv_list (list) - list of strings with the name of columns for IV (from input_ivs function)

    Returns:
        dict mapping each city to [[intercept], [coefficients], [fitted model]],
        where the coefficients are in the same order as iv_list.
    """
    data = pd.read_csv(data_path)

    weak_correlation_threshold = 0.3
    for iv in iv_list:
        statistics, _ = stats.pearsonr(data[iv], data[dv])
        # A weak relationship means |r| < 0.3. The earlier test
        # `r < 0.3 or r > -0.3` held for every possible r, so the warning
        # was printed for every IV regardless of its correlation.
        # NOTE(review): the message mentions "independence"; a weak IV-DV
        # correlation looks more like a linearity concern - confirm wording.
        if abs(statistics) < weak_correlation_threshold:
            print(f"{iv} and {dv} seems to not have strong linear relationship, which may violate the assumption of independence.")

    models_list = {}
    # One pass over the data instead of re-filtering the full frame per city.
    # sort=False keeps cities in order of first appearance; rows whose city is
    # missing (NaN) belong to no group and are therefore not modelled.
    for city, data_model in data.groupby(city_column, sort=False):
        linreg = LinearRegression().fit(data_model.loc[:, iv_list], data_model.loc[:, dv])
        models_list[city] = [[linreg.intercept_], list(linreg.coef_), [linreg]]
    return models_list

# print the weighting (fitted coefficient) of each IV
def weighting_marginal_village(city, models_list, iv_list = iv_list):
    """
    Print the fitted coefficient of every IV for one city, framed by separator lines.

    city (str) - city name; must be a key of models_list
    models_list (dict) - city -> [[intercept], [coefficients], [model]], as
        returned by marginal_village_model
    iv_list (list) - IV column names, in the same order as the stored coefficients
    """
    separator = "-----"
    print(separator)
    for position, iv_name in enumerate(iv_list):
        # Index 1 of a city's entry holds the coefficient list.
        weight = models_list[city][1][position]
        print(f"Weighting for {iv_name}: {weight}")
    print(separator)

# predict proportion of 65+ years old population
def predict_marginal_village(city, models_list, iv_list = iv_list):
    """
    Prompt for one value per IV and return the city's linear-model prediction.

    The prediction is the stored intercept plus the sum of each entered value
    multiplied by the corresponding stored coefficient.

    city (str) - city name; must be a key of models_list
    models_list - dictionary of city (key) and model (value). output from function marginal_village_model.
    iv_list (list) - IV column names, in the same order as the stored coefficients

    Raises:
        KeyError - if city is not in models_list (raised before any prompt)
        ValueError - if iv_list and the stored coefficients differ in length,
            or if an entered value cannot be converted to float
    """
    # Look the city up first so an unknown city fails before the user has
    # typed in every value.
    params = models_list[city]
    intercept = params[0][0]
    coefficients = params[1]
    if len(coefficients) != len(iv_list):
        raise ValueError(
            f"Model for {city} has {len(coefficients)} coefficients but {len(iv_list)} IVs were given."
        )

    iv_inputs = []
    for iv in iv_list:
        input_user = input(f"Please enter the {iv} for the {city} to predict (same unit as the dataset): ")
        iv_inputs.append(float(input_user))

    # Accumulate term by term, starting from the intercept.
    prediction = intercept
    for value, coefficient in zip(iv_inputs, coefficients):
        prediction += value * coefficient
    return prediction

How many IVs will you have? 3
What is the column name for your IV1? year
What is the column name for your IV2? education
What is the column name for your IV3? percEmpl


In [4]:
# Users need to edit the values below: the CSV path, the column names, and the city to inspect.
# Fit one regression per distinct value of the "mun" column, with "percOldPop" as the DV.
models_list = marginal_village_model(data_path = "./datav2.csv", year_column = "year", city_column = "mun", dv = "percOldPop")
# Print the fitted coefficient of each IV for the chosen city.
weighting_marginal_village("Yamanashi-ken Yamanashi-shi", models_list)
# Prompts for one value per IV; as the last expression in the cell, the returned prediction is displayed.
predict_marginal_village("Yamanashi-ken Yamanashi-shi", models_list)

year and percOldPop seems to not have strong linear relationship, which may violate the assumption of independence.
education and percOldPop seems to not have strong linear relationship, which may violate the assumption of independence.
percEmpl and percOldPop seems to not have strong linear relationship, which may violate the assumption of independence.
-----
Weighting for year: 0.6246566200677964
Weighting for education: 0.01739841217080179
Weighting for percEmpl: 0.3691937041877254
-----
Please enter the year for the Yamanashi-ken Yamanashi-shi to predict (same unit as the dataset): 2021
Please enter the education for the Yamanashi-ken Yamanashi-shi to predict (same unit as the dataset): 3
Please enter the percEmpl for the Yamanashi-ken Yamanashi-shi to predict (same unit as the dataset): 0.5


15.831508671391436