### 年収を平均化

In [2]:
import polars as pl
import numpy as np

### Cleaning df2

In [3]:
# Cleaning df2: cast the "平均年収" column to Int64 and persist the result.
file2 = "/Users/lex/CodeProjects/MyProject/Mitaron/共産平均年収.csv"
df2 = pl.read_csv(file2)

# The cast keeps the column name, so the column is replaced in place
# without needing an explicit alias.
df2 = df2.with_columns(pl.col("平均年収").cast(pl.Int64))

# Write back to the exact path that was read. A bare relative file name
# resolves against the kernel's working directory, so the cleaned copy could
# end up somewhere other than `file2` and never be picked up when `file2`
# is read again later.
# NOTE(review): this overwrites the source CSV with the cleaned version --
# confirm that is the intent, or keep a backup of the raw file.
df2.write_csv(file2)

### Merging & Labeling

In [None]:
# Sources
# Both paths are defined here so this cell does not rely on a variable left
# behind by another cell.
file1 = "/Users/lex/CodeProjects/MyProject/Mitaron/会社平均年収.csv"
file2 = "/Users/lex/CodeProjects/MyProject/Mitaron/共産平均年収.csv"

# Reading files lazily; nothing is loaded until `.collect()` is called.
df1 = pl.scan_csv(file1)
df2 = pl.scan_csv(file2)

# Merging both lazy frames into a single lazy frame.
# NOTE(review): assumes both CSVs share the same columns and dtypes -- confirm.
merged = pl.concat([df1, df2])

In [None]:
# Defining the edges
# 50-wide steps from the smallest salary up to (but excluding) 900, followed
# by hand-picked wider edges. The final 1e12 edge acts as a practical upper
# bound so that everything above 1250 falls into one interval.
min_salary = merged.select(pl.col("平均年収").min()).collect()[0, 0]
step_bins = np.arange(min_salary, 900, 50).tolist()
custom_bins = [900, 1000, 1250, 1e12]
bins = step_bins + custom_bins

print(f"Number of bins: {len(bins)}")
print(f"Bins: {bins}")

# Cut without explicit labels: each value is replaced by its interval,
# rendered as text such as "(450, 500]".
final = merged.with_columns(
    pl.col("平均年収")
    .cut(breaks=bins)
    .alias("平均年収")  # Overwrite the original column
).collect()

# Transforming the interval text into the final label.
# The order of the replacements matters: the generic "(a, b]" pattern also
# matches the top interval "(1250, <upper bound>]", so the specific rules
# must run before it. Otherwise the "1250+" label is never produced.
final = final.with_columns(
    pl.col("平均年収")
    .cast(pl.Utf8)
    # Open-ended top bin; 1250 must stay in sync with `custom_bins`.
    .str.replace(r"^\(1250, .*\]$", "1250+")
    # Intervals are right-closed, so rows equal to the first edge (the
    # minimum salary) land in "(-inf, min]". Give that bin a readable label
    # instead of leaking the raw interval text into the CSV.
    .str.replace(r"^\(-inf, (\d+)\]$", "<= $1")
    # Every remaining regular interval "(a, b]" becomes "a - b".
    .str.replace(r"^\((\d+), (\d+)\]$", "$1 - $2")
    .alias("平均年収")
)

# Write to CSV
final.write_csv("最終平均年収.csv")

Number of bins: 13
Bins: [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 1000, 1250, 1000000000000.0]
