In [2]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('population.csv')

print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 227
Number of columns: 10


In [4]:
missing_values = df.isnull().sum()
print(missing_values)

Районы                           0
Общее население                  0
мужчины                          0
женщины                          0
городское население            124
городское население мужчины    124
городское население женщины    124
сельское население              34
сельское население мужчины      34
сельское население женщины      34
dtype: int64


In [6]:
df_imputed = df.copy()

numeric_cols = ['Общее население', 'мужчины', 'женщины', 
                'городское население', 'городское население мужчины', 'городское население женщины',
                'сельское население', 'сельское население мужчины', 'сельское население женщины']

print("\nCleaning numeric columns...")
for col in numeric_cols:
    if df_imputed[col].dtype == 'object':
        df_imputed[col] = df_imputed[col].str.replace(',', '').str.replace(' ', '')
        df_imputed[col] = pd.to_numeric(df_imputed[col], errors='coerce')
        print(f"Converted '{col}' to numeric format")

print("\nHandling missing values...")

urban_cols = ['городское население', 'городское население мужчины', 'городское население женщины']
for col in urban_cols:
    missing_count = df_imputed[col].isnull().sum()
    if missing_count > 0:
        median_val = df_imputed[col].median()
        df_imputed[col].fillna(median_val, inplace=True)
        print(f"Filled {missing_count} missing values in '{col}' with median: {median_val:,.0f}")

rural_cols = ['сельское население', 'сельское население мужчины', 'сельское население женщины']
for col in rural_cols:
    missing_count = df_imputed[col].isnull().sum()
    if missing_count > 0:
        median_val = df_imputed[col].median()
        df_imputed[col].fillna(median_val, inplace=True)
        print(f"Filled {missing_count} missing values in '{col}' with median: {median_val:,.0f}")

remaining_missing = df_imputed.isnull().sum().sum()
print(f"\nMissing values after imputation: {remaining_missing}")

if remaining_missing > 0:
    print("Columns with remaining missing values:")
    print(df_imputed.isnull().sum()[df_imputed.isnull().sum() > 0])


Cleaning numeric columns...
Converted 'Общее население' to numeric format
Converted 'мужчины' to numeric format
Converted 'женщины' to numeric format
Converted 'городское население' to numeric format
Converted 'городское население мужчины' to numeric format
Converted 'городское население женщины' to numeric format
Converted 'сельское население' to numeric format
Converted 'сельское население мужчины' to numeric format
Converted 'сельское население женщины' to numeric format

Handling missing values...
Filled 124 missing values in 'городское население' with median: 52,200
Filled 124 missing values in 'городское население мужчины' with median: 26,449
Filled 124 missing values in 'городское население женщины' with median: 26,636
Filled 34 missing values in 'сельское население' with median: 22,875
Filled 34 missing values in 'сельское население мужчины' with median: 11,385
Filled 34 missing values in 'сельское население женщины' with median: 11,119

Missing values after imputation: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [8]:
df_imputed['Урбанизация (%)'] = (df_imputed['городское население'] / df_imputed['Общее население']) * 100
df_imputed['Доля мужчин в городе (%)'] = (df_imputed['городское население мужчины'] / df_imputed['городское население']) * 100
df_imputed['Доля женщин в городе (%)'] = (df_imputed['городское население женщины'] / df_imputed['городское население']) * 100
df_imputed['Доля мужчин в селе (%)'] = (df_imputed['сельское население мужчины'] / df_imputed['сельское население']) * 100
df_imputed['Доля женщин в селе (%)'] = (df_imputed['сельское население женщины'] / df_imputed['сельское население']) * 100

print("Created new derived features:")
print("- Урбанизация (%): Percentage of urban population")
print("- Доля мужчин/женщин в городе/селе: Gender distribution percentages")

df_imputed = df_imputed.replace([np.inf, -np.inf], np.nan)
df_imputed = df_imputed.fillna(0)

Created new derived features:
- Урбанизация (%): Percentage of urban population
- Доля мужчин/женщин в городе/селе: Gender distribution percentages


In [None]:
columns_to_analyze = {
    'Total Population': 'Общее население',
    'Urban Population': 'городское население', 
    'Rural Population': 'сельское население'
}

results = {}
for eng_name, rus_name in columns_to_analyze.items():
    results[eng_name] = {
        'Mean': df_imputed[rus_name].mean(),
        'Median': df_imputed[rus_name].median(),
        'Std': df_imputed[rus_name].std(),
        'Min': df_imputed[rus_name].min(),
        'Max': df_imputed[rus_name].max()
    }

for col, stats in results.items():
    print(f"{col}:")
    print(f"  Mean: {stats['Mean']:,.2f}")
    print(f"  Median: {stats['Median']:,.2f}")
    print(f"  Standard Deviation: {stats['Std']:,.2f}")
    print(f"  Range: {stats['Min']:,.0f} - {stats['Max']:,.0f}")
    print(f"  Difference (Mean-Median): {abs(stats['Mean'] - stats['Median']):,.2f}")
    print("-" * 50)


Statistical Summary:
Total Population:
  Mean: 88,809.28
  Median: 39,631.00
  Standard Deviation: 108,569.58
  Range: 5,092 - 576,517
  Difference (Mean-Median): 49,178.28
--------------------------------------------------
Urban Population:
  Mean: 84,155.71
  Median: 52,200.00
  Standard Deviation: 97,215.68
  Range: 2,202 - 576,517
  Difference (Mean-Median): 31,955.71
--------------------------------------------------
Rural Population:
  Mean: 36,594.32
  Median: 22,875.00
  Standard Deviation: 42,754.18
  Range: 90 - 260,533
  Difference (Mean-Median): 13,719.32
--------------------------------------------------
