In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [5]:
dir = '/content/drive/MyDrive/Data Challenge/Dataset/Population /'

In [6]:
population = pd.read_csv(dir + 'all_population.csv')

hospitals = pd.read_excel(dir + 'hospitals.xlsx')
doctors = pd.read_excel(dir + 'doctors.xlsx')


In [7]:
doctors

Unnamed: 0,City,2019,2020,2021,2022,2023
0,Республика Казахстан,179837,185757,188800,175617,174817
1,Акмолинская,6361,6497,6557,6438,6344
2,Актюбинская,8093,8150,8108,9523,9513
3,Алматинская,15534,16276,17290,10692,10624
4,Атырауская,5733,6117,6248,6283,6344
5,Западно-Казахстанская,6952,7094,7207,7198,7138
6,Жамбылская,11474,11535,12105,12470,12373
7,Карагандинская,13766,14034,14033,11695,11323
8,Костанайская,7081,7141,6998,6816,6800
9,Кызылординская,10402,10601,10721,10854,11238


In [8]:
# Define cities and remove specified cities from analysis
cities = population['City']

cities_to_remove = ['Абай', 'Жетісу', 'Ұлытау']
cities = cities[~np.isin(cities, cities_to_remove)]

print(cities)

0       Республика Казахстан
1                Акмолинская
2                Актюбинская
3                Алматинская
4                 Атырауская
5      Западно-Казахстанская
6                 Жамбылская
7             Карагандинская
8               Костанайская
9             Кызылординская
10             Мангистауская
11              Павлодарская
12      Северо-Казахстанская
13            Туркестанская*
14    Восточно-Казахстанская
15                 г. Астана
16                 г. Алматы
17                г. Шымкент
Name: City, dtype: object


In [9]:
population = population.drop(columns=['City'])
hospitals = hospitals.drop(columns=['City'])
doctors = doctors.drop(columns=['City'])

In [10]:
# Clean numeric data function
def clean_numeric(data):
    return data.apply(lambda x: x.astype(str).str.replace(" ", "").str.replace("-", "0").str.replace(",", "").astype(float))

population = clean_numeric(population)
hospitals = clean_numeric(hospitals)
doctors = clean_numeric(doctors)


In [11]:
doctors

Unnamed: 0,2019,2020,2021,2022,2023
0,179837.0,185757.0,188800.0,175617.0,174817.0
1,6361.0,6497.0,6557.0,6438.0,6344.0
2,8093.0,8150.0,8108.0,9523.0,9513.0
3,15534.0,16276.0,17290.0,10692.0,10624.0
4,5733.0,6117.0,6248.0,6283.0,6344.0
5,6952.0,7094.0,7207.0,7198.0,7138.0
6,11474.0,11535.0,12105.0,12470.0,12373.0
7,13766.0,14034.0,14033.0,11695.0,11323.0
8,7081.0,7141.0,6998.0,6816.0,6800.0
9,10402.0,10601.0,10721.0,10854.0,11238.0


In [12]:
population.head()

population['City'] = cities
hospitals['City'] = cities
doctors['City'] = cities

In [13]:
# Prepare years for projection
years_past = np.array([2019, 2020, 2021, 2022, 2023])
years_future = np.array([2024, 2025, 2026, 2027, 2028, 2029])

hospitals_proj = pd.DataFrame(columns=['City',2024, 2025, 2026, 2027, 2028, 2029])
doctors_proj = pd.DataFrame(columns=['City',2024, 2025, 2026, 2027, 2028, 2029])

hospitals_proj['City'] = cities
doctors_proj['City'] = cities

# Set a random seed for reproducibility
np.random.seed(42)

In [14]:
# Perform linear regression with random noise for each city and each parameter
for city in cities:
    # Extract historical data for birth, death, and migration rates

    population_city = population[population['City'] == city].iloc[:, :-1].values.flatten()
    hospitals_city = hospitals[hospitals['City'] == city].iloc[:, :-1].values.flatten()
    doctors_city = doctors[doctors['City'] == city].iloc[:, :-1].values.flatten()
    # Reshape years data for linear regression model
    X = years_past.reshape(-1, 1)

    for i in range(len(years_past)):
      hospitals_city[i] = 100000 * hospitals_city[i]/population_city[i]
      doctors_city[i] = 100000 * doctors_city[i]/population_city[i]

    print(hospitals_city)
    hospitals_model = LinearRegression().fit(X, hospitals_city)
    hospitals_predictions = hospitals_model.predict(years_future.reshape(-1, 1))
    hospitals_predictions_noisy = hospitals_predictions + np.random.uniform(0, 0.05, size=len(hospitals_predictions)) * hospitals_predictions
    hospitals_proj.loc[hospitals_proj['City'] == city, years_future] = np.round(hospitals_predictions_noisy)

    doctors_model = LinearRegression().fit(X, doctors_city)
    doctors_predictions = doctors_model.predict(years_future.reshape(-1, 1))
    doctors_predictions_noisy = doctors_predictions + np.random.uniform(0, 0.05, size=len(doctors_predictions)) * doctors_predictions
    doctors_proj.loc[doctors_proj['City'] == city, years_future] = np.round(doctors_predictions_noisy)

[4.07163313 4.14882551 4.09437682 4.19419234 4.4114358 ]
[3.65562892 3.93628645 3.94254221 4.07275985 4.06084671]
[5.05958233 4.99063688 4.91986766 4.58140169 4.52508676]
[3.87457177 3.98886232 3.94616469 2.97599723 2.92184852]
[4.41786015 4.49417307 4.41326414 4.69730976 4.61707828]
[4.44563676 4.41505137 4.38519558 4.39028459 4.35966035]
[4.17614444 4.15892767 4.12573122 4.2987108  4.59710481]
[5.22294731 5.44708987 5.450827   5.37461034 4.67020016]
[4.46840323 4.83565118 4.85801862 5.1454733  5.04665755]
[4.53209288 4.10687329 4.05112769 4.00849802 4.31827614]
[3.83368303 4.4362017  4.30812248 4.69226139 4.43224274]
[4.64281498 4.78615843 4.79353193 5.02305981 5.29840624]
[3.96741669 4.37353646 4.41391487 4.45177338 4.68073634]
[1.96575652 2.13289736 2.1029548  2.25040819 2.64247419]
[4.49755427 4.4538649  4.47280644 4.22939127 4.10824964]
[3.33832846 3.25659505 3.1239156  2.46968653 3.32212179]
[4.96048863 4.74744134 4.60233313 3.66407564 4.07048978]
[3.56758846 3.85300033 3.722779

In [15]:
hospitals_proj

Unnamed: 0,City,2024,2025,2026,2027,2028,2029
0,Республика Казахстан,4.0,5.0,5.0,5.0,5.0,5.0
1,Акмолинская,4.0,4.0,4.0,5.0,5.0,5.0
2,Актюбинская,4.0,4.0,4.0,4.0,4.0,4.0
3,Алматинская,3.0,2.0,2.0,2.0,2.0,1.0
4,Атырауская,5.0,5.0,5.0,5.0,5.0,5.0
5,Западно-Казахстанская,4.0,4.0,4.0,4.0,4.0,4.0
6,Жамбылская,5.0,5.0,5.0,5.0,5.0,5.0
7,Карагандинская,5.0,5.0,5.0,5.0,5.0,4.0
8,Костанайская,5.0,6.0,6.0,6.0,6.0,6.0
9,Кызылординская,4.0,4.0,4.0,4.0,4.0,4.0


In [16]:
doctors_proj

Unnamed: 0,City,2024,2025,2026,2027,2028,2029
0,Республика Казахстан,870.0,875.0,835.0,810.0,755.0,761.0
1,Акмолинская,817.0,793.0,788.0,752.0,740.0,725.0
2,Актюбинская,1088.0,1096.0,1120.0,1201.0,1234.0,1256.0
3,Алматинская,710.0,722.0,681.0,676.0,646.0,634.0
4,Атырауская,955.0,969.0,930.0,935.0,927.0,940.0
5,Западно-Казахстанская,1048.0,1073.0,1027.0,1065.0,1046.0,1008.0
6,Жамбылская,1049.0,1037.0,1076.0,1064.0,1049.0,1035.0
7,Карагандинская,1022.0,1053.0,1057.0,1047.0,1059.0,1046.0
8,Костанайская,829.0,838.0,855.0,828.0,835.0,850.0
9,Кызылординская,1388.0,1412.0,1415.0,1381.0,1437.0,1420.0


In [17]:
all_hospitals = pd.concat([hospitals, hospitals_proj], ignore_index=True)
all_professors = pd.concat([doctors, doctors_proj], ignore_index=True)

In [18]:
all_hospitals.to_csv(dir + 'hospitals_proj.csv', index=False)
all_professors.to_csv(dir + 'doctors_proj.csv', index=False)