In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [None]:
# Base folder for every input and output file of this notebook. Other cells
# build paths by plain string concatenation, so the trailing slash matters.
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is kept
# because other cells reference it. The path presumably points at a Google
# Drive mount inside Colab — confirm before running elsewhere.
dir = '/content/drive/MyDrive/Data Challenge/Dataset/Adults/'

In [None]:
# Resolve the three input files inside the data folder.
adults_path = dir + 'all_adults.csv'
universities_path = dir + 'universities.xlsx'
professors_path = dir + 'professors.xlsx'

# Load each table: adults from CSV, universities and professors from Excel.
adults = pd.read_csv(adults_path)
universities = pd.read_excel(universities_path)
professors = pd.read_excel(professors_path)


In [None]:
# Display the professors table as loaded, before any cleaning.
professors

Unnamed: 0,City,2019,2020,2021,2022
0,Қазақстан Республикасы,3914,3432,4500,5911
1,Ақмола,856,835,750,755
2,Ақтөбе,1 550,1 523,1 627,1 690
3,Алматы,604,629,541,261
4,Атырау,708,651,612,495
5,Батыс Қазақстан,1 134,1 170,1 163,1 157
6,Жамбыл,1 186,1 116,1 107,1 124
7,Қарағанды,3 159,2 976,2 942,2 849
8,Қостанай,1 232,1 211,1 089,1 097
9,Қызылорда,673,756,728,767


In [None]:
# Build the list of city labels from the adults table, leaving out the
# names listed in cities_to_remove.
cities_to_remove = ['Абай', 'Жетісу', 'Ұлытау']

city_labels = adults['City']
# invert=True keeps exactly the labels that are NOT in cities_to_remove.
cities = city_labels[np.isin(city_labels, cities_to_remove, invert=True)]

print(cities)

0     Қазақстан Республикасы
1                     Ақмола
2                     Ақтөбе
3                     Алматы
4                     Атырау
5            Батыс Қазақстан
6                     Жамбыл
7                  Қарағанды
8                   Қостанай
9                  Қызылорда
10                 Маңғыстау
11                  Павлодар
12       Солтүстік Қазақстан
13                 Түркістан
14           Шығыс Қазақстан
15             Астана қаласы
16             Алматы қаласы
17            Шымкент қаласы
Name: City, dtype: object


In [None]:
# Remove the 'City' label column from each dataset before further processing.
adults = adults.drop(labels='City', axis='columns')
universities = universities.drop(labels='City', axis='columns')
professors = professors.drop(labels='City', axis='columns')

In [None]:
# Clean numeric data function
def clean_numeric(data):
    """Convert every column of a DataFrame of formatted numbers to float.

    Each cell is rendered as text and then normalised:
      * all whitespace is removed, so "1 550" becomes 1550.0; Unicode
        whitespace such as a non-breaking space is covered as well;
      * commas are removed (treated as thousands separators);
      * a cell consisting solely of a dash ("-", "–" or "—") is treated as
        an empty-value placeholder and becomes 0.

    A leading minus sign is left alone, so "-5" is parsed as -5.0 rather
    than having its sign rewritten to a zero digit.

    Args:
        data: pandas DataFrame whose cells are numbers or number-like text.

    Returns:
        A new DataFrame with the same index and columns, all of float dtype.

    Raises:
        ValueError: if a cell still cannot be parsed as a float after
            normalisation.
    """
    def _clean_column(column):
        text = column.astype(str)
        # regex is passed explicitly because its default differs between
        # pandas versions.
        text = text.str.replace(r"\s+", "", regex=True)
        text = text.str.replace(",", "", regex=False)
        # Only a standalone dash is a placeholder; anchoring the pattern
        # keeps the sign of genuine negative numbers intact.
        text = text.str.replace(r"^[-–—]$", "0", regex=True)
        return text.astype(float)

    return data.apply(_clean_column)

# Run the numeric cleaning over each of the three datasets.
adults = adults.pipe(clean_numeric)
universities = universities.pipe(clean_numeric)
professors = professors.pipe(clean_numeric)


In [None]:
# Display the professors table after numeric cleaning (all values are floats).
professors

Unnamed: 0,2019,2020,2021,2022
0,3914.0,3432.0,4500.0,5911.0
1,856.0,835.0,750.0,755.0
2,1550.0,1523.0,1627.0,1690.0
3,604.0,629.0,541.0,261.0
4,708.0,651.0,612.0,495.0
5,1134.0,1170.0,1163.0,1157.0
6,1186.0,1116.0,1107.0,1124.0
7,3159.0,2976.0,2942.0,2849.0
8,1232.0,1211.0,1089.0,1097.0
9,673.0,756.0,728.0,767.0


In [None]:
# Re-attach the city labels as the last column of every dataset. The
# assignment aligns on the row index, so any row whose label is absent from
# `cities` receives NaN.
# NOTE(review): the labels come from the adults table only; this assumes
# universities and professors list their rows in the same order — confirm.
adults['City'] = cities
universities['City'] = cities
professors['City'] = cities

# Kept as the final expression so the notebook actually renders the preview;
# a cell only displays the value of its last expression.
adults.head()

In [None]:
# Prepare years for projection: the historical years the trends are fitted
# on and the future years that are projected.
years_past = np.array([2019, 2020, 2021, 2022])
years_future = np.array([2023, 2024, 2025, 2026, 2027, 2028, 2029])

# Derive the year columns from years_future so every projected year
# (including 2023) has a column up front and the columns stay in
# chronological order. A year missing here would be appended after 2029 by
# the later `.loc[..., years_future]` assignment.
projection_columns = ['City'] + years_future.tolist()
universities_proj = pd.DataFrame(columns=projection_columns)
professors_proj = pd.DataFrame(columns=projection_columns)

# Assigning the Series creates one row per entry of `cities` in each frame.
universities_proj['City'] = cities
professors_proj['City'] = cities

# Set a random seed for reproducibility
np.random.seed(42)

In [None]:
# For every city, project universities and professors per 100,000 of the
# `adults` values: fit a linear trend on the historical years, extrapolate
# it to the future years, add random noise and round to whole numbers.
def _project_per_100k(counts, population, fit_years, target_years):
    """Extrapolate a per-100,000 rate with a noisy linear trend.

    Args:
        counts: 1-D float array of yearly counts; only its first
            len(fit_years) entries are used.
        population: 1-D float array of yearly population values, aligned
            position by position with `counts`.
        fit_years: 1-D array of the years the trend is fitted on.
        target_years: 1-D array of the years to project.

    Returns:
        1-D array with one rounded, non-negative value per target year.
    """
    n_years = len(fit_years)
    # Slice by the number of fitted years instead of a fixed [:-1], so the
    # fit does not depend on how many extra columns a table carries.
    rates = 100000 * counts[:n_years] / population[:n_years]

    model = LinearRegression().fit(fit_years.reshape(-1, 1), rates)
    trend = model.predict(target_years.reshape(-1, 1))
    # A falling linear trend can cross zero; a negative rate is meaningless,
    # so clamp it at zero.
    trend = np.clip(trend, 0, None)

    # Multiplicative noise of 0-5 %, drawn from NumPy's global random state.
    noise = np.random.uniform(0, 0.05, size=len(trend))
    return np.round(trend + noise * trend)


for city in cities:
    # Historical values of this city as flat arrays; iloc[:, :-1] drops the
    # trailing 'City' column.
    # NOTE(review): this assumes the first len(years_past) columns of every
    # table correspond to years_past, in order — confirm against the files.
    adults_city = adults[adults['City'] == city].iloc[:, :-1].values.flatten()
    universities_city = universities[universities['City'] == city].iloc[:, :-1].values.flatten()
    professors_city = professors[professors['City'] == city].iloc[:, :-1].values.flatten()

    # Universities are projected before professors so the sequence of random
    # draws stays the same for a given seed.
    universities_proj.loc[universities_proj['City'] == city, years_future] = _project_per_100k(
        universities_city, adults_city, years_past, years_future
    )
    professors_proj.loc[professors_proj['City'] == city, years_future] = _project_per_100k(
        professors_city, adults_city, years_past, years_future
    )


In [None]:
# Display the projected universities table (rounded values per 100,000 of
# the `adults` values, one row per city).
universities_proj

Unnamed: 0,City,2024,2025,2026,2027,2028,2029,2023
0,Қазақстан Республикасы,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Ақмола,1.0,1.0,1.0,2.0,2.0,2.0,1.0
2,Ақтөбе,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Алматы,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Атырау,-0.0,-0.0,-0.0,-1.0,-1.0,-1.0,0.0
5,Батыс Қазақстан,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,Жамбыл,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,Қарағанды,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0
8,Қостанай,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,Қызылорда,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [None]:
# Stack the historical table on top of the projected table. pd.concat with
# its default axis appends rows, so every city appears twice: once with the
# historical columns filled and once with the projected columns filled.
# NOTE(review): the historical rows hold the cleaned raw values while the
# projected rows hold values per 100,000 of `adults`; if one wide row per
# city in a single unit was intended, this needs a join on 'City' — confirm.
all_universities = pd.concat([universities, universities_proj])
all_professors = pd.concat([professors, professors_proj])

In [None]:
# Write both combined tables as CSV files into the data folder, without the
# row index.
all_universities.to_csv(dir + 'universities_proj.csv', index=False)
all_professors.to_csv(dir + 'professors_proj.csv', index=False)