In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
import datetime

In [2]:
# Load the raw population table from 'Population.csv' (path is relative to the
# notebook's working directory). Later cells rename and filter this frame.
df = pd.read_csv('Population.csv')

In [4]:
# Give the 'Geography' column the name the rest of the notebook expects.
df = df.rename(columns={'Geography': 'Province'})

# Drop every row whose Province is 'Canada' or 'Nunavut'.
df_filtered = df.loc[~df['Province'].isin(['Canada', 'Nunavut'])]

# Persist the filtered table; the row index is not written out.
df_filtered.to_csv('population_modified.csv', index=False)

# Show the first few rows to verify the rename and the filter.
df_filtered.head()

Unnamed: 0,Province,Quarter,Year,Population
289,Newfoundland and Labrador,1,1952,368000
290,Newfoundland and Labrador,2,1952,371000
291,Newfoundland and Labrador,3,1952,375000
292,Newfoundland and Labrador,4,1952,377000
293,Newfoundland and Labrador,1,1953,379000


In [None]:
# Placeholder for the combined forecasts. This cell carries no execution count,
# and `predicted_df` is rebuilt from scratch by the cell that runs the
# per-province predictions, so nothing depends on this assignment.
predicted_df = pd.DataFrame()

In [21]:
def predict_for_group(group, name, n_quarters=20, model=None):
    """Fit a regression on one province's history and forecast future quarters.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single province. Must contain the columns 'Year',
        'Quarter' and 'Population'. 'Population' may be numeric or text with
        thousands separators (e.g. '1,234'); values that cannot be parsed are
        dropped. The frame passed in is not modified.
    name : hashable
        Label of the group; written to the 'Province' column of the result
        and used in the "no data" message.
    n_quarters : int, default 20
        Number of consecutive future quarters to forecast.
    model : object, optional
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. When omitted a
        fresh ``LinearRegression`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per forecast quarter with the columns 'Year', 'Quarter',
        'Predicted Population' (rounded to whole numbers) and 'Province'.
        An empty DataFrame is returned when the group has no usable
        population values.
    """
    # Work on a private copy so the caller's frame is never mutated and pandas
    # has no reason to emit chained-assignment warnings.
    history = group.copy()
    history['Population'] = pd.to_numeric(
        history['Population'].replace(',', '', regex=True),
        errors='coerce',
    )
    history = history.dropna(subset=['Population'])

    if history.empty:
        print(f"No valid data available for {name}. Skipping...")
        return pd.DataFrame()

    if model is None:
        model = LinearRegression()

    # Fit on the full history. A random hold-out split was never evaluated, so
    # it only discarded data (and failed outright on single-row groups).
    model.fit(history[['Year', 'Quarter']], history['Population'])

    # Map (Year, Quarter) to one running quarter counter so the year and the
    # quarter advance together. Assumes quarters are numbered 1-4.
    period_index = history['Year'] * 4 + (history['Quarter'] - 1)
    first_future = int(period_index.max()) + 1
    future_periods = range(first_future, first_future + n_quarters)
    future_df = pd.DataFrame({
        'Year': [period // 4 for period in future_periods],
        'Quarter': [period % 4 + 1 for period in future_periods],
    })

    # Calculate predicted population and round to whole numbers
    future_df['Predicted Population'] = (
        model.predict(future_df[['Year', 'Quarter']]).round(0).astype(int)
    )
    future_df['Province'] = name

    return future_df

# Forecast every province separately, then combine the results.
# NOTE(review): this groups `df`, not `df_filtered`, so the 'Canada' and
# 'Nunavut' rows removed from the saved CSV are still forecast here -- confirm
# that this is intended.
groups = df.groupby('Province')

# Collect the per-province frames and concatenate once: growing a DataFrame
# with concat inside the loop copies all earlier rows on every iteration, and
# concatenating empty frames is deprecated in recent pandas.
predictions = []
for name, group in groups:
    prediction = predict_for_group(group, name)
    if not prediction.empty:
        predictions.append(prediction)

# pd.concat raises on an empty list, so fall back to an empty frame.
predicted_df = pd.concat(predictions, ignore_index=True) if predictions else pd.DataFrame()

# Output the results
predicted_df.to_csv('Predicted_Population.csv', index=False)