In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import hvplot.pandas 

# Load the Ontario employment-by-industry data.
df_ont = pd.read_csv('Ontario Sector.csv')

# Keep only rows whose 'Employment' value parses as a number. The coerced
# series is computed once and reused for the target, so the row filter and the
# float conversion can never disagree about what counts as numeric.
employment_numeric = pd.to_numeric(df_ont['Employment'], errors='coerce')
is_numeric = employment_numeric.notnull()
df_ont = df_ont[is_numeric]

# Split the data into features (X) and target variable (y)
X = df_ont[['Year', 'Industry']]
y = employment_numeric[is_numeric].astype(float)

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# One-hot encode the 'Industry' column; 'Year' is passed through unchanged.
# - The column is selected by name rather than position, so reordering the
#   feature columns cannot silently encode the wrong one.
# - handle_unknown='ignore' keeps transform() from raising when an industry
#   landed only in the test split (or only in the forecast grid); such rows are
#   encoded as all zeros instead of aborting the whole cell.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['Industry'])],
    remainder='passthrough',
)
X_train_encoded = ct.fit_transform(X_train)
X_test_encoded = ct.transform(X_test)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_encoded, y_train)

# Build the forecast grid: every (year, industry) pair for 2024 through 2030,
# with industries in their order of first appearance in the data.
years = range(2024, 2031)
industries = df_ont['Industry'].unique()
future_years = pd.DataFrame(
    [(year, industry) for year in years for industry in industries],
    columns=['Year', 'Industry'],
)

# Encode the forecast grid with the transformer fitted on the training data
future_years_encoded = ct.transform(future_years[['Year', 'Industry']])

# Predict employment for each (year, industry) pair in the grid
future_predictions = model.predict(future_years_encoded)

# Attach the rounded predictions to the grid. The column name (including its
# trailing space) is kept exactly as-is because it becomes the CSV header.
prediction_col = 'Predicted Employment Ontario '
future_years[prediction_col] = future_predictions.round(1)

# Plot the predicted employment using hvplot, one line per industry
plot = future_years.hvplot(
    x='Year',
    y=prediction_col,
    by='Industry',
    kind='line',
    xlabel='Year',
    ylabel='Predicted Employment',
    title='Ontario Predicted Employment by Industry',
)
plot

In [8]:
# Write the per-industry employment forecast to disk as CSV.
# index=False leaves the DataFrame's row index out of the file.
future_years.to_csv(
    path_or_buf='predicted_Ontario_employment_sector.csv',
    index=False,
)

# Show the forecast table as this cell's output.
future_years

Unnamed: 0,Year,Industry,Predicted Employment Ontario
0,2024,Agriculture,195.5
1,2024,"Forestry, fishing, mining, quarrying, oil and gas",142.3
2,2024,Forestry and logging and support activities fo...,81.8
3,2024,"Fishing, hunting and trapping",111.2
4,2024,"Mining, quarrying, and oil and gas extraction",105.1
...,...,...,...
170,2030,Health care and social assistance,705.4
171,2030,"Information, culture and recreation",376.2
172,2030,Accommodation and food services,455.7
173,2030,Other services (except public administration),365.5


In [13]:
# Display the Ontario employment DataFrame held in `df_ont` as the cell output.
df_ont

Unnamed: 0,Industry,Year,Employment
