# In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import hvplot.pandas 

# Load the raw New Brunswick sector employment table.
df_nb = pd.read_csv('New Brunswick Sector.csv')

# Parse 'Employment' exactly once; anything non-numeric becomes NaN so it can
# be filtered out together with genuinely missing values.
employment = pd.to_numeric(df_nb['Employment'], errors='coerce')

# Keep only rows usable for modelling: a numeric target and both feature
# columns present. Without this, a missing 'Year' or 'Industry' would be
# passed on to the encoder/regressor further down.
usable_rows = employment.notna() & df_nb[['Year', 'Industry']].notna().all(axis=1)
df_nb = df_nb[usable_rows]

# Split the data into features (X) and target variable (y). y reuses the
# values parsed above, so the row filter and the numeric conversion cannot
# disagree about what counts as a number.
X = df_nb[['Year', 'Industry']]
y = employment[usable_rows].astype(float)

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# reruns produce the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# One-hot encode 'Industry' and pass the remaining column ('Year') through.
# handle_unknown='ignore' makes an industry that did not land in the training
# split encode as all zeros instead of raising during transform; the random
# split gives no guarantee that every industry appears in X_train.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['Industry'])],
    remainder='passthrough',
)
X_train_encoded = ct.fit_transform(X_train)
X_test_encoded = ct.transform(X_test)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_encoded, y_train)

# Use the held-out rows: report R^2 on data the model has not seen, so the
# quality of the fit is visible before the forecasts are trusted.
test_r2 = model.score(X_test_encoded, y_test)
print(f'Held-out R^2: {test_r2:.3f}')

# Forecast horizon: every year from 2024 through 2030 inclusive (7 years).
years = range(2024, 2031)

# Industry labels present in the data. Missing labels are skipped so the
# forecast table never contains a row for a NaN "industry".
industries = df_nb['Industry'].dropna().unique()

# Cartesian product of years and industries, one row per (year, industry)
# pair, ordered year-major.
future_years = pd.DataFrame(
    [(year, industry) for year in years for industry in industries],
    columns=['Year', 'Industry'],
)

# Encode the forecast grid with the transformer fitted on the training data,
# so the feature columns line up with what the model was trained on.
future_years_encoded = ct.transform(future_years[['Year', 'Industry']])

# Predict employment for every (year, industry) pair in the horizon.
future_predictions = model.predict(future_years_encoded)

# Attach the predictions, rounded to one decimal place, next to their
# year/industry. NOTE: the column name ends with a space; the export/plot
# code that follows references it verbatim.
future_years['Predicted Employment New Brunswick '] = future_predictions.round(1)

# Write the forecast table to disk, omitting the DataFrame index.
output_csv = 'predicted_New Brunswick_employment_sector.csv'
future_years.to_csv(output_csv, index=False)

# Build a line chart of predicted employment over the forecast years, with
# one line per industry (requires the hvplot.pandas accessor).
plot = future_years.hvplot(
    x='Year',
    y='Predicted Employment New Brunswick ',
    by='Industry',
    kind='line',
    xlabel='Year',
    ylabel='Predicted Employment',
    title='New Brunswick  Predicted Employment by Industry',
)
# Bare expression as the last statement: in a notebook cell this is what
# gets rendered as the cell output.
plot
