In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load data
df_nl = pd.read_csv('Newfoundland_Sectors.csv')
df_pei = pd.read_csv('Prince_Edward_Island_Sector.csv')
df_ns = pd.read_csv('Nova_Scotia_Sector.csv')
df_nb = pd.read_csv('New_Brunswick_Sector.csv')
df_qc = pd.read_csv('Quebec_Sector.csv')
df_ont = pd.read_csv('Ontario_Sector.csv')
df_mb = pd.read_csv('Manitoba_Sector.csv')
df_sask = pd.read_csv('Saskatchewan_Sector.csv')
df_ab = pd.read_csv('Alberta_Sector.csv')
df_bc = pd.read_csv('British_Columbia_Sectors.csv')

# Function to preprocess data, train the model, and predict employment
def predict_employment(df, province_name):
    df = df.dropna(axis=1, how='all')  # Clean empty columns
    industry_columns = df['Industry'].unique()
    
    predictions_df = pd.DataFrame({'Year': range(2024, 2030), 'Province': [province_name]*6})
    
    for industry in industry_columns:
        df_industry = df[df['Industry'] == industry].copy()
        print(f"Processing {industry}, Entries: {len(df_industry)}")  # Debug: check entries count

        if df_industry.empty:
            print(f"No data available for industry {industry}. Skipping...")
            continue

        # Replace non-numeric and problematic strings with '0' before conversion
        df_industry['Employment'] = df_industry['Employment'].replace({'..': '0', 'x': '0'}, regex=True).astype(str).str.replace(',', '').astype(float)

        # Ensure all entries are numeric and handle NaNs after replacement
        df_industry = df_industry.dropna(subset=['Employment'])

        X = df_industry[['Year']]
        y = df_industry['Employment']

        if len(X) < 5:  # Ensure there are enough entries for a train/test split
            print(f"Not enough data to train for {industry}. Needs more than 5, has {len(X)}")
            continue

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
        
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        predictions = model.predict(pd.DataFrame({'Year': range(2024, 2030)}))
        predictions_df[industry] = predictions.round(1)
    
    return predictions_df

# Predict employment for each province
predicted_nl = predict_employment(df_nl, 'Newfoundland')
predicted_pei = predict_employment(df_pei, 'Prince Edward Island')
predicted_ns = predict_employment(df_ns, 'Nova Scotia')
predicted_nb = predict_employment(df_nb, 'New Brunswick')
predicted_qc = predict_employment(df_qc, 'Quebec')
predicted_ont = predict_employment(df_ont, 'Ontario')
predicted_mb = predict_employment(df_mb, 'Manitoba')
predicted_sask = predict_employment(df_sask, 'Saskatchewan')
predicted_ab = predict_employment(df_ab, 'Alberta')
predicted_bc = predict_employment(df_bc, 'British Columbia')

# Combine all predictions into one DataFrame
predicted_combined = pd.concat([predicted_nl, predicted_pei, predicted_ns, predicted_nb, predicted_qc, predicted_ont, predicted_mb, predicted_sask, predicted_ab, predicted_bc], ignore_index=True)



Processing Goods-producing sector , Entries: 48
Processing Agriculture , Entries: 48
Processing Forestry, fishing, mining, quarrying, oil and gas, Entries: 48
Processing nan, Entries: 0
No data available for industry nan. Skipping...
Processing Forestry and logging and support activities for forestry, Entries: 48
Processing Fishing, hunting and trapping, Entries: 48
Processing Mining, quarrying, and oil and gas extraction, Entries: 48
Processing Utilities, Entries: 48
Processing Construction, Entries: 48
Processing Manufacturing, Entries: 48
Processing Durables, Entries: 48
Processing Non-durables, Entries: 48
Processing Services-producing, Entries: 48
Processing Wholesale and retail trade, Entries: 48
Processing Wholesale trade, Entries: 48
Processing Retail trade, Entries: 48
Processing Transportation and warehousing, Entries: 48
Processing Finance, insurance, real estate, rental and leasing, Entries: 48
Processing Finance and insurance, Entries: 48
Processing Real estate and rental 

Processing Health care and social assistance, Entries: 48
Processing Information, culture and recreation, Entries: 48
Processing Accommodation and food services, Entries: 48
Processing Other services (except public administration), Entries: 48
Processing Public administration, Entries: 48
Processing Goods-producing sector , Entries: 48
Processing Agriculture , Entries: 48
Processing Forestry, fishing, mining, quarrying, oil and gas, Entries: 48
Processing nan, Entries: 0
No data available for industry nan. Skipping...
Processing Forestry and logging and support activities for forestry, Entries: 48
Processing Fishing, hunting and trapping, Entries: 48
Processing Mining, quarrying, and oil and gas extraction, Entries: 48
Processing Utilities, Entries: 48
Processing Construction, Entries: 48
Processing Manufacturing, Entries: 48
Processing Durables, Entries: 48
Processing Non-durables, Entries: 48
Processing Services-producing, Entries: 48
Processing Wholesale and retail trade, Entries: 4

In [28]:
predicted_combined.head(10)

Unnamed: 0,Year,Province,Goods-producing sector,Agriculture,"Forestry, fishing, mining, quarrying, oil and gas",Forestry and logging and support activities for forestry,"Fishing, hunting and trapping","Mining, quarrying, and oil and gas extraction",Utilities,Construction,...,Finance and insurance,Real estate and rental and leasing,"Professional, scientific and technical services","Business, building and other support services",Educational services,Health care and social assistance,"Information, culture and recreation",Accommodation and food services,Other services (except public administration),Public administration
0,2024,Newfoundland,46.6,1.4,15.3,1.2,5.7,12.6,2.4,18.8,...,6.5,3.4,11.5,7.7,16.8,43.5,7.6,16.2,11.1,19.1
1,2025,Newfoundland,46.5,1.4,15.3,1.2,5.8,12.9,2.4,19.0,...,6.6,3.4,11.7,7.8,16.9,44.0,7.6,16.4,11.2,19.2
2,2026,Newfoundland,46.4,1.4,15.3,1.2,5.8,13.2,2.4,19.2,...,6.7,3.5,11.9,7.9,16.9,44.6,7.6,16.6,11.2,19.3
3,2027,Newfoundland,46.3,1.4,15.3,1.2,5.8,13.4,2.4,19.3,...,6.8,3.6,12.1,8.0,16.9,45.2,7.7,16.8,11.3,19.4
4,2028,Newfoundland,46.2,1.4,15.3,1.2,5.8,13.7,2.4,19.5,...,6.9,3.6,12.3,8.2,16.9,45.7,7.7,17.0,11.3,19.5
5,2029,Newfoundland,46.1,1.4,15.3,1.2,5.8,14.0,2.4,19.7,...,7.0,3.7,12.5,8.3,16.9,46.3,7.7,17.2,11.4,19.6
6,2024,Prince Edward Island,0.0,4.9,6.1,3.3,5.8,4.5,1.9,4.6,...,7.0,7.3,3.9,3.7,5.6,2.3,4.2,5.9,3.5,2.3
7,2025,Prince Edward Island,0.0,4.9,6.2,3.4,5.9,4.6,1.8,4.5,...,7.1,7.4,3.9,3.7,5.6,2.2,4.2,6.0,3.4,2.2
8,2026,Prince Edward Island,0.0,4.9,6.2,3.4,6.0,4.7,1.8,4.5,...,7.3,7.5,3.9,3.6,5.7,2.1,4.2,6.0,3.4,2.1
9,2027,Prince Edward Island,0.0,4.9,6.3,3.5,6.1,4.8,1.8,4.5,...,7.4,7.6,3.8,3.6,5.7,2.0,4.2,6.0,3.3,2.0


In [29]:
predicted_combined.to_csv('Predicted_Sector_Growth.csv', index=False)