# COMP 3610 Project

### A PREDICTIVE MODEL FOR ELECTORAL OUTCOMES IN TRINIDAD & TOBAGO USING MACROECONOMIC INDICATORS

- Christophe Gittens  
- Maia Neptune       
- Zidane Timothy     

In [None]:
# initialize Program
# %pip install requests beautifulsoup4 python-csv
# %pip install pandas

import pandas as pd
import numpy as np
import csv
import os
import requests
from bs4 import BeautifulSoup

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
csvs_path = "csvs"

# Loading the datasets

## Electoral Data

Electoral data was sourced from the [Elections and Boundaries Commission](https://ebctt.com), collated into a single Excel file, and subsequently processed in the election_results.ipynb Jupyter notebook.

* Election data encompasses the 41 constituencies that were redistricted in 2007


### Elections used in our predictive models

 - 2007
 - 2010
 - 2015
 - 2020

In [None]:
# Load one worksheet per election year from the collated workbook; the first
# five rows of every sheet are skipped.
election_years = ['2007', '2010', '2015', '2020', '2025']
election_results = pd.read_excel(
    'csvs/collated-tt-election-results.xlsx',
    sheet_name=election_years,
    skiprows=[0, 1, 2, 3, 4],
)

# The result is indexed by sheet name. The 2025 sheet is blank apart from
# electorate data.
sheet_1, sheet_2, sheet_3, sheet_4, sheet_5 = (
    election_results[year] for year in election_years
)

In [None]:
sheet_1.to_csv('election2007.csv')

2015

Sourcing macroeconomic data from the World Bank API. 
We will examine the impact of the following:
Trinidad and Tobago's Debt to GDP ratio.
Trinidad and Tobago's GDP.
Trinidad and Tobago's Inflation.
Trinidad and Tobago's Government Expenditure
Trinidad and Tobago's Unemployment Rate. 

### Debt to GDP
Source: [World Bank](https://api.worldbank.org) and the [CBTT](https://www.central-bank.org.tt/statistics/data-centre)

In [None]:
# Debt-to-GDP series from the Central Bank export; only the date and these six
# ratio columns are read.
debt_ratio_kinds = [
    "Central Government Domestic",
    "Central Government External",
    "Central Government Total",
    "Contingent Liabilities",
    "Gross Public Sector",
    "Net Public Sector",
]
cols = ["Date"] + [f"{kind} Debt to GDP Ratio (%)" for kind in debt_ratio_kinds]

df_debt_annual = pd.read_csv(
    r'csvs/Debt Annual  Central Bank of Trinidad and Tobago.csv',
    usecols=cols,
    low_memory=True,
)
df_debt_annual

In [None]:
# url = 'https://api.worldbank.org/v2/country/TT/indicator/GC.XPN.TOTL.GD.ZS?format=json'
# response = requests.get(url)

# if response:
#     data = response.json()
#     tt_debt = data[1]
#     df = pd.DataFrame(tt_debt)
#     df.to_csv(os.path.join(csvs_path,r'tt_debt_to_gdp.csv'))

# else:
#     print("Response is not 200.")

### GDP
Source: [World Bank](https://api.worldbank.org)

In [None]:


# url = 'https://api.worldbank.org/v2/country/TT/indicator/NY.GDP.MKTP.CD?format=json'
# response = requests.get(url)

# if response:
#     data = response.json()
#     tt_gdp = data[1]
#     df_gdp = pd.DataFrame(tt_gdp)
#     df_gdp.to_csv(os.path.join(csvs_path,r'tt_gdp.csv'))

# else:
#     print("Response is not 200.")

### Inflation
Source: [World Bank](https://api.worldbank.org) and the [CBTT](https://www.central-bank.org.tt/statistics/data-centre)

In [None]:
# Columns read from the CBTT "Selected Economic Indicators" annual export.
# "Date" is listed once; the original list repeated it, which selects nothing
# extra.
cols = [
    "Date",
    "Annual Average Percent Change in the Index of Retail Prices - Inflation Rate (%)",
    "Unemployment Rate (%)",
    "WTI Crude Oil Price (US$/bbl)",
    "Henry Hub Natural Gas Price (US$/mmbtu)",
    "Net Official Reserves (US$Mn)",
]
df_inflation_annual = pd.read_csv(
    r'csvs/Selected Economic Indicators Annual  Central Bank of Trinidad and Tobago.csv',
    usecols=cols,
    low_memory=True,
)

In [None]:
# Fetch the World Bank indicator FP.CPI.TOTL.ZG for Trinidad and Tobago and
# cache the records as csvs/tt_inflation.csv.
url = 'https://api.worldbank.org/v2/country/TT/indicator/FP.CPI.TOTL.ZG?format=json'
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, timeout=30)

if response:
    data = response.json()
    # The records are read from the second element of the payload. Check it is
    # present and non-empty before indexing, so an error payload produces a
    # message instead of an IndexError.
    if isinstance(data, list) and len(data) > 1 and data[1]:
        tt_inflation = data[1]
        df_inflation = pd.DataFrame(tt_inflation)
        df_inflation.to_csv(os.path.join(csvs_path,r'tt_inflation.csv'))
    else:
        print("World Bank response contained no inflation records.")

else:
    print("Response is not 200 for inflation.")


### Unemployment rate
Source: [World Bank](https://api.worldbank.org)

In [None]:
# Read only the date and unemployment-rate columns from the CBTT
# "Selected Economic Indicators" annual export.
cols = ["Date", "Unemployment Rate (%)"]
tt_unemployment = pd.read_csv(
    r'csvs/Selected Economic Indicators Annual  Central Bank of Trinidad and Tobago.csv',
    usecols=cols,
    low_memory=True,
)

In [None]:
# Fetch the World Bank indicator SL.UEM.TOTL.ZS for Trinidad and Tobago and
# cache the records as csvs/tt_unemployment_rate.csv.
url = 'https://api.worldbank.org/v2/country/TT/indicator/SL.UEM.TOTL.ZS?format=json'
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(url, timeout=30)

if response:
    data = response.json()
    # The records are read from the second element of the payload. Check it is
    # present and non-empty before indexing, so an error payload produces a
    # message instead of an IndexError.
    if isinstance(data, list) and len(data) > 1 and data[1]:
        # NOTE: this rebinds tt_unemployment, which the previous cell set to
        # the CBTT CSV frame.
        tt_unemployment = data[1]
        df = pd.DataFrame(tt_unemployment)
        df.to_csv(os.path.join(csvs_path,r'tt_unemployment_rate.csv'))
    else:
        print("World Bank response contained no unemployment records.")

else:
    print("Response is not 200.")

### Government expenditure

Source: [World Bank](https://api.worldbank.org) and [CBTT](https://www.central-bank.org.tt/statistics/data-centre)

In [None]:
df_expenditure = pd.read_csv("csvs/Public Finance Annual  Central Bank of Trinidad and Tobago.csv")

In [None]:
# url = 'https://api.worldbank.org/v2/country/TT/indicator/GC.XPN.TOTL.GD.ZS?format=json'
# response = requests.get(url)

# if response:
#     data = response.json()
#     tt_expenditure = data[1]
#     df_expenditure = pd.DataFrame(tt_expenditure)
#     df_expenditure.to_csv(os.path.join(csvs_path,r'tt_gov_expenditure.csv'))

# else:
#     print("Response is not 200.")

### Homicide rate
Source: Web scraping from [macrotrends](https://www.macrotrends.net)

In [None]:


# url ='https://www.macrotrends.net/global-metrics/countries/TTO/trinidad-and-tobago/murder-homicide-rate'

# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# }

# response = requests.get(url, headers=headers)

# if response:
#     soup = BeautifulSoup(response.content, 'html.parser')

#     div = soup.find('div', {'class': 'col-xs-6'})
#     crime_data =[]
#     if div:
#         table = div.find_all('table', {'class':'historical_data_table table table-striped table-bordered'})
#         table = table[0]
#     if table:
#         for row in table.find_all('tr')[2:]:
#             cells = row.find_all('td')
#             if len(cells) == 3:
#                 year = cells[0].get_text(strip=True)
#                 rate_per_100k = cells[1].get_text(strip=True)
#                 annual_change = cells[2].get_text(strip=True)
#                 crime_data.append([year,rate_per_100k,annual_change])

#         df_homicide_rate = pd.DataFrame(crime_data, columns=['Year','Rate_per_100k_Population','Annual_change'])

#         df_homicide_rate.to_csv(os.path.join(csvs_path,r'tt_homicide_rate.csv'), index=False)
# else:
#     print('Response is not 200.')

### Migration Rates
Source: Web scraping from [macrotrends](https://www.macrotrends.net)

In [None]:


# url = "https://www.macrotrends.net/global-metrics/countries/TTO/trinidad-and-tobago/net-migration#:~:text=The%20net%20migration%20rate%20for,a%200.35%25%20decline%20from%202022."
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# }

# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     soup = BeautifulSoup(response.text, "html.parser")
#     # print(soup.prettify())  # Check page structure
# else:
#     print("Failed to fetch page:", response.status_code)

# tables = soup.find_all("table")

# # ensure there are at least 2 tables before accessing the second one
# if len(tables) >= 2:
#     second_table = tables[1]  # Get the second table
#     df_migration_rate = pd.read_html(str(second_table))[0]
#     df_migration_rate.columns = ["Year", "Net Migration Rate", "Growth Rate"]
#     print(df_migration_rate)
# else:
#     print("Second table not found!")

# df_migration_rate.to_csv(os.path.join(csvs_path, r'tt_migration_rate.csv'), index=False)


### Currency Exchange Rate
Source: [CBTT](https://www.central-bank.org.tt/statistics/data-centre)

In [None]:
df_exchange_rate = pd.read_csv("csvs/Exchange Rates Annual  Central Bank of Trinidad and Tobago (1).csv")
df_exchange_rate

# Preprocessing

On the Central Bank website, "n.d." means no data (NA); this is taken into consideration moving forward.

#### Preprocessing of electoral data

In [None]:
sheet_1

In [None]:
sheet_1.isna().sum()

In [None]:
# sht_07, sht_10, sht_15, sht_20 = sheet_2007.iloc[3:], sheet_2010.iloc[3:], sheet_2015.iloc[3:], sheet_2020.iloc[3:]

# print(sht_07)
# print(sht_10)
# print(sht_15)
# print(sht_20)
# sht_20

In [None]:
sheet_1.shape

Fixing headers

### 2007 Parliamentary Election

In [None]:
# Column headers for the 2007 results sheet: six district-level fields followed
# by a (votes, vote share) pair for each party.
id_cols = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'TOTAL_NUMBER_OF_VOTES_CAST',
    'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES',
]
parties = ['P.N.M.', 'U.N.C.', 'C.O.P.', 'D.A.C.', 'D.N.A.', 'I.N.D.']

cols = list(id_cols)
for party in parties:
    # The I.N.D vote-count label is spelled without the final dot.
    votes_label = 'I.N.D_VOTES' if party == 'I.N.D.' else f'{party}_VOTES'
    cols.extend([votes_label, f'{party}_VOTES_%'])

# sht_07 is another name for sheet_1 (no copy is made), so the new headers and
# the NaN -> 0 fill apply to that same frame.
sht_07 = sheet_1
sht_07.columns = cols
sht_07.fillna(0, inplace=True)
sht_07

### 2010 Parliamentary Election

In [None]:
# Column headers for the 2010 results sheet: six district-level fields followed
# by a (votes, vote share) pair for each party.
id_cols = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'TOTAL_NUMBER_OF_VOTES_CAST',
    'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES',
]
parties = [
    'P.N.M.', 'U.N.C.', 'C.O.P.', 'N.N.V.', 'T.O.P.', 'T.H.C.',
    'I.N.D.', 'T.T.N.C.P.',
]

cols = list(id_cols)
for party in parties:
    # The I.N.D vote-count label is spelled without the final dot.
    votes_label = 'I.N.D_VOTES' if party == 'I.N.D.' else f'{party}_VOTES'
    cols.extend([votes_label, f'{party}_VOTES_%'])

# sht_10 is another name for sheet_2 (no copy is made), so the new headers and
# the NaN -> 0 fill apply to that same frame.
sht_10 = sheet_2
sht_10.columns = cols
sht_10.fillna(0, inplace=True)
sht_10

### 2015 Parliamentary Election

In [None]:
# Column headers for the 2015 results sheet: six district-level fields followed
# by a (votes, vote share) pair for each party.
id_cols = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'TOTAL_NUMBER_OF_VOTES_CAST',
    'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES',
]
parties = [
    'P.N.M.', 'U.N.C.', 'C.O.P.', 'I.L.P.', 'N.J.A.C.', 'D.D.P.',
    'I.D.P.', 'I.N.D.', 'L.O.V.E.', 'N.C.T.', 'N.N.V.', 'T.F.',
    'T.O.P.', 'T.H.C.', 'T.N.V.', 'T.P.T.', 'Y.E.P.', 'Y.O.U.R.',
]

cols = list(id_cols)
for party in parties:
    # The I.N.D vote-count label is spelled without the final dot.
    votes_label = 'I.N.D_VOTES' if party == 'I.N.D.' else f'{party}_VOTES'
    cols.extend([votes_label, f'{party}_VOTES_%'])

# sht_15 is another name for sheet_3 (no copy is made), so the new headers and
# the NaN -> 0 fill apply to that same frame.
sht_15 = sheet_3
sht_15.columns = cols
sht_15.fillna(0, inplace=True)
sht_15

### 2020 Parliamentary Election

In [None]:
# Column headers for the 2020 results sheet: six district-level fields followed
# by a (votes, vote share) pair for each party.
id_cols = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'TOTAL_NUMBER_OF_VOTES_CAST',
    'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES',
]
parties = [
    'P.N.M.', 'U.N.C.', 'P.E.P.', 'C.O.P.', 'I.L.P.', 'D.P.T.',
    'I.N.D.', 'M.N.D.', 'M.S.J.', 'N.C.T.', 'N.N.V.', 'N.O.W.',
    'O.T.V.', 'P.D.P.', 'P.P.', 'T.D.F.', 'T.H.C.', 'T.N.P.',
    'U.P.P.', 'U.T.P.',
]

cols = list(id_cols)
for party in parties:
    # The I.N.D vote-count label is spelled without the final dot.
    votes_label = 'I.N.D_VOTES' if party == 'I.N.D.' else f'{party}_VOTES'
    cols.extend([votes_label, f'{party}_VOTES_%'])

# sht_20 is another name for sheet_4 (no copy is made), so the new headers and
# the NaN -> 0 fill apply to that same frame.
sht_20 = sheet_4
sht_20.columns = cols
sht_20.fillna(0, inplace=True)
sht_20

In [None]:
# Column headers for the 2025 sheet: six district-level fields followed by a
# (votes, vote share) pair for each listed party.
id_cols = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'TOTAL_NUMBER_OF_VOTES_CAST',
    'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES',
]
parties = ['P.N.M.', 'U.N.C.', 'P.E.P.', 'C.O.P.', 'M.N.D.', 'P.D.P.']
cols = id_cols + [
    label
    for party in parties
    for label in (f'{party}_VOTES', f'{party}_VOTES_%')
]

# Work on a copy so sheet_5 itself keeps its original headers.
sht_25 = sheet_5.copy()
sht_25.columns = cols

# Everything except the district name and electorate size is zeroed out.
kept_as_is = ('ELECTORAL_DISTRICT', 'ELECTORATE')
columns_to_fill = [name for name in sht_25.columns if name not in kept_as_is]
sht_25[columns_to_fill] = 0

# Display the updated DataFrame
sht_25

## Debt to GDP cleaning

In [None]:
df_debt_annual

In [None]:
df_debt_annual.info()

In [None]:
df_debt_annual["Date"] = df_debt_annual["Date"].astype("int64")
df_debt_annual.info()

In [None]:
# Replace the "n.d" text marker with 0 in the three public-sector ratio
# columns, then cast those columns to float.
# NOTE(review): the zeros stand in for "no data" entries rather than real
# ratios - confirm downstream steps treat them accordingly.
text_ratio_cols = [
    "Contingent Liabilities Debt to GDP Ratio (%)",
    "Gross Public Sector Debt to GDP Ratio (%)",
    "Net Public Sector Debt to GDP Ratio (%)",
]

df_debt_annual[text_ratio_cols] = df_debt_annual[text_ratio_cols].replace("n.d", 0)
df_debt_annual[text_ratio_cols] = df_debt_annual[text_ratio_cols].astype(float)

In [None]:
df_debt_annual.info()

In [None]:
df_debt_annual.describe()

In [None]:
df_debt_annual.isna().sum()

In [None]:
df_debt_annual.isna().any()

The data for debt to gdp looks cleaned enough. 

## GDP

Source: [CSO](https://cso.gov.tt/subjects/national-accounts/)

In [None]:
df_gdp = pd.read_csv("csvs/tt_gdp.csv")

In [None]:
df_gdp.info()

In [None]:
df_gdp["date"] = df_gdp["date"].astype("int64")
df_gdp.info()

In [None]:
df_gdp.isna().sum()

In [None]:
df_gdp.isna().all()

In [None]:
df_gdp.describe()

## Inflation

Source: [Macrotrends](https://www.macrotrends.net/global-metrics/countries/TTO/trinidad-and-tobago/inflation-rate-cpi)

In [None]:
df_inflation

In [None]:
df_inflation.info()

In [None]:
df_inflation["date"] = df_inflation['date'].astype("int64")
df_inflation.info()

In [None]:
df_inflation.isna().sum()

In [None]:
df_inflation.isna().all()

The commented code is for inflation extracted from the Central Bank of Trinidad and Tobago (CBTT) and is more concise

In [None]:
# df_inflation_annual.info()

In [None]:
# df_inflation_annual.describe()

In [None]:
# df_inflation_annual.isna().sum()

## Unemployment Values cleaning

In [None]:
# Load the cached unemployment CSV.
df_unemployment = pd.read_csv("csvs/tt_unemployment_rate.csv")

# Round each value to two decimals, then give the column a descriptive name.
rounded_rates = df_unemployment['value'].map(lambda rate: round(rate, 2))
df_unemployment['value'] = rounded_rates
df_unemployment = df_unemployment.rename(columns={'value': 'Unemployment Rate (%)'})
df_unemployment

In [None]:
df_unemployment = df_unemployment.drop(columns={"unit", "obs_status", "decimal"})
df_unemployment

In [None]:
df_unemployment.info()

In [None]:
df_unemployment["date"] = df_unemployment['date'].astype("int64")
df_unemployment.info()

In [None]:
df_unemployment.isna().sum()

Since we are only looking at 2000/2002 onward we can drop all the empty rows since they are not part of our dataset

In [None]:
df_unemployment.dropna(subset=['Unemployment Rate (%)'], inplace=True)
df_unemployment.isna().sum()

In [None]:
df_unemployment.isna().all()

In [None]:
df_unemployment

In [None]:
df_unemployment.describe()

The commented code is for unemployment data extracted from the Central Bank of Trinidad and Tobago (CBTT) and is more concise.

In [None]:
# df_unemployment_annual.info()

In [None]:
# df_unemployment_annual.describe()

In [None]:
# df_unemployment_annual.isna().sum()

In [None]:
# df_unemployment_annual.isna().any()

## Government Expenditure

In [None]:
df_expenditure.info()

In [None]:
df_expenditure['Date'] = df_expenditure['Date'].astype("int64")
df_expenditure.info()

In [None]:
df_expenditure.isna().sum()

In [None]:
df_expenditure.isna().all()

Given that the only important missing values are the 2020 and 2000 expenditure values, we can source those two and drop the other rows that are empty.

## Homicide Rate

In [None]:
# Load the national homicide-rate table and rebuild its 'Annual_change' column
# from the rate itself.
df_homicide_rate = pd.read_csv(r"csvs/tt_homicide_rate.csv") # most rates come from macrotrends; some recent values were sourced from statista.com (the original note said "2023 and 2023" - TODO confirm which years)

# Clean the 'Annual_change' text: replace blank (' ') entries, strip the
# trailing '%' and cast the column to float.
df_homicide_rate['Annual_change'] = df_homicide_rate["Annual_change"].replace(to_replace=' ', value=0)
df_homicide_rate['Annual_change'] = df_homicide_rate["Annual_change"].str.rstrip('%').astype('float')
# Remember the last row (in file order) so it can be written back below.
# NOTE(review): presumably this is the oldest year, whose recomputed change
# would otherwise be null - confirm the CSV is ordered newest-first.
last_homicide_row = df_homicide_rate.iloc[-1:]

# Recompute 'Annual_change' as the percentage change of the rate from one row
# to the next with years ascending, rounded to 2 decimals, then restore
# newest-first order. pct_change leaves the first row of the ascending frame
# null.
df_homicide_rate= df_homicide_rate.sort_values(by='Year')
df_homicide_rate['Annual_change'] = (df_homicide_rate['Rate_per_100k_Population'].pct_change()* 100).round(2) 
df_homicide_rate= df_homicide_rate.sort_values(by='Year', ascending=False)

# Write the remembered row back over the final row of the re-sorted frame.
df_homicide_rate.iloc[-1:] = last_homicide_row

df_homicide_rate

In [None]:
# Load per-division murder counts and attach them to the national homicide
# rates by year. The outer join keeps years present in either table.
murders_by_division = pd.read_csv(r'csvs/tt_murders_by_division_2013_2024.csv')
murders_by_division['Year'] = murders_by_division['Year'].astype('int64')
df_homicide_by_div = df_homicide_rate.merge(murders_by_division, on='Year', how='outer')


## Constituency/Division Specific Homicide Information
In case accuracy is low, this will be employed for the SVM.

In [None]:
# #melting the df
df_long_homicide_by_div = df_homicide_by_div.melt(id_vars=['Year', 'Rate_per_100k_Population', 'Annual_change','Total_murders_per_year'], 
                  var_name='Division', 
                  value_name='Murders')

In [None]:
def murder_estimate(division, df_long):
    """Back-project yearly murder counts for one division from 2012 to 2000.

    Starting from the division's 2013 ``Murders`` value, each earlier year is
    estimated as the following year's estimate divided by
    ``1 + Annual_change / 100``, where ``Annual_change`` is read from the
    following year's row for that division. A year is skipped when the
    division has no row for the following year.

    Args:
        division: Value of the ``Division`` column to process.
        df_long: Long-format frame with ``Year``, ``Division``, ``Murders``
            and ``Annual_change`` columns.

    Returns:
        The frame with the estimates added. Rows that exist with a missing
        ``Murders`` value are filled in place on ``df_long``; years with no
        row get a new row appended, which yields a new frame, so callers
        should use the return value. Rows that already hold a count are left
        untouched.

    Raises:
        IndexError: If the division has no row for 2013.
    """
    division_data = df_long[df_long['Division'] == division].copy()

    # Anchor of the back-projection: the division's 2013 count.
    later_year_murders = division_data[division_data['Year'] == 2013]['Murders'].values[0]

    for year in range(2012, 1999, -1):
        try:
            pct_change = division_data[division_data['Year'] == year + 1]['Annual_change'].values[0]
        except IndexError:
            # No row for the following year, so there is no change to invert.
            continue

        projected_murders = (later_year_murders / (1 + pct_change / 100)).round()

        mask = (df_long['Year'] == year) & (df_long['Division'] == division)
        existing = df_long.loc[mask, 'Murders']
        if existing.empty:
            # No row for this (year, division) yet: append one.
            new_row = {
                'Year': year,
                'Division': division,
                'Rate_per_100k_Population': np.nan,
                'Annual_change': np.nan,
                'Murders': projected_murders,
            }
            df_long = pd.concat([df_long, pd.DataFrame([new_row])], ignore_index=True)
        elif existing.isna().any():
            df_long.loc[mask, 'Murders'] = projected_murders
        # Otherwise a count is already recorded. It is kept as-is; the
        # original appended a duplicate (year, division) row here.

        # The next (earlier) year is projected from this year's estimate.
        later_year_murders = projected_murders

    return df_long

In [None]:
divisions = df_long_homicide_by_div['Division'].dropna().unique()

for division in divisions:
    df_long_homicide_by_div = murder_estimate(division, df_long_homicide_by_div)

In [None]:
# Sanity check of the estimates against a published breakdown:
# "When murder data are restricted to 2001-2013, the
# largest proportion of murders in Trinidad and Tobago took place in the Port of Spain Division
# (26.7 per cent). This was followed by the Northern Division (19.9 per cent), North Eastern
# Division (13.3 per cent), and Western Division (12.3 per cent)."

# Restrict to 2001-2013 inclusive, matching the quoted window.
in_window = df_long_homicide_by_div['Year'].between(2001, 2013)
subset = df_long_homicide_by_div[in_window]

# Share of all murders in the window attributed to each division, largest first.
total_by_division = subset.groupby('Division')['Murders'].sum()
grand_total = total_by_division.sum()
division_percentages = ((total_by_division / grand_total) * 100).sort_values(ascending=False)
print(division_percentages)



From: 1. Crime-Trinidad and Tobago. 2. Crime prevention-Trinidad and Tobago. 3. Violence-Trinidad and Tobago. 4. Violence-Prevention-Trinidad and Tobago. I. Inter-American
Development Bank. Country Department Caribbean Group. II. Title. III. Series.
IDB-TN-1062 

"When murder data are restricted to 2001-2013, the largest proportion of murders in Trinidad and Tobago took place in the Port of Spain Division (26.7 per cent). This was followed by the Northern Division (19.9 per cent), North Eastern Division (13.3 per cent), and Western Division (12.3 per cent)."

The difference in percentage in Port-Of-Spain can be accounted for due to the 2013 murder count starting off at 122 murders and slightly dropping. 
 

## Migration Rates

In [None]:
df_migration_rate = pd.read_csv("csvs/tt_migration_rate.csv")

In [None]:
df_migration_rate.info()

In [None]:
df_migration_rate['Year'] = df_migration_rate['Year'].astype('int64')
df_migration_rate.info()

In [None]:
# Turn the "Growth Rate" text (e.g. values ending in '%') into floats rounded
# to four decimals.
growth_as_text = df_migration_rate["Growth Rate"].str.rstrip('%')
df_migration_rate["Growth Rate"] = growth_as_text.astype('float').round(4)

# Only "Growth Rate" gets a new label; "Year" and "Net Migration Rate" keep
# their names.
df_migration_rate = df_migration_rate.rename(
    columns={"Growth Rate": "Growth Rate(in percent)"}
)

df_migration_rate

In [None]:
df_migration_rate.isna().sum()

In [None]:
df_migration_rate.isna().any()

In [None]:
df_migration_rate.describe()

# Putting Together Datasets for model

In [None]:
# Give every yearly frame the same "Year" key before joining. The migration and
# homicide frames already use "Year", so they need no rename.
year_column_by_frame = [
    (df_debt_annual, "Date"),
    (df_gdp, "date"),
    (df_inflation, "date"),
    (df_unemployment, "date"),
    (df_expenditure, "Date"),
]
for frame, year_column in year_column_by_frame:
    frame.rename(columns={year_column: "Year"}, inplace=True)

# Outer-join the frames one after another on "Year", starting from the debt
# table; the join order determines the _x/_y suffixes of clashing columns.
dataset = df_debt_annual
for frame in (
    df_gdp,
    df_inflation,
    df_unemployment,
    df_expenditure,
    df_migration_rate,
    df_homicide_rate,
):
    dataset = dataset.merge(frame, on="Year", how="outer")

# Display the merged dataframe
dataset

In [None]:
# Give the suffixed columns produced by the merge descriptive names; labels
# that are not present in the frame are ignored by rename.
# NOTE(review): the merge order is debt -> GDP -> inflation and the debt frame
# is read without a "value" column, so "value_x"/"value_y" look like the GDP
# and inflation series rather than debt and GDP - confirm these labels.
descriptive_names = {
    'id_x': 'Debt_ID',
    'id_y': 'GDP_ID',
    'value_x': 'Debt_Value',
    'value_y': 'GDP_Value',
    'value': 'Inflation_Value',
}
dataset.rename(columns=descriptive_names, inplace=True)

# Display the updated dataset
dataset.head()

Dropping all the rows and columns we do not need. In this instance values that are after 2020 and before 2000

In [None]:
# Keep only the years that have both a net public-sector debt ratio and a
# homicide rate.
required_columns = [
    'Net Public Sector Debt to GDP Ratio (%)',
    'Rate_per_100k_Population',
]
dataset.dropna(subset=required_columns, inplace=True)

# Drop columns that are not needed.
unused_columns = ["unit_x", "obs_status_x", "unit_y", "country_y", "country_x"]
dataset.drop(columns=unused_columns, inplace=True)

dataset

In [None]:
dataset.info()

In [None]:
# Print the pairwise correlation matrix. `corr` has to be called: printing the
# bare attribute only shows the bound method. Non-numeric columns are removed
# first because they cannot be correlated.
print(dataset.select_dtypes(include="number").corr())

# Determining values for Model

Compiling Unemployment Average ratio from 2007 to 2010, 2010 to 2015 and 2015 to 2020

In [None]:
# Average unemployment rate for each inter-election period. The first period
# includes 2007; later periods start the year after the previous election.
# The averages are taken over 'Unemployment Rate (%)'. The original cell
# averaged 'Annual_change', which is the homicide-rate change column.
unemployment_col = 'Unemployment Rate (%)'
unemployment_avg_07_10 = dataset[(dataset['Year'] >= 2007) & (dataset['Year'] <= 2010)][unemployment_col].mean()
unemployment_avg_10_15 = dataset[(dataset['Year'] > 2010) & (dataset['Year'] <= 2015)][unemployment_col].mean()
unemployment_avg_15_20 = dataset[(dataset['Year'] > 2015) & (dataset['Year'] <= 2020)][unemployment_col].mean()
unemployment_avg_20_25 = dataset[(dataset['Year'] > 2020) & (dataset['Year'] <= 2024)][unemployment_col].mean()

# One single-row frame per period.
df_unemp_avg_07_10 = pd.DataFrame({'Period': ['2007-2010'], 'Average Unemployment Rate (%)': [unemployment_avg_07_10]})
df_unemp_avg_10_15 = pd.DataFrame({'Period': ['2010-2015'], 'Average Unemployment Rate (%)': [unemployment_avg_10_15]})
df_unemp_avg_15_20 = pd.DataFrame({'Period': ['2015-2020'], 'Average Unemployment Rate (%)': [unemployment_avg_15_20]})
df_unemp_avg_20_25 = pd.DataFrame({'Period': ['2020-2024'], 'Average Unemployment Rate (%)': [unemployment_avg_20_25]})

print(df_unemp_avg_07_10)
print(df_unemp_avg_10_15)
print(df_unemp_avg_15_20)
print(df_unemp_avg_20_25)

In [None]:
# Average inflation rate for each inter-election period. The first period
# includes 2007; later periods start the year after the previous election.
# The original cell averaged 'Annual_change', which is the homicide-rate change
# column. The inflation series is read from df_inflation directly: its 'value'
# column holds the FP.CPI.TOTL.ZG records, whereas the merged dataset carries
# several suffixed 'value' columns.
inflation_by_year = df_inflation[['Year', 'value']]
inflation_avg_07_10 = inflation_by_year[(inflation_by_year['Year'] >= 2007) & (inflation_by_year['Year'] <= 2010)]['value'].mean()
inflation_avg_10_15 = inflation_by_year[(inflation_by_year['Year'] > 2010) & (inflation_by_year['Year'] <= 2015)]['value'].mean()
inflation_avg_15_20 = inflation_by_year[(inflation_by_year['Year'] > 2015) & (inflation_by_year['Year'] <= 2020)]['value'].mean()
inflation_avg_20_25 = inflation_by_year[(inflation_by_year['Year'] > 2020) & (inflation_by_year['Year'] <= 2024)]['value'].mean()

# One single-row frame per period.
df_inf_avg_07_10 = pd.DataFrame({'Period': ['2007-2010'], 'Average Inflation Rate (%)': [inflation_avg_07_10]})
df_inf_avg_10_15 = pd.DataFrame({'Period': ['2010-2015'], 'Average Inflation Rate (%)': [inflation_avg_10_15]})
df_inf_avg_15_20 = pd.DataFrame({'Period': ['2015-2020'], 'Average Inflation Rate (%)': [inflation_avg_15_20]})
df_inf_avg_20_25 = pd.DataFrame({'Period': ['2020-2024'], 'Average Inflation Rate (%)': [inflation_avg_20_25]})

print(df_inf_avg_07_10)
print(df_inf_avg_10_15)
print(df_inf_avg_15_20)
print(df_inf_avg_20_25)

## Electoral Elasticity Calculations 

Our electoral elasticity variable is determined by a series of mathematical formulae outlined in this <a href = "Data/Election_Forecasting_Using_Macroeconomic.pdf"> Research Paper. </a>

In [None]:
# Keep only the district name and the P.N.M., U.N.C. and C.O.P. vote-share
# columns from each election's results, ready for comparing elections.
share_cols = ['ELECTORAL_DISTRICT', 'P.N.M._VOTES_%', 'U.N.C._VOTES_%', 'C.O.P._VOTES_%']
sht_07_share = sht_07[share_cols]
sht_10_share = sht_10[share_cols]
sht_15_share = sht_15[share_cols]
sht_20_share = sht_20[share_cols]



Calculate differences in vote percentages

In [None]:
def _vote_share_difference(earlier_share, later_share):
    """Return earlier-minus-later vote shares per district, with *_CHANGE names."""
    earlier = earlier_share.set_index('ELECTORAL_DISTRICT')
    later = later_share.set_index('ELECTORAL_DISTRICT')
    # Rows are matched on the district name, not on row position.
    difference = earlier.sub(later, axis=0).reset_index()
    return difference.rename(columns={
        'P.N.M._VOTES_%': 'P.N.M._VOTES_%_CHANGE',
        'U.N.C._VOTES_%': 'U.N.C._VOTES_%_CHANGE',
        'C.O.P._VOTES_%': 'C.O.P._VOTES_%_CHANGE',
    })


# Difference in vote share between consecutive elections.
# NOTE(review): each value is the earlier election minus the later one, so a
# party that gained share shows a negative "CHANGE" - confirm this sign is the
# one the elasticity formulae expect.
diff_1 = _vote_share_difference(sht_07_share, sht_10_share)
diff_2 = _vote_share_difference(sht_10_share, sht_15_share)
diff_3 = _vote_share_difference(sht_15_share, sht_20_share)




Output of Elasticity Change in  Percentage

In [None]:
print(diff_1)


In [None]:
print(diff_2)

In [None]:
print(diff_3)

## Creation of master dataframe and model training follows

The master DataFrame consolidates various social, electoral and economic data for Trinidad and Tobago over a range of years. It is designed to provide a comprehensive dataset for analysis and modeling. Below is a breakdown of the key components and columns in the DataFrame:

### Columns:
1. **Year**: The year corresponding to the data.
2. **GDP (Per Capita US$)**: The Gross Domestic Product per capita in US dollars.
3. **GNI (Per Capita US $)**: The Gross National Income per capita in US dollars.
4. **GNI (Percentage Change)**: The year-over-year percentage change in GNI.
5. **Homicide_rate_per_100k**: The homicide rate per 100,000 people.
6. **Inflation**: Rate of change in prices.
7. **Unemployment**: Rate of unemployment.
8. **Net Migration**: The difference between those leaving and those coming to reside in Trinidad and Tobago.

In [None]:
socioeconomic_df = pd.read_csv(r"csvs/socio-economic-indicators-collated copy.csv")
socioeconomic_df.columns = socioeconomic_df.columns.str.strip()

In [None]:
# Load the cached GDP series and reduce it to a (Year, GDP_(US$)) table.
tt_gdp_df = pd.read_csv(r"csvs/tt_gdp.csv")
tt_gdp_df['date'] = tt_gdp_df['date'].astype("int64")

tt_gdp_df = tt_gdp_df.rename(columns={"date": "Year", "value": "GDP_(US$)"})

metadata_columns = [
    "country", "countryiso3code", "unit", "indicator",
    "obs_status", "decimal", "Unnamed: 0",
]
tt_gdp_df = tt_gdp_df.drop(columns=metadata_columns)

# only keeping years 2000-2024 (both ends included)
within_range = (tt_gdp_df['Year'] >= 2000) & (tt_gdp_df['Year'] <= 2024)
tt_gdp_df = tt_gdp_df[within_range]

In [None]:
socioeconomic_df = pd.merge(socioeconomic_df, tt_gdp_df, on='Year', how='outer')

In [None]:

# Remove the thousands separators so the per-capita GDP text parses as a number.
gdp_per_capita_tt = socioeconomic_df["GDP (Per Capita TT$)"].str.replace(',', '').astype(float)

# Convert to US$ by dividing by a fixed 6.8, round to cents, and relabel the
# column to match its new unit.
socioeconomic_df["GDP (Per Capita TT$)"] = (gdp_per_capita_tt / 6.8).round(2)
socioeconomic_df = socioeconomic_df.rename(
    columns={"GDP (Per Capita TT$)": "GDP (Per Capita US$)"}
)
# Display the updated dataframe
socioeconomic_df

In [None]:
socioeconomic_df.to_csv(r"csvs/socioeconomic_df_gdp_added.csv", index=False)

In [None]:
def calculate_one_year_growth(df, column_name):
    """Add ``<column_name>_1y``: the log of each value over the previous row's value.

    The new column is written onto ``df`` itself, the frame is printed, and
    the same frame is returned. The first row has no predecessor, so its
    value is NaN.
    """
    current = df[column_name]
    previous = current.shift(1)
    growth_label = f'{column_name}_1y'
    df[growth_label] = np.log(current / previous)
    print(df)
    return df

In [None]:
def calculate_one_term_growth(df, column_name, election_year_gap):
    """Add the log growth of ``column_name`` over ``election_year_gap`` rows.

    Each value is compared with the value ``election_year_gap`` rows earlier
    and the log of the ratio is stored in ``<column_name>_1t``. The frame is
    modified in place, printed, and the same object is returned. The first
    ``election_year_gap`` rows have no baseline and therefore get NaN.
    """
    current = df[column_name]
    baseline = current.shift(election_year_gap)
    df[f'{column_name}_1t'] = np.log(current / baseline)
    print(df)
    return df

In [None]:
# Lookup table read from district_division.csv.
# NOTE(review): presumably maps each electoral district to a division — confirm against the file.
div_elec_df = pd.read_csv("csvs/district_division.csv")

In [None]:
# Log growth in 'Murders' over a 7-row lag, computed on a private copy of
# the long-format homicide table.
# NOTE(review): the lag is applied to the whole frame without grouping; this
# assumes the rows are ordered so that 7 rows back is the same division's
# baseline year — confirm.
homicide_term_growth = calculate_one_term_growth(df_long_homicide_by_div.copy(), 'Murders', 7)

# Keep only the 2007 rows and discard the columns not needed downstream.
homicide_07 = homicide_term_growth[homicide_term_growth['Year'].eq(2007)]
homicide_07 = homicide_07.drop(
    columns=["Rate_per_100k_Population", "Annual_change", "Total_murders_per_year", "Year"]
)

In [None]:
# Log growth in 'Murders' over a 10-row lag, computed on a private copy of
# the long-format homicide table.
# NOTE(review): the lag is applied to the whole frame without grouping; this
# assumes the rows are ordered so that 10 rows back is the same division's
# baseline year — confirm.
homicide_term_growth = calculate_one_term_growth(df_long_homicide_by_div.copy(), 'Murders', 10)

# Keep only the 2010 rows and discard the columns not needed downstream.
homicide_10 = homicide_term_growth[homicide_term_growth['Year'].eq(2010)]
homicide_10 = homicide_10.drop(
    columns=["Rate_per_100k_Population", "Annual_change", "Total_murders_per_year", "Year"]
)

In [None]:
# Log growth in 'Murders' over a 15-row lag, computed on a private copy of
# the long-format homicide table.
# NOTE(review): the lag is applied to the whole frame without grouping; this
# assumes the rows are ordered so that 15 rows back is the same division's
# baseline year — confirm.
homicide_term_growth = calculate_one_term_growth(df_long_homicide_by_div.copy(), 'Murders', 15)

# Keep only the 2015 rows and discard the columns not needed downstream.
homicide_15 = homicide_term_growth[homicide_term_growth['Year'].eq(2015)]
homicide_15 = homicide_15.drop(
    columns=["Rate_per_100k_Population", "Annual_change", "Total_murders_per_year", "Year"]
)

In [None]:
# Log growth in 'Murders' over a 20-row lag, computed on a private copy of
# the long-format homicide table.
# NOTE(review): the lag is applied to the whole frame without grouping; this
# assumes the rows are ordered so that 20 rows back is the same division's
# baseline year — confirm.
homicide_term_growth = calculate_one_term_growth(df_long_homicide_by_div.copy(), 'Murders', 20)

# Keep only the 2020 rows and discard the columns not needed downstream.
homicide_20 = homicide_term_growth[homicide_term_growth['Year'].eq(2020)]
homicide_20 = homicide_20.drop(
    columns=["Rate_per_100k_Population", "Annual_change", "Total_murders_per_year", "Year"]
)

In [None]:
# Log growth in 'Murders' over a 24-row lag, computed on a private copy of
# the long-format homicide table. The 2025 election is represented by the
# rows tagged Year == 2024.
# NOTE(review): the lag is applied to the whole frame without grouping; this
# assumes the rows are ordered so that 24 rows back is the same division's
# baseline year — confirm.
homicide_term_growth = calculate_one_term_growth(df_long_homicide_by_div.copy(), 'Murders', 24)

# Keep only the 2024 rows and discard the columns not needed downstream.
homicide_25 = homicide_term_growth[homicide_term_growth['Year'].eq(2024)]
homicide_25 = homicide_25.drop(
    columns=["Rate_per_100k_Population", "Annual_change", "Total_murders_per_year", "Year"]
)

In [None]:
# Tag the 2007 election sheet so it can be joined to the indicators by Year.
sht_07['Year'] = 2007

# Work on a private copy. The growth helpers write their result columns into
# the frame they receive, so without the copy every soc_info_* name would
# refer to the same shared frame and the '_1t' columns computed here would
# be overwritten by the cells for later elections.
soc_info_07 = socioeconomic_df.copy()

# For each indicator add '<name>_1t' (log growth over a 7-row lag) and
# '<name>_1y' (log growth over a 1-row lag).
# NOTE(review): the 7-row lag assumes one row per year, sorted by Year — confirm.
for indicator in (
    'GNI (Per Capita US $)',
    'Inflation',
    'Unemployment',
    'Net Migration Rate',
    'GDP (Per Capita US$)',
    'GDP_(US$)',
):
    soc_info_07 = calculate_one_term_growth(soc_info_07, indicator, 7)
    soc_info_07 = calculate_one_year_growth(soc_info_07, indicator)

# Only the election-year row is joined to the constituency results.
soc_07 = soc_info_07[soc_info_07['Year'] == 2007]


In [None]:
# Tag the 2010 election sheet so it can be joined to the indicators by Year.
sht_10['Year'] = 2010

# Work on a private copy. The growth helpers write their result columns into
# the frame they receive, so without the copy every soc_info_* name would
# refer to the same shared frame and its '_1t' columns would be overwritten
# by whichever election cell ran last.
soc_info_10 = socioeconomic_df.copy()

# For each indicator add '<name>_1t' (log growth over a 10-row lag) and
# '<name>_1y' (log growth over a 1-row lag).
# NOTE(review): the 10-row lag assumes one row per year, sorted by Year — confirm.
for indicator in (
    'GNI (Per Capita US $)',
    'Inflation',
    'Unemployment',
    'Net Migration Rate',
    'GDP (Per Capita US$)',
    'GDP_(US$)',
):
    soc_info_10 = calculate_one_term_growth(soc_info_10, indicator, 10)
    soc_info_10 = calculate_one_year_growth(soc_info_10, indicator)

# Only the election-year row is joined to the constituency results.
soc_10 = soc_info_10[soc_info_10['Year'] == 2010]


In [None]:
# Tag the 2015 election sheet so it can be joined to the indicators by Year.
sht_15['Year'] = 2015

# Work on a private copy. The growth helpers write their result columns into
# the frame they receive, so without the copy every soc_info_* name would
# refer to the same shared frame and its '_1t' columns would be overwritten
# by whichever election cell ran last.
soc_info_15 = socioeconomic_df.copy()

# For each indicator add '<name>_1t' (log growth over a 15-row lag) and
# '<name>_1y' (log growth over a 1-row lag).
# NOTE(review): the 15-row lag assumes one row per year, sorted by Year — confirm.
for indicator in (
    'GNI (Per Capita US $)',
    'Inflation',
    'Unemployment',
    'Net Migration Rate',
    'GDP (Per Capita US$)',
    'GDP_(US$)',
):
    soc_info_15 = calculate_one_term_growth(soc_info_15, indicator, 15)
    soc_info_15 = calculate_one_year_growth(soc_info_15, indicator)

# Only the election-year row is joined to the constituency results.
soc_15 = soc_info_15[soc_info_15['Year'] == 2015]

In [None]:
# Tag the 2020 election sheet so it can be joined to the indicators by Year.
sht_20['Year'] = 2020

# Work on a private copy. The growth helpers write their result columns into
# the frame they receive, so without the copy every soc_info_* name would
# refer to the same shared frame and its '_1t' columns would be overwritten
# by whichever election cell ran last.
soc_info_20 = socioeconomic_df.copy()

# For each indicator add '<name>_1t' (log growth over a 20-row lag) and
# '<name>_1y' (log growth over a 1-row lag).
# NOTE(review): the 20-row lag assumes one row per year, sorted by Year — confirm.
for indicator in (
    'GNI (Per Capita US $)',
    'Inflation',
    'Unemployment',
    'Net Migration Rate',
    'GDP (Per Capita US$)',
    'GDP_(US$)',
):
    soc_info_20 = calculate_one_term_growth(soc_info_20, indicator, 20)
    soc_info_20 = calculate_one_year_growth(soc_info_20, indicator)

# Only the election-year row is joined to the constituency results.
soc_20 = soc_info_20[soc_info_20['Year'] == 2020]

In [None]:
# The 2025 sheet is tagged with Year 2024 so it joins to the 2024 indicator row.
sht_25['Year'] = 2024

# Work on a private copy. The growth helpers write their result columns into
# the frame they receive, so without the copy every soc_info_* name would
# refer to the same shared frame and the '_1t' columns of the earlier
# elections would be overwritten by the values computed here.
soc_info_25 = socioeconomic_df.copy()

# For each indicator add '<name>_1t' (log growth over a 24-row lag) and
# '<name>_1y' (log growth over a 1-row lag).
# NOTE(review): the 24-row lag assumes one row per year, sorted by Year — confirm.
for indicator in (
    'GNI (Per Capita US $)',
    'Inflation',
    'Unemployment',
    'Net Migration Rate',
    'GDP (Per Capita US$)',
    'GDP_(US$)',
):
    soc_info_25 = calculate_one_term_growth(soc_info_25, indicator, 24)
    soc_info_25 = calculate_one_year_growth(soc_info_25, indicator)

# Only the 2024 row is joined to the constituency sheet.
soc_25 = soc_info_25[soc_info_25['Year'] == 2024]

In [None]:
# Columns kept from every election sheet before encoding.
election_columns = [
    'ELECTORAL_DISTRICT', 'ELECTORATE', 'P.N.M._VOTES_%', 'U.N.C._VOTES_%',
    'VOTER_TURNOUT', 'TOTAL_NUMBER_OF_VOTES_CAST', 'VALID_VOTES', 'Year',
]


def _encode_districts(sheet):
    """Keep the modelling columns and one-hot encode ELECTORAL_DISTRICT."""
    return pd.get_dummies(
        sheet[election_columns],
        columns=['ELECTORAL_DISTRICT'],
        prefix='ELECTORAL_DISTRICT',
    )


# For each election: encode the sheet, then attach that year's indicator row
# to every constituency via an inner join on Year.
# (The joins against div_elec_df and the homicide_* tables are disabled.)
sht_07 = _encode_districts(sht_07)
merged_07 = sht_07.merge(soc_07, on='Year', how='inner')

sht_10 = _encode_districts(sht_10)
merged_10 = sht_10.merge(soc_10, on='Year', how='inner')

sht_15 = _encode_districts(sht_15)
merged_15 = sht_15.merge(soc_15, on='Year', how='inner')

sht_20 = _encode_districts(sht_20)
merged_20 = sht_20.merge(soc_20, on='Year', how='inner')

sht_25 = _encode_districts(sht_25)
merged_25 = sht_25.merge(soc_25, on='Year', how='inner')

# Stack the five elections into one table with a fresh 0..n-1 index.
master_df = pd.concat(
    [merged_07, merged_10, merged_15, merged_20, merged_25],
    ignore_index=True,
)

# 'Growth Rate' is stored as text with a trailing '%'; convert it to float.
master_df['Growth Rate'] = master_df['Growth Rate'].str.rstrip('%').astype('float')

# Every remaining missing value becomes 0.
master_df = master_df.fillna(0)

In [None]:
# Save the assembled table (row index not written), then print its schema
# so the column dtypes can be checked.
master_df.to_csv("csvs/master_df.csv", index=False)
master_df.info()

In [None]:
# Print the column names so they can be copied into feature lists.
print(list(soc_info_07.columns))

## Correlation Matrix

- The correlation matrix below graphically illustrates the relationship between key economic indicators and voting behaviour

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict to the years up to and including 2020.
filtered_df = soc_info_07[soc_info_07['Year'] <= 2020]

# Keep numeric columns only and drop every column that contains a missing
# value, so all correlations are computed over the same set of rows.
numeric_df = filtered_df.select_dtypes(include='number').dropna(axis=1)

# Pairwise correlation between the remaining indicators.
corr_matrix = numeric_df.corr()

# Clustering needs finite values: undefined correlations (e.g. for a
# constant column) and any infinities are replaced by 0.
corr_matrix = corr_matrix.fillna(0)
corr_matrix = corr_matrix.replace([float('inf'), -float('inf')], 0)

# clustermap creates and sizes its own figure, so no plt.figure() call is
# made beforehand (that would only leave an extra, empty figure behind).
sns.clustermap(corr_matrix, annot=True, cmap='coolwarm', figsize=(10, 10), linewidths=0.5)
plt.suptitle("Clustered Correlation Matrix", y=1.02)
plt.show()
numeric_df

In [None]:
# Constituency labels, one per row of a single election block of master_df,
# ending with the 'TOTAL' row.
constituency_list = [
    'ARIMA',
    'AROUCA/MALONEY',
    'BARATARIA/SAN JUAN',
    'CARONI CENTRAL',
    'CARONI EAST',
    'CHAGUANAS EAST',
    'CHAGUANAS WEST',
    'COUVA NORTH',
    'COUVA SOUTH',
    'CUMUTO/MANZANILLA',
    "D'ABADIE/O'MEARA",
    'DIEGO MARTIN CENTRAL',
    'DIEGO MARTIN NORTH/EAST',
    'DIEGO MARTIN WEST',
    'FYZABAD',
    'LA BREA',
    'LA HORQUETTA/TALPARO',
    'LAVENTILLE EAST/MORVANT',
    'LAVENTILLE WEST',
    'LOPINOT/BON AIR WEST',
    'MAYARO',
    'MORUGA/TABLELAND',
    'NAPARIMA',
    'OROPOUCHE EAST',
    'OROPOUCHE WEST',
    'POINT FORTIN',
    'POINTE-A-PIERRE',
    "PORT-OF-SPAIN NORTH/ST. ANN'S WEST",
    'PORT-OF-SPAIN SOUTH',
    'PRINCES TOWN',
    'SAN FERNANDO EAST',
    'SAN FERNANDO WEST',
    'SIPARIA',
    "ST. ANN'S EAST",
    'ST. AUGUSTINE',
    'ST. JOSEPH',
    'TABAQUITE',
    'TOBAGO EAST',
    'TOBAGO WEST',
    'TOCO/SANGRE GRANDE',
    'TUNAPUNA',
    'TOTAL',
]

# One copy of the list per stacked election (five in total). The assignment
# below raises if master_df does not have exactly that many rows.
# NOTE(review): this relies on every election block listing its rows in the
# same order as constituency_list — confirm against the source sheets.
repeated_constituencies = constituency_list * 5
master_df['CONSTITUENCIES'] = repeated_constituencies

# Move the label column to the front, leaving the other columns in order.
cols = ['CONSTITUENCIES', *(col for col in master_df.columns if col != 'CONSTITUENCIES')]
master_df = master_df[cols]
master_df

## Model Training

### Multinomial Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Seats are classified as safe or marginal from the 2020 result only.
# .copy() gives an independent frame so the new columns below are not
# written into a slice of master_df.
df = master_df[master_df['Year'] == 2020].copy()

# One-hot column name of every constituency. Every entry must carry the
# 'ELECTORAL_DISTRICT_' prefix: the safe/marginal lists built below contain
# prefixed column names, so an un-prefixed entry would never match and that
# seat would silently be left out of both mappings.
constituencies = [
    "ELECTORAL_DISTRICT_ARIMA", "ELECTORAL_DISTRICT_AROUCA/MALONEY", "ELECTORAL_DISTRICT_BARATARIA/SAN JUAN",
    "ELECTORAL_DISTRICT_CARONI CENTRAL", "ELECTORAL_DISTRICT_CARONI EAST",
    "ELECTORAL_DISTRICT_CHAGUANAS EAST", "ELECTORAL_DISTRICT_CHAGUANAS WEST", "ELECTORAL_DISTRICT_COUVA NORTH",
    "ELECTORAL_DISTRICT_COUVA SOUTH", "ELECTORAL_DISTRICT_CUMUTO/MANZANILLA",
    "ELECTORAL_DISTRICT_D'ABADIE/O'MEARA", "ELECTORAL_DISTRICT_DIEGO MARTIN CENTRAL",
    "ELECTORAL_DISTRICT_DIEGO MARTIN NORTH/EAST", "ELECTORAL_DISTRICT_DIEGO MARTIN WEST",
    "ELECTORAL_DISTRICT_FYZABAD", "ELECTORAL_DISTRICT_LA BREA", "ELECTORAL_DISTRICT_LA HORQUETTA/TALPARO",
    "ELECTORAL_DISTRICT_LAVENTILLE EAST/MORVANT", "ELECTORAL_DISTRICT_LAVENTILLE WEST",
    "ELECTORAL_DISTRICT_LOPINOT/BON AIR WEST", "ELECTORAL_DISTRICT_MAYARO", "ELECTORAL_DISTRICT_MORUGA/TABLELAND",
    "ELECTORAL_DISTRICT_NAPARIMA", "ELECTORAL_DISTRICT_OROPOUCHE EAST",
    "ELECTORAL_DISTRICT_OROPOUCHE WEST", "ELECTORAL_DISTRICT_POINT FORTIN", "ELECTORAL_DISTRICT_POINTE-A-PIERRE",
    "ELECTORAL_DISTRICT_PORT-OF-SPAIN NORTH/ST. ANN'S WEST",
    "ELECTORAL_DISTRICT_PORT-OF-SPAIN SOUTH", "ELECTORAL_DISTRICT_PRINCES TOWN",
    "ELECTORAL_DISTRICT_SAN FERNANDO EAST", "ELECTORAL_DISTRICT_SAN FERNANDO WEST", "ELECTORAL_DISTRICT_SIPARIA",
    "ELECTORAL_DISTRICT_ST. ANN'S EAST", "ELECTORAL_DISTRICT_ST. AUGUSTINE",
    "ELECTORAL_DISTRICT_ST. JOSEPH", "ELECTORAL_DISTRICT_TABAQUITE", "ELECTORAL_DISTRICT_TOBAGO EAST",
    "ELECTORAL_DISTRICT_TOBAGO WEST", "ELECTORAL_DISTRICT_TOCO/SANGRE GRANDE",
    "ELECTORAL_DISTRICT_TUNAPUNA", "ELECTORAL_DISTRICT_TOTAL"
]

# Step 2: row labels of every row whose Year is NOT 2024.
# NOTE(review): the variable names suggest the 2024 rows were intended, but
# the filter selects the opposite; the existing behaviour is kept — confirm.
df_2025 = master_df[master_df['Year'] != 2024]
indexes_2025 = df_2025.index.tolist()

# Step 3: pair row labels with constituency columns. zip() stops at the
# shorter input, so at most len(constituencies) labels are used.
index_constituency_mapping = dict(zip(indexes_2025, constituencies))

# Winner = the vote-share column with the larger value in each row.
df['Winning_Party'] = df[['P.N.M._VOTES_%', 'U.N.C._VOTES_%']].idxmax(axis=1)

# Margin of victory = absolute gap between the two vote shares.
df['Margin_of_Victory'] = (df['P.N.M._VOTES_%'] - df['U.N.C._VOTES_%']).abs()

# Safe seat if the margin is strictly above the threshold.
# NOTE(review): 0.20 only means "20%" if vote shares are stored as
# fractions between 0 and 1 — confirm.
safe_margin_threshold = 0.20

# Identify safe seats: district columns that are set in at least one safe row.
safe_seats = df[df['Margin_of_Victory'] > safe_margin_threshold]
safe_seat_columns = safe_seats.loc[:, safe_seats.columns.str.startswith('ELECTORAL_DISTRICT_')]
unique_safe_seats = safe_seat_columns.columns[safe_seat_columns.astype(bool).any()].tolist()

# Identify marginal seats. '<=' makes the two classes cover every row, so a
# margin exactly equal to the threshold is not left unclassified.
marginal_seats = df[df['Margin_of_Victory'] <= safe_margin_threshold]
marginal_seat_columns = marginal_seats.loc[:, marginal_seats.columns.str.startswith('ELECTORAL_DISTRICT_')]

seat_columns = marginal_seat_columns.columns[marginal_seat_columns.astype(bool).any()]
unique_marginal_seats = seat_columns.tolist()

# Row label -> district column, for every marginal row.
seat_index_mapping = {}
for seat in seat_columns:
    for idx in marginal_seat_columns.index[marginal_seat_columns[seat].astype(bool)]:
        seat_index_mapping[idx] = seat

# A seat counts as safe only if it is not also marginal. The national
# 'TOTAL' row is not a seat, so it is excluded from both lists; filtering
# (rather than list.remove) cannot raise when TOTAL is absent from a list.
total_column = 'ELECTORAL_DISTRICT_TOTAL'
unique_safe_seats = [
    seat for seat in unique_safe_seats
    if seat not in unique_marginal_seats and seat != total_column
]
unique_marginal_seats = [seat for seat in unique_marginal_seats if seat != total_column]

print("Number of Safe Seats:", len(unique_safe_seats))
print("Number of Marginal Seats:", len(unique_marginal_seats))

# Split the label -> constituency mapping by class (the classes are disjoint).
seat_mapping_safe = {
    index: constituency
    for index, constituency in index_constituency_mapping.items()
    if constituency in unique_safe_seats
}
seat_mapping_marginal = {
    index: constituency
    for index, constituency in index_constituency_mapping.items()
    if constituency in unique_marginal_seats
}

seat_mapping_safe, seat_mapping_marginal

In [None]:
def _seat_lookup(mapping, label_column):
    """Build an ('Index', label_column) frame holding bare constituency names."""
    lookup = pd.DataFrame(list(mapping.items()), columns=['Index', label_column])
    lookup[label_column] = lookup[label_column].str.replace('ELECTORAL_DISTRICT_', '', regex=False)
    return lookup


seat_mapping_safe_df = _seat_lookup(seat_mapping_safe, 'Constituency_Safe')
seat_mapping_marginal_df = _seat_lookup(seat_mapping_marginal, 'Constituency_Marginal')

# Left-join every master_df row to each lookup, then keep only the rows that
# found a match, i.e. the rows of safe (resp. marginal) constituencies.
df_safe_seats = master_df.merge(
    seat_mapping_safe_df, left_on='CONSTITUENCIES', right_on='Constituency_Safe', how='left'
).dropna(subset=['Constituency_Safe'])
df_marginal_seats = master_df.merge(
    seat_mapping_marginal_df, left_on='CONSTITUENCIES', right_on='Constituency_Marginal', how='left'
).dropna(subset=['Constituency_Marginal'])

# Winner = name of the vote-share column with the larger value in each row.
vote_share_columns = ['P.N.M._VOTES_%', 'U.N.C._VOTES_%']
df_safe_seats['Winning_Party'] = df_safe_seats[vote_share_columns].idxmax(axis=1)
df_marginal_seats['Winning_Party'] = df_marginal_seats[vote_share_columns].idxmax(axis=1)
# Show the resulting DataFrames for safe and marginal seats



In [None]:
# Hand-assigned leaning of each constituency.
leans_pnm = [
    "ARIMA", "AROUCA/MALONEY", "D'ABADIE/O'MEARA", "DIEGO MARTIN CENTRAL",
    "DIEGO MARTIN NORTH/EAST", "DIEGO MARTIN WEST", "LA BREA",
    "LAVENTILLE EAST/MORVANT", "LAVENTILLE WEST", "LOPINOT/BON AIR WEST",
    "POINT FORTIN", "PORT-OF-SPAIN NORTH/ST. ANN'S WEST", "PORT-OF-SPAIN SOUTH",
    "SAN FERNANDO EAST", "ST. ANN'S EAST", "TOBAGO WEST",
]

leans_unc = [
    "CARONI CENTRAL", "CARONI EAST", "CHAGUANAS WEST", "COUVA NORTH",
    "COUVA SOUTH", "FYZABAD", "NAPARIMA", "OROPOUCHE EAST",
    "OROPOUCHE WEST", "PRINCES TOWN", "SIPARIA", "ST. AUGUSTINE", "TABAQUITE",
]

battleground = [
    "BARATARIA/SAN JUAN", "CHAGUANAS EAST", "CUMUTO/MANZANILLA",
    "LA HORQUETTA/TALPARO", "MAYARO", "MORUGA/TABLELAND", "POINTE-A-PIERRE",
    "SAN FERNANDO WEST", "ST. JOSEPH", "TOBAGO EAST", "TOCO/SANGRE GRANDE",
    "TUNAPUNA",
]

# Leaning codes: 2 = leans PNM, 1 = leans UNC, 3 = battleground, None = not
# in any list. The PNM list takes precedence, then UNC, then battleground.
safe_constituency_leaning = {
    seat: (2 if seat in leans_pnm else 1)
    for seat in master_df['CONSTITUENCIES']
    if seat in leans_pnm or seat in leans_unc
}
marginal_constituency_leaning = {
    seat: 3
    for seat in master_df['CONSTITUENCIES']
    if seat in battleground and seat not in safe_constituency_leaning
}
unknown_constituency_leaning = {
    seat: None
    for seat in master_df['CONSTITUENCIES']
    if seat not in safe_constituency_leaning and seat not in marginal_constituency_leaning
}

# The subset frames only receive the codes of their own class; any other
# constituency in them maps to NaN.
df_safe_seats['CONSTITUENCY_LEANING'] = df_safe_seats['CONSTITUENCIES'].map(safe_constituency_leaning)
df_marginal_seats['CONSTITUENCY_LEANING'] = df_marginal_seats['CONSTITUENCIES'].map(marginal_constituency_leaning)

# master_df gets the first available code: safe, then battleground, then unknown.
leaning_if_safe = master_df['CONSTITUENCIES'].map(safe_constituency_leaning)
leaning_if_marginal = master_df['CONSTITUENCIES'].map(marginal_constituency_leaning)
leaning_if_unknown = master_df['CONSTITUENCIES'].map(unknown_constituency_leaning)
master_df['CONSTITUENCY_LEANING'] = leaning_if_safe.fillna(
    leaning_if_marginal.fillna(leaning_if_unknown)
)
master_df

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Model inputs; the order fixes the order of the fitted coefficients.
features = [
    'CONSTITUENCY_LEANING',
    'Year',
    'Homicide_rate_per_100k',
    'Annual (Percentage Change)',
    'Inflation',
    'Net Migration Rate',
    'Inflation_1t',
    'Inflation_1y',
    'Unemployment_1t',
    'Unemployment_1y',
    'Net Migration Rate_1t',
    'Net Migration Rate_1y',
    'GDP (Per Capita US$)_1t',
    'GDP (Per Capita US$)_1y',
    'GDP_(US$)_1t',
    'GDP_(US$)_1y',
    'ELECTORATE',
]

# Train only on the past elections; the rows tagged 2024 are excluded.
df_safe_seats_model = df_safe_seats[df_safe_seats['Year'].ne(2024)]
print(df_safe_seats_model['Winning_Party'].unique())
print(len(df_safe_seats_model))

# Design matrix and raw label.
X_safe = df_safe_seats_model[features]
y_safe = df_safe_seats_model['Winning_Party']

# Binary target: 1 when the PNM column won the row, 0 otherwise.
y_safe_binary = y_safe.eq('P.N.M._VOTES_%').astype(int)

# 60/40 split that preserves the class proportions in both parts.
X_safe_train, X_safe_test, y_safe_train, y_safe_test = train_test_split(
    X_safe,
    y_safe_binary,
    test_size=0.4,
    random_state=42,
    stratify=y_safe_binary,
)

# Baseline classifier fitted on the raw (unscaled) features.
clf = LogisticRegression(solver='saga', max_iter=1000)
clf.fit(X_safe_train, y_safe_train)

# Score the held-out part.
y_safe_pred = clf.predict(X_safe_test)

print("Classification Report for Safe Seats:\n")
print(classification_report(y_safe_test, y_safe_pred))

print("\nConfusion Matrix for Safe Seats:\n")
print(confusion_matrix(y_safe_test, y_safe_pred))
print(y_safe_binary.value_counts())

In [None]:
# Re-declares the feature list and refits the safe-seat classifier.
identity_features = ['CONSTITUENCY_LEANING', 'Year']
level_features = ['Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate']
growth_features = [
    'Inflation_1t', 'Inflation_1y',
    'Unemployment_1t', 'Unemployment_1y',
    'Net Migration Rate_1t', 'Net Migration Rate_1y',
    'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y',
    'GDP_(US$)_1t', 'GDP_(US$)_1y',
]
features = identity_features + level_features + growth_features + ['ELECTORATE']

# Past elections only (the rows tagged 2024 are left out).
is_past_election = df_safe_seats['Year'] != 2024
df_safe_seats_model = df_safe_seats[is_past_election]
print(len(df_safe_seats_model))

# Inputs and raw label.
X_safe = df_safe_seats_model[features]
y_safe = df_safe_seats_model['Winning_Party']

# 1 when the PNM vote-share column is the winner, else 0.
y_safe_binary = (y_safe == 'P.N.M._VOTES_%').astype(int)

# Stratified 60/40 split with a fixed seed.
split_parts = train_test_split(
    X_safe, y_safe_binary, test_size=0.4, random_state=42, stratify=y_safe_binary
)
X_safe_train, X_safe_test, y_safe_train, y_safe_test = split_parts

# Fit on the training part, predict the held-out part.
clf = LogisticRegression(solver='saga', max_iter=1000).fit(X_safe_train, y_safe_train)
y_safe_pred = clf.predict(X_safe_test)

# Report held-out performance and the overall class balance.
print("Classification Report for Safe Seats:\n")
print(classification_report(y_safe_test, y_safe_pred))

print("\nConfusion Matrix for Safe Seats:\n")
print(confusion_matrix(y_safe_test, y_safe_pred))
print(y_safe_binary.value_counts())

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Hyperparameter grid searched exhaustively.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],  # Inverse regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga'],  # Both solvers support l1 and l2
    'max_iter': [100, 500, 1000],  # Number of iterations
    'class_weight': ['balanced', None]  # Handling class imbalance
}

# Base estimator. `multi_class` is deliberately not passed: the parameter is
# deprecated in recent scikit-learn releases, and for a two-class target the
# default already fits a single binary model, so the fit is unchanged.
logreg = LogisticRegression(max_iter=1000)

# 5-fold cross-validated grid search on all available cores.
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Search on the training part only; the test part stays held out.
grid_search.fit(X_safe_train, y_safe_train)

print("Best hyperparameters found: ", grid_search.best_params_)

# Best configuration, refitted on the whole training part by GridSearchCV.
best_logreg = grid_search.best_estimator_

# Score the tuned model on the held-out part.
y_safe_pred = best_logreg.predict(X_safe_test)

print("Classification Report for Safe Seats with Tuning:\n")
print(classification_report(y_safe_test, y_safe_pred))

print("\nConfusion Matrix for Safe Seats with Tuning:\n")
print(confusion_matrix(y_safe_test, y_safe_pred))

In [None]:
from sklearn.preprocessing import StandardScaler  # not used in this cell; retained in case other cells rely on it
import pandas as pd

# Same feature columns, in the same order, as used for training.
features = ['CONSTITUENCY_LEANING', 'Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate',
            'Inflation_1t', 'Inflation_1y', 'Unemployment_1t', 'Unemployment_1y',
            'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t',
            'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t', 'GDP_(US$)_1y', 'ELECTORATE']

# The 2025 election is represented by the rows tagged Year == 2024. Keep the
# constituencies whose leaning code is 1 (UNC) or 2 (PNM).
safe_seats_2025_filtered = master_df[master_df['Year'] == 2024]
safe_seats_2025_filtered = safe_seats_2025_filtered[
    safe_seats_2025_filtered['CONSTITUENCY_LEANING'].isin([1, 2])
]

print(f"Rows after filtering for non-NaN 'Constituency_Leaning': {safe_seats_2025_filtered.shape[0]}")

# Feature matrix for the rows to predict.
X_2025 = safe_seats_2025_filtered[features].copy()

# Tuned classifier from the grid search.
best_model = grid_search.best_estimator_

# The model was fitted on the raw, unscaled training features, so it must be
# given raw features here as well. Standardizing only the prediction inputs
# would put them in a different feature space from the one the model learned.
safe_seat_predictions = best_model.predict(X_2025)

# Decode the binary class back to a party name (1 = PNM, 0 = otherwise).
party_mapping = {1: 'P.N.M.', 0: 'U.N.C.'}
decoded_safe_predictions = [party_mapping[pred] for pred in safe_seat_predictions]

# One row per predicted constituency; the names come from the CONSTITUENCIES column.
predictions_2025 = pd.DataFrame({
    'Constituency': safe_seats_2025_filtered['CONSTITUENCIES'],
    'Predicted_Winning_Party': decoded_safe_predictions
})

# Diagnostics. Note these describe `clf` (the untuned baseline), not `best_model`.
print(y_safe_train.value_counts())
print("Model Coefficients:\n", clf.coef_)
y_safe_train_pred = clf.predict(X_safe_train)
print("Training Accuracy: ", accuracy_score(y_safe_train, y_safe_train_pred))

# predictions_2025
predictions_2025


### Multinomial Linear Regression

##### PNM

In [None]:
# 1. Features: marginal seats from the past elections (rows tagged 2024 excluded)
df_marginal_seats_model = df_marginal_seats[df_marginal_seats['Year'] != 2024]
X_marginal = df_marginal_seats_model[features]

# 2. Targets: both vote shares. The column order (UNC, then PNM) fixes the
#    column order of every prediction array below.
y_marginal = df_marginal_seats_model[['U.N.C._VOTES_%', 'P.N.M._VOTES_%']]

# 3. Train-test split (35% held out, fixed seed)
X_marginal_train, X_marginal_test, y_marginal_train, y_marginal_test = train_test_split(
    X_marginal, y_marginal, test_size=0.35, random_state=42
)

# 4. Train the multi-output linear model on the training part only
reg = LinearRegression()
reg.fit(X_marginal_train, y_marginal_train)

# 5. Held-out evaluation: the only scores that measure generalization,
#    because none of these rows were seen during fitting.
y_marginal_pred_test = reg.predict(X_marginal_test)
mse_unc_test = mean_squared_error(y_marginal_test['U.N.C._VOTES_%'], y_marginal_pred_test[:, 0])
r2_unc_test = r2_score(y_marginal_test['U.N.C._VOTES_%'], y_marginal_pred_test[:, 0])
mse_pnm_test = mean_squared_error(y_marginal_test['P.N.M._VOTES_%'], y_marginal_pred_test[:, 1])
r2_pnm_test = r2_score(y_marginal_test['P.N.M._VOTES_%'], y_marginal_pred_test[:, 1])

print(f"Mean Squared Error (UNC) on held-out test split: {mse_unc_test}")
print(f"R-squared (UNC) on held-out test split: {r2_unc_test}")
print(f"Mean Squared Error (PNM) on held-out test split: {mse_pnm_test}")
print(f"R-squared (PNM) on held-out test split: {r2_pnm_test}")

# 6. Predict for every row (training and test rows together). Scores on
#    this set include rows the model was fitted on and are therefore
#    optimistic compared with the held-out scores above.
y_marginal_pred_all = reg.predict(X_marginal)

actual_unc_all = y_marginal['U.N.C._VOTES_%'].values
predicted_unc_all = y_marginal_pred_all[:, 0]  # UNC predictions (first column)

actual_pnm_all = y_marginal['P.N.M._VOTES_%'].values
predicted_pnm_all = y_marginal_pred_all[:, 1]  # PNM predictions (second column)

# 7. MSE and R^2 over the entire dataset
mse_unc_all = mean_squared_error(actual_unc_all, predicted_unc_all)
r2_unc_all = r2_score(actual_unc_all, predicted_unc_all)

mse_pnm_all = mean_squared_error(actual_pnm_all, predicted_pnm_all)
r2_pnm_all = r2_score(actual_pnm_all, predicted_pnm_all)

print(f"Mean Squared Error (UNC) on entire dataset: {mse_unc_all}")
print(f"R-squared (UNC) on entire dataset: {r2_unc_all}")

print(f"Mean Squared Error (PNM) on entire dataset: {mse_pnm_all}")
print(f"R-squared (PNM) on entire dataset: {r2_pnm_all}")

# 8. Actual vs predicted vote share for every row
marginal_results_df_all = pd.DataFrame({
    'Constituency': df_marginal_seats_model['CONSTITUENCIES'],
    'Actual_UNC_Vote_Share': actual_unc_all,
    'Predicted_UNC_Vote_Share': predicted_unc_all,
    'Actual_PNM_Vote_Share': actual_pnm_all,
    'Predicted_PNM_Vote_Share': predicted_pnm_all
})

print("\nMarginal Seats Regression Results:\n")
marginal_results_df_all

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Plain Ridge regression on the raw features. No scaling step or Pipeline
# is applied here, although both are imported above.
model = Ridge()

# Candidate regularization strengths
param_grid = {
    'alpha': [0.1, 1, 10, 100],  # Regularization strength for Ridge
}

# 5-fold cross-validated grid search on all available cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Search on the training part only; the test part stays held out
grid_search.fit(X_marginal_train, y_marginal_train)

# Best parameters and the model refitted with them
print(f"Best parameters found: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Held-out evaluation: the only scores that measure generalization, because
# none of these rows were seen during fitting. Prediction columns follow the
# target order used for fitting (UNC first, PNM second).
y_marginal_pred_tuned_test = best_model.predict(X_marginal_test)
mse_unc_tuned_test = mean_squared_error(y_marginal_test['U.N.C._VOTES_%'], y_marginal_pred_tuned_test[:, 0])
r2_unc_tuned_test = r2_score(y_marginal_test['U.N.C._VOTES_%'], y_marginal_pred_tuned_test[:, 0])
mse_pnm_tuned_test = mean_squared_error(y_marginal_test['P.N.M._VOTES_%'], y_marginal_pred_tuned_test[:, 1])
r2_pnm_tuned_test = r2_score(y_marginal_test['P.N.M._VOTES_%'], y_marginal_pred_tuned_test[:, 1])

print(f"Mean Squared Error (MSE) for UNC on held-out test split: {mse_unc_tuned_test}")
print(f"R-squared (R²) for UNC on held-out test split: {r2_unc_tuned_test}")
print(f"Mean Squared Error (MSE) for PNM on held-out test split: {mse_pnm_tuned_test}")
print(f"R-squared (R²) for PNM on held-out test split: {r2_pnm_tuned_test}")

# Predictions for every row (training and test rows together). Scores on
# this set include rows the model was fitted on and are therefore optimistic.
y_marginal_pred_tuned = best_model.predict(X_marginal)

# Separate the UNC and PNM columns
actual_unc = y_marginal['U.N.C._VOTES_%'].values
predicted_unc = y_marginal_pred_tuned[:, 0]  # UNC predictions (first column)

actual_pnm = y_marginal['P.N.M._VOTES_%'].values
predicted_pnm = y_marginal_pred_tuned[:, 1]  # PNM predictions (second column)

# MSE and R² for both parties over all rows
mse_unc = mean_squared_error(actual_unc, predicted_unc)
r2_unc = r2_score(actual_unc, predicted_unc)

mse_pnm = mean_squared_error(actual_pnm, predicted_pnm)
r2_pnm = r2_score(actual_pnm, predicted_pnm)

print(f"Mean Squared Error (MSE) for UNC predictions: {mse_unc}")
print(f"R-squared (R²) for UNC predictions: {r2_unc}")

print(f"Mean Squared Error (MSE) for PNM predictions: {mse_pnm}")
print(f"R-squared (R²) for PNM predictions: {r2_pnm}")

# Actual vs predicted vote share for every row
marginal_results_df_tuned = pd.DataFrame({
    'Constituency': df_marginal_seats_model['CONSTITUENCIES'],
    'Actual_UNC_Vote_Share': actual_unc,
    'Predicted_UNC_Vote_Share': predicted_unc,
    'Actual_PNM_Vote_Share': actual_pnm,
    'Predicted_PNM_Vote_Share': predicted_pnm
})

# Display the results
print("\nTuned Model Marginal Seats Regression Results:\n")
marginal_results_df_tuned

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Error metrics for each party's predictions (MSE first, then R²)
mse_unc, mse_pnm = (
    mean_squared_error(actual_unc, predicted_unc),
    mean_squared_error(actual_pnm, predicted_pnm),
)
r2_unc, r2_pnm = (
    r2_score(actual_unc, predicted_unc),
    r2_score(actual_pnm, predicted_pnm),
)

# Residual = actual minus predicted; positive means the model under-predicted
residuals_unc, residuals_pnm = actual_unc - predicted_unc, actual_pnm - predicted_pnm

# One figure, two side-by-side residual-vs-prediction panels
plt.figure(figsize=(14, 6))

# Left panel: UNC residuals, with a dashed zero line as the "perfect fit" reference
plt.subplot(1, 2, 1)
sns.scatterplot(x=predicted_unc, y=residuals_unc, color='blue')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted UNC Vote Share')
plt.ylabel('Residuals')
plt.title('Residuals for UNC Vote Share')

# Right panel: PNM residuals, same layout
plt.subplot(1, 2, 2)
sns.scatterplot(x=predicted_pnm, y=residuals_pnm, color='green')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted PNM Vote Share')
plt.ylabel('Residuals')
plt.title('Residuals for PNM Vote Share')

# Render the figure
plt.tight_layout()
plt.show()

# Report the metrics, one per line
print(
    f"Mean Squared Error (MSE) for UNC: {mse_unc}",
    f"Mean Squared Error (MSE) for PNM: {mse_pnm}",
    f"R-squared (R²) for UNC: {r2_unc}",
    f"R-squared (R²) for PNM: {r2_pnm}",
    sep="\n",
)

In [None]:
# Predicted winner (tuned model): UNC only when its PREDICTED share is strictly
# larger than PNM's predicted share; ties and NaN comparisons fall to PNM.
marginal_results_df_tuned['Predicted_Winner'] = np.where(
    marginal_results_df_tuned['Predicted_UNC_Vote_Share'].gt(
        marginal_results_df_tuned['Predicted_PNM_Vote_Share']
    ),
    'UNC',
    'PNM',
)

# Actual winner: same rule applied to the ACTUAL vote shares.
marginal_results_df_tuned['Actual_Winner'] = np.where(
    marginal_results_df_tuned['Actual_UNC_Vote_Share'].gt(
        marginal_results_df_tuned['Actual_PNM_Vote_Share']
    ),
    'UNC',
    'PNM',
)

# Show actual vs predicted winner for every constituency row
print(marginal_results_df_tuned.loc[:, ['Constituency', 'Actual_Winner', 'Predicted_Winner']])

In [None]:
# Predicted winner (untuned model): UNC only when its PREDICTED share is strictly
# larger than PNM's predicted share; ties and NaN comparisons fall to PNM.
marginal_results_df_all['Predicted_Winner'] = np.where(
    marginal_results_df_all['Predicted_UNC_Vote_Share'].gt(
        marginal_results_df_all['Predicted_PNM_Vote_Share']
    ),
    'UNC',
    'PNM',
)

# Actual winner: same rule applied to the ACTUAL vote shares.
marginal_results_df_all['Actual_Winner'] = np.where(
    marginal_results_df_all['Actual_UNC_Vote_Share'].gt(
        marginal_results_df_all['Actual_PNM_Vote_Share']
    ),
    'UNC',
    'PNM',
)

# Show actual vs predicted winner for every constituency row
print(marginal_results_df_all.loc[:, ['Constituency', 'Actual_Winner', 'Predicted_Winner']])

In [None]:
# Share of rows where the untuned model called the winner correctly
normal_model_accuracy = marginal_results_df_all['Predicted_Winner'].eq(
    marginal_results_df_all['Actual_Winner']
).mean()
print(f"Normal Model Accuracy: {normal_model_accuracy:.2%}")

# Same measure for the grid-search-tuned model
tuned_model_accuracy = marginal_results_df_tuned['Predicted_Winner'].eq(
    marginal_results_df_tuned['Actual_Winner']
).mean()
print(f"Tuned Model Accuracy: {tuned_model_accuracy:.2%}")

In [None]:
# Feature rows used for the 2025 forecast: these are the rows labelled Year == 2024
X_marginal_2025 = df_marginal_seats.loc[df_marginal_seats['Year'] == 2024, features]

# Forecast both parties' vote shares with the tuned model
y_marginal_2025_pred = best_model.predict(X_marginal_2025)

# Column 0 is read as UNC and column 1 as PNM
predicted_unc_2025, predicted_pnm_2025 = y_marginal_2025_pred[:, 0], y_marginal_2025_pred[:, 1]

# One row per forecast constituency (the Constituency Series supplies the index)
marginal_2025_results_df = pd.DataFrame({
    'Constituency': df_marginal_seats.loc[df_marginal_seats['Year'] == 2024, 'CONSTITUENCIES'],
    'Predicted_UNC_Vote_Share': predicted_unc_2025,
    'Predicted_PNM_Vote_Share': predicted_pnm_2025,
})

print(marginal_2025_results_df)


In [None]:
# Training frame for the PNM regression: drop the Year == 2024 rows (they are
# used for the 2025 forecast) and keep only CONSTITUENCY_LEANING == 3.
df_marginal_seats_model = df_marginal_seats[
    (df_marginal_seats['Year'] != 2024) & (df_marginal_seats['CONSTITUENCY_LEANING'] == 3)
]

# Feature matrix and the PNM vote-share target
X_marginal = df_marginal_seats_model[features]
y_marginal = df_marginal_seats_model['P.N.M._VOTES_%']

# 70/30 train/test split with a fixed seed for reproducibility
X_marginal_train, X_marginal_test, y_marginal_train, y_marginal_test = train_test_split(
    X_marginal,
    y_marginal,
    test_size=0.3,
    random_state=42,
)

# Ordinary least-squares fit on the training split
reg = LinearRegression().fit(X_marginal_train, y_marginal_train)

# Score the model on the held-out split
y_marginal_pred = reg.predict(X_marginal_test)
print("Mean Squared Error for Marginal Seats: ", mean_squared_error(y_marginal_test, y_marginal_pred))
print("R-squared for Marginal Seats: ", r2_score(y_marginal_test, y_marginal_pred))

# Actual vs predicted PNM share for the held-out rows
marginal_results_df = pd.DataFrame(
    {'Actual_Vote_Share': y_marginal_test, 'Predicted_Vote_Share': y_marginal_pred}
)

print("\nMarginal Seats Regression Results:\n")
print(marginal_results_df)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Ridge regression with its regularisation strength chosen by grid search
ridge = Ridge()

# Candidate alpha values (regularisation strength)
param_grid = {'alpha': [0.1, 1, 10, 100]}

# 5-fold cross-validated search on the training split, using all CPU cores
grid_search = GridSearchCV(ridge, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_marginal_train, y_marginal_train)

# Report the winning hyperparameters and keep the refitted best estimator
print(f"Best hyperparameters: {grid_search.best_params_}")
best_ridge = grid_search.best_estimator_

# Score the tuned model on the held-out split
y_marginal_pred = best_ridge.predict(X_marginal_test)
print("Mean Squared Error for Marginal Seats (Tuned): ", mean_squared_error(y_marginal_test, y_marginal_pred))
print("R-squared for Marginal Seats (Tuned): ", r2_score(y_marginal_test, y_marginal_pred))

# Actual vs predicted vote share for the held-out rows
marginal_results_df = pd.DataFrame(
    {'Actual_Vote_Share': y_marginal_test, 'Predicted_Vote_Share': y_marginal_pred}
)

print("\nMarginal Seats Regression Results (Tuned):\n")
print(marginal_results_df)

In [None]:
# Rows used for the 2025 forecast: Year == 2024, excluding the 'TOTAL' summary
# row, restricted to CONSTITUENCY_LEANING == 3.
df_marginal_seats_2025 = master_df.loc[
    (master_df['Year'] == 2024)
    & (master_df['CONSTITUENCIES'] != 'TOTAL')
    & (master_df['CONSTITUENCY_LEANING'] == 3)
]

# Same feature columns the regression was trained on
X_marginal_2025 = df_marginal_seats_2025.loc[:, features]

# Forecast PNM vote share with `reg`.
# NOTE(review): this uses the plain LinearRegression bound to `reg`, not the
# tuned `best_ridge` - confirm that is intended.
y_marginal_2025_pred = reg.predict(X_marginal_2025)

# Constituency, recorded PNM share (if any) and forecast share, side by side
marginal_2025_results_df_pnm = pd.DataFrame({
    'Constituency': df_marginal_seats_2025['CONSTITUENCIES'],
    'Actual_Vote_Share': df_marginal_seats_2025['P.N.M._VOTES_%'],
    'Predicted_Vote_Share': y_marginal_2025_pred,
})

# Show the results
print("\n2025 PNM Marginal Seats Prediction Results:\n")
print(marginal_2025_results_df_pnm)

##### UNC

In [None]:
# Prepare data for the UNC vote-share regression.
# Train on historical rows only (the Year == 2024 rows are used for the 2025
# forecast) and restrict to CONSTITUENCY_LEANING == 3. The PNM regression cell
# applies the same filter, and the 2025 forecast cells only predict for
# leaning-3 constituencies, so both party models must be fitted on the same
# population for their predictions to be comparable.
df_marginal_seats_model = df_marginal_seats[df_marginal_seats['Year'] != 2024]
df_marginal_seats_model = df_marginal_seats_model[df_marginal_seats_model['CONSTITUENCY_LEANING'] == 3]

# Feature matrix and the UNC vote-share target
X_marginal = df_marginal_seats_model[features]
y_marginal = df_marginal_seats_model['U.N.C._VOTES_%']

# 70/30 train/test split; fixed seed so the split matches the PNM cell's split
X_marginal_train, X_marginal_test, y_marginal_train, y_marginal_test = train_test_split(X_marginal, y_marginal, test_size=0.3, random_state=42)

# Fit an ordinary least-squares regression on the training split
reg = LinearRegression()
reg.fit(X_marginal_train, y_marginal_train)

# Predict on the held-out split
y_marginal_pred = reg.predict(X_marginal_test)

# Evaluate the regression model on the held-out split
print("Mean Squared Error for Marginal Seats: ", mean_squared_error(y_marginal_test, y_marginal_pred))
print("R-squared for Marginal Seats: ", r2_score(y_marginal_test, y_marginal_pred))

# Actual vs predicted UNC share for the held-out rows
marginal_results_df = pd.DataFrame({
    'Actual_Vote_Share': y_marginal_test,
    'Predicted_Vote_Share': y_marginal_pred
})

print("\nMarginal Seats Regression Results:\n")
print(marginal_results_df)

In [None]:
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import GridSearchCV

# # Define the model
# ridge = Ridge()

# # Define the parameter grid for tuning
# param_grid = {
#     'alpha': [0.1, 1, 10, 100],  # Regularization strength
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# # Fit GridSearchCV to the data
# grid_search.fit(X_marginal_train, y_marginal_train)

# # Get the best hyperparameters from the grid search
# print(f"Best hyperparameters: {grid_search.best_params_}")

# # Get the best model from the grid search
# best_ridge = grid_search.best_estimator_

# # Make predictions using the best model
# y_marginal_pred = best_ridge.predict(X_marginal_test)

# # Evaluate the tuned model
# print("Mean Squared Error for Marginal Seats (Tuned): ", mean_squared_error(y_marginal_test, y_marginal_pred))
# print("R-squared for Marginal Seats (Tuned): ", r2_score(y_marginal_test, y_marginal_pred))

# # Create a DataFrame to compare actual and predicted values
# marginal_results_df = pd.DataFrame({
#     'Actual_Vote_Share': y_marginal_test,
#     'Predicted_Vote_Share': y_marginal_pred
# })

# print("\nMarginal Seats Regression Results (Tuned):\n")
# print(marginal_results_df)

In [None]:
# Rows used for the 2025 forecast: Year == 2024, excluding the 'TOTAL' summary
# row, restricted to CONSTITUENCY_LEANING == 3.
df_marginal_seats_2025 = master_df.loc[
    (master_df['Year'] == 2024)
    & (master_df['CONSTITUENCIES'] != 'TOTAL')
    & (master_df['CONSTITUENCY_LEANING'] == 3)
]

# Same feature columns the regression was trained on
X_marginal_2025 = df_marginal_seats_2025.loc[:, features]

# Forecast UNC vote share with whichever model is currently bound to `reg`
y_marginal_2025_pred = reg.predict(X_marginal_2025)

# Constituency, recorded UNC share (if any) and forecast share, side by side
marginal_2025_results_df_unc = pd.DataFrame({
    'Constituency': df_marginal_seats_2025['CONSTITUENCIES'],
    'Actual_Vote_Share': df_marginal_seats_2025['U.N.C._VOTES_%'],
    'Predicted_Vote_Share': y_marginal_2025_pred,
})

# Show the results
print("\n2025 UNC Marginal Seats Prediction Results:\n")
print(marginal_2025_results_df_unc)

In [None]:
import numpy as np

def recalibrate_predictions(predictions, k=5, center=0.5):
    """
    Apply a sigmoid recalibration to prevent extreme vote share predictions.

    Computes ``1 / (1 + exp(-k * (predictions - center)))`` element-wise, so
    an input equal to ``center`` maps to 0.5 and outputs stay within 0..1.

    Args:
        predictions (array-like or scalar): Raw model predictions. NumPy
            arrays, pandas Series and scalars are used as-is (the result
            keeps that type); plain lists/tuples are converted to a float
            NumPy array first.
        k (float): Steepness of the sigmoid. Higher = steeper.
        center (float): Center point around which to squash.

    Returns:
        array-like: Adjusted predictions, same shape as the input.
    """
    # Built-in sequences do not support `sequence - float`; coerce only those
    # so array/Series callers keep getting their original container type back.
    if isinstance(predictions, (list, tuple)):
        predictions = np.asarray(predictions, dtype=float)
    return 1 / (1 + np.exp(-k * (predictions - center)))

# Squash the most recent 2025 forecast (the UNC one when cells are run in order)
# through the sigmoid with steepness 5 centred on 0.5
y_marginal_2025_pred_recalibrated = recalibrate_predictions(y_marginal_2025_pred, 5, 0.5)

# Constituency, recorded UNC share and the recalibrated forecast, side by side
marginal_2025_results_df_unc_recalibrated = pd.DataFrame(
    {
        'Constituency': df_marginal_seats_2025['CONSTITUENCIES'],
        'Actual_Vote_Share': df_marginal_seats_2025['U.N.C._VOTES_%'],
        'Predicted_Vote_Share_Recalibrated': y_marginal_2025_pred_recalibrated,
    }
)

# Bare expression so the notebook renders the table
marginal_2025_results_df_unc_recalibrated

In [None]:
# If Predicted PNM > Predicted UNC, PNM wins
marginal_2025_results_df = pd.merge(marginal_2025_results_df_unc_recalibrated, marginal_2025_results_df_pnm, on='Constituency')
marginal_2025_results_df
marginal_2025_results_df['Predicted_Winner'] = np.where(
    marginal_2025_results_df['Predicted_Vote_Share'] > marginal_2025_results_df['Predicted_Vote_Share_Recalibrated'], 
    'PNM', 
    'UNC'
)

print(marginal_2025_results_df[['Constituency', 'Predicted_Winner']])

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report, confusion_matrix
# import joblib

# # === 1. Define features and target ===
# features = [
#     'Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate',
#     'Inflation_1t', 'Inflation_1y', 'Unemployment_1t', 'Unemployment_1y',
#     'Net Migration Rate_1t', 'Net Migration Rate_1y',
#     'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t', 'GDP_(US$)_1y'
# ]

# df_safe_seats_model = df_safe_seats[df_safe_seats['Year'] != 2024]

# X_safe = df_safe_seats_model[features]
# y_safe = df_safe_seats_model['Winning_Party']

# # Encode target
# y_safe_encoded = y_safe.astype('category').cat.codes

# # === 2. Split data ===
# X_safe_train, X_safe_test, y_safe_train, y_safe_test = train_test_split(
#     X_safe, y_safe_encoded, test_size=0.4, random_state=42, stratify=y_safe_encoded
# )

# # (Optional) Scale features — Random Forests don't strictly need scaling, but if you want consistency:
# # scaler = StandardScaler()
# # X_safe_train = scaler.fit_transform(X_safe_train)
# # X_safe_test = scaler.transform(X_safe_test)

# # === 3. Define Random Forest Model and GridSearch ===
# rf = RandomForestClassifier(random_state=42)

# param_grid_rf = {
#     'n_estimators': [50, 100, 200, 300, 500],
#     'max_depth': [None, 5, 10, 20, 30],
#     'min_samples_split': [2, 5, 10, 20],
#     'min_samples_leaf': [1, 2, 5, 10],
#     'max_features': ['sqrt', 'log2', 0.8, 0.5],
#     'class_weight': ['balanced', 'balanced_subsample', None]
# }

# grid_search_rf = GridSearchCV(
#     estimator=rf,
#     param_grid=param_grid_rf,
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )

# # === 4. Train model ===
# grid_search_rf.fit(X_safe_train, y_safe_train)

# print("Best hyperparameters found: ", grid_search_rf.best_params_)

# # === 5. Evaluate ===
# best_rf = grid_search_rf.best_estimator_

# y_safe_pred_rf = best_rf.predict(X_safe_test)

# print("\nClassification Report for Safe Seats (Random Forest):\n")
# print(classification_report(y_safe_test, y_safe_pred_rf))

# print("\nConfusion Matrix for Safe Seats (Random Forest):\n")
# print(confusion_matrix(y_safe_test, y_safe_pred_rf))

# # === 6. Save model if needed ===
# joblib.dump(best_rf, "random_forest_safe_seats_model.pkl")

In [None]:
# import pandas as pd
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()


# features = ['Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate', 'Inflation_1t', 'Inflation_1y', 
#             'Unemployment_1t','Unemployment_1y', 'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
#             'GDP_(US$)_1y', 'VOTER_TURNOUT', 'VALID_VOTES'] 

# # print(features)

# X = master_df[features]
# print(X.columns.tolist())
# X_scaled = scaler.fit_transform(X)

# if 'VOTER_TURNOUT' not in socioeconomic_df.columns:
#     socioeconomic_df['VOTER_TURNOUT'] = 0
# if 'VALID_VOTES' not in socioeconomic_df.columns:
#     socioeconomic_df['VALID_VOTES'] = 0

# # Select features for 2025
# X_2025 = socioeconomic_df[socioeconomic_df['Year'] == 2025][features]



# # Ensure `X_2025` has the same features as the training data
# X_2025_aligned = pd.DataFrame(columns=features, index=X_2025.index)   # Create an empty DataFrame with training features
# X_2025_aligned.update(X_2025)  # Update with the values from `X_2025`
# X_2025_aligned.fillna(0, inplace=True)  # Fill missing features with 0


# # Reorder columns to match the training data
# X_2025_aligned = X_2025_aligned[features]

# # Scale the input data if scaling was applied during training
# X_2025_scaled = scaler.transform(X_2025_aligned)

# print(X_2025_scaled)

# # Predict the winning party for 2025
# winning_party_2025 = clf.predict(X_2025_scaled)

# # Decode the prediction if the target variable was encoded
# party_mapping = {0: 'P.N.M.', 1: 'U.N.C.'}  # Adjust based on your encoding
# decoded_predictions = [party_mapping[pred] for pred in winning_party_2025]

# # Count the number of seats won by each party
# seat_breakdown = pd.Series(decoded_predictions).value_counts()

# # Print the breakdown of seats
# print("Breakdown of Seats Won in 2025:")
# print(seat_breakdown)

### Support Vector Machine

##### Marginal Seats 

In [None]:
# from sklearn import metrics
# from sklearn.svm import SVR
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler


# df = safe_seats.copy()

# target = 'P.N.M._VOTES_%'  # P.N.M. vote share as the target

# X = df.drop(['P.N.M._VOTES_%', 'U.N.C._VOTES_%','Winning_Party', 'Margin_of_Victory'], axis=1)
# y = df[target]


# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# svr = SVR(kernel='linear') 
# svr.fit(X_train, y_train)

# y_pred = svr.predict(X_test)

# #mse for eval
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

# r2 = r2_score(y_test, y_pred)
# print(f"R^2 score: {r2}")

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

# print(df[features].corrwith(df['P.N.M._VOTES_%']))
# print(df[features].corrwith(df['U.N.C._VOTES_%']))


Graphical plot of SVM results

In [None]:
# import matplotlib.pyplot as plt

# plt.scatter(y_test, y_pred)
# plt.xlabel("Actual")
# plt.ylabel("Predicted")
# plt.title("SVM Predictions")
# plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # ideal predictions line
# plt.show()

### Support Vector Machine 

##### Safe Seats

In [None]:
# df = master_df.copy()

# target = 'U.N.C._VOTES_%'  # U.N.C. vote share as the target

# X = df.drop(['U.N.C._VOTES_%', 'P.N.M._VOTES_%'], axis=1)
# y = df[target]


# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# # initialize and train the Support Vector Regression model
# svr = SVR(kernel='linear')  # change kernel as needed
# svr.fit(X_train, y_train)

# y_pred = svr.predict(X_test)

# #mse for eval
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

# r2 = r2_score(y_test, y_pred)
# print(f"R² score: {r2}")

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

Graphical plot of SVM results

In [None]:

# plt.scatter(y_test, y_pred)
# plt.xlabel("Actual")
# plt.ylabel("Predicted")
# plt.title("SVM Predictions")
# plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # ideal predictions line
# plt.show()

### Decision Tree Regression and Classifiers w/o marginal

##### Decision Tree Regressor w/o marginal

##### PNM

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn import metrics

# decisions = master_df.copy()

# feat_cols = ['Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'Inflation_1t', 'Inflation_1y', 
#             'Unemployment_1t','Unemployment_1y', 'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
#             'GDP_(US$)_1y', 'VOTER_TURNOUT', 'VALID_VOTES'] # All socioeconomic
# # feat_cols = ['Year', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'GDP_(US$)_1t',
# #             'GDP_(US$)_1y'] # National Economic trends
# # feat_cols = ['Inflation', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
# #             'GDP_(US$)_1y'] # Household Purchasing Power

# target = 'P.N.M._VOTES_%'  # P.N.M. vote share as the target

# X = decisions[feat_cols]
# y = decisions[target]

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # # training set and test set
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=25)

# # model test
# regressor = DecisionTreeRegressor()

# regressor = regressor.fit(X_train, y_train)

# y_pred = regressor.predict(X_test)

# # performance tests 

# mse = metrics.mean_squared_error(y_test, y_pred)

# print("R^2 score:", metrics.r2_score(y_test, y_pred))

# print("Mean Squared Error:", mse)

# print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

##### UNC

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn import metrics

# decisions = master_df.copy()

# feat_cols = ['Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'Inflation_1t', 'Inflation_1y', 
#             'Unemployment_1t','Unemployment_1y', 'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
#             'GDP_(US$)_1y', 'VOTER_TURNOUT', 'VALID_VOTES'] # All socioeconomic

# target = 'U.N.C._VOTES_%'  # U.N.C. vote share as the target

# X = decisions[feat_cols]
# y = decisions[target]

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # # training set and test set
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=25)

# # model test
# regressor = DecisionTreeRegressor()

# regressor = regressor.fit(X_train, y_train)

# y_pred = regressor.predict(X_test)

# # performance tests 

# mse = metrics.mean_squared_error(y_test, y_pred)

# print("R^2 score:", metrics.r2_score(y_test, y_pred))

# print("Mean Squared Error:", mse)

# print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

### Decision Tree Classifier w/o marginal

##### PNM

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.preprocessing import KBinsDiscretizer

# decisions = master_df.copy()

# ['Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'Inflation_1t', 'Inflation_1y', 
#             'Unemployment_1t','Unemployment_1y', 'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
#             'GDP_(US$)_1y', 'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES'] # All socioeconomic

# # feat_cols = ['Year', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'GDP_(US$)_1t',
# #             'GDP_(US$)_1y'] # National Economic trends
# # feat_cols = ['Inflation', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t', 'GDP_(US$)_1y']  # Household Purchasing Power

# target = 'P.N.M._VOTES_%'  # P.N.M. vote share as the target

# X = decisions[feat_cols] 
# y = decisions[target]  

# # discretize the target variable into bins
# discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')  # adjust `n_bins` as needed
# y_binned = discretizer.fit_transform(y.values.reshape(-1, 1)).ravel()

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # training set and test set
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binned, test_size=0.1, random_state=25)

# # model test
# clf = DecisionTreeClassifier()

# clf = clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# # performance metrics 
# accuracy = metrics.accuracy_score(y_test, y_pred)

# print("Accuracy:", accuracy)

# print("Classification Report:")
# print(metrics.classification_report(y_test, y_pred))

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

##### UNC

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.preprocessing import KBinsDiscretizer

# decisions = master_df.copy()

# ['Year', 'Homicide_rate_per_100k', 'Annual (Percentage Change)', 'Inflation', 'Net Migration Rate', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'Inflation_1t', 'Inflation_1y', 
#             'Unemployment_1t','Unemployment_1y', 'Net Migration Rate_1t', 'Net Migration Rate_1y', 'GDP (Per Capita US$)_1t', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t',
#             'GDP_(US$)_1y', 'VOTER_TURNOUT', 'REJECTED_BALLOTS', 'VALID_VOTES'] # All socioeconomic

# # feat_cols = ['Year', 'GNI (Per Capita US $)_1t', 'GNI (Per Capita US $)_1y', 'GDP_(US$)_1t',
# #             'GDP_(US$)_1y'] # National Economic trends
# # feat_cols = ['Inflation', 'GDP (Per Capita US$)_1y', 'GDP_(US$)_1t', 'GDP_(US$)_1y']  # Household Purchasing Power

# target = 'U.N.C._VOTES_%'  # U.N.C. vote share as the target

# X = decisions[feat_cols] 
# y = decisions[target]  

# # discretize the target variable into bins
# discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')  # adjust `n_bins` as needed
# y_binned = discretizer.fit_transform(y.values.reshape(-1, 1)).ravel()

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # training set and test set
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binned, test_size=0.1, random_state=25)

# # model test
# clf = DecisionTreeClassifier()

# clf = clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# # performance metrics 
# accuracy = metrics.accuracy_score(y_test, y_pred)

# print("Accuracy:", accuracy)

# print("Classification Report:")
# print(metrics.classification_report(y_test, y_pred))

# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

### Decision Tree Regressor and Classifier based on marginal Seats

In [None]:
# df = master_df.copy()

# df['Winning_Party'] = df[['P.N.M._VOTES_%', 'U.N.C._VOTES_%']].idxmax(axis=1)

# # adding a column for the margin of victory
# df['Margin_of_Victory'] =  df[['P.N.M._VOTES_%', 'U.N.C._VOTES_%']].apply(
#     lambda row: row.max() -  sorted(row)[-2], axis=1
# )

# # thresholds 
# safe_vote_share_threshold = 0.60 # Safe seat if vote share > 60%
# safe_margin_threshold = 0.20 # Safe seat if margin of victory > 20%
# marginal_margin_threshold = 0.10 # Marginal seat if margin of victory < 10%

# # identify safe seats
# safe_seats = df[
#     (df['Margin_of_Victory'] > safe_margin_threshold) |
#     (df[['P.N.M._VOTES_%', 'U.N.C._VOTES_%']].max(axis=1) > safe_vote_share_threshold)
# ]

# # marginal seats 
# marginal_seats = df[df['Margin_of_Victory'] < marginal_margin_threshold]

# print("Number of Safe Seats: ", len(safe_seats))
# print("Number of Marginal Seats: ", len(marginal_seats))

# df.to_csv(r"csvs/master_df_with_seat_analysis.csv", index=False)


### Decision Tree Regressor w/ marginal

In [None]:
# X_safe = safe_seats[feat_cols]
# y_safe = safe_seats['P.N.M._VOTES_%']
# z_safe = safe_seats['U.N.C._VOTES_%']

# scaler = StandardScaler()
# X_safe_scaled = scaler.fit_transform(X_safe)

# # training set and test set
# X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_safe_scaled, y_safe, test_size=0.2, random_state=42)
# X_u_train, X_u_test, y_u_train, y_u_test = train_test_split(X_safe_scaled, z_safe, test_size=0.2, random_state=42)

# # model train 
# # P.N.M.
# p_regressor = DecisionTreeRegressor(random_state=42)
# p_regressor.fit(X_p_train, y_p_train)

# # U.N.C.
# p_regressor = DecisionTreeRegressor(random_state=42)
# p_regressor.fit(X_u_train, y_u_train)

# y_p_pred = regressor.predict(X_p_test)
# y_u_pred = regressor.predict(X_u_test)

# # performance tests
# mse_p = mean_squared_error(y_p_test, y_p_pred)
# mse_u = mean_squared_error(y_u_test, y_u_pred)

# r2_p = r2_score(y_p_test, y_p_pred)
# r2_u = r2_score(y_u_test, y_u_pred)

# print(f"Mean Squared Error PNM: {mse_p}  UNC: {mse_u}")
# print(f"R² Score PNM: {r2_p} UNC: {r2_u}")

# # actual vs predicted values
# print("PNM")
# results = pd.DataFrame({'Actual': y_p_test, 'Predicted': y_p_pred})
# print(results.head())

# print("UNC")
# results_u = pd.DataFrame({'Actual': y_u_test, 'Predicted': y_u_pred})
# print(results_u.head())

### Decision Tree Classifier w/ marginal

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# # Model Evaluation
# from sklearn.metrics import accuracy_score, classification_report

# X_safe = safe_seats[feat_cols]
# y_safe = safe_seats['Winning_Party']

# y_safe_encoded = y_safe.astype('category').cat.codes

# scaler = StandardScaler()
# X_safe_scaled = scaler.fit_transform(X_safe)

# # Split dataset into training set and test set
# X_safe_train, X_safe_test, y_safe_train, y_safe_test = train_test_split(X_safe_scaled, y_safe_encoded, test_size=0.2, random_state=25)

# classifier = DecisionTreeClassifier(random_state=25)
# classifier = classifier.fit(X_safe_train, y_safe_train)

# # Predict response for test dataset
# y_safe_pred = classifier.predict(X_safe_test)


# accuracy = accuracy_score(y_safe_test, y_safe_pred)
# print(f"Accuracy: {accuracy}")

# print("Classification Report:")
# print(classification_report(y_safe_test, y_safe_pred))

# results = pd.DataFrame({'Actual': y_safe_test, 'Predicted': y_safe_pred})
# print(results.head())

### Visualization

In [None]:
# # taken from datacamp
# # Install the required package
# # %pip install pydotplus

# from sklearn.tree import export_graphviz
# from six import StringIO  
# from IPython.display import Image  
# import pydotplus

# dot_data = StringIO()
# export_graphviz(clf, out_file=dot_data,  
#                 filled=True, rounded=True,
#                 special_characters=True,feature_names = feat_cols,class_names=['0','1'])
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('elections.png')
# Image(graph.create_png())

### BACKUP

In [None]:
# X_safe = safe_seats[feat_cols]
# y_safe = safe_seats['P.N.M._VOTES_%']

# scaler = StandardScaler()
# X_safe_scaled = scaler.fit_transform(X_safe)

# # training set and test set
# X_train, X_test, y_train, y_test = train_test_split(X_safe_scaled, y_safe, test_size=0.2, random_state=42)

# # model train 
# regressor = DecisionTreeRegressor(random_state=42)
# regressor.fit(X_train, y_train)

# y_pred = regressor.predict(X_test)

# # performance tests
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error: {mse}")
# print(f"R² Score: {r2}")

# # actual vs predicted values
# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())

### MLP Regression

In [None]:
# from sklearn.neural_network import MLPRegressor

# df = safe_seats.copy()

# X = df[features]  
# y = df['P.N.M._VOTES_%']

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'hidden_layer_sizes': [(100,), (100, 30), (50, 50), (150, 75, 30)],
#     'activation': ['relu', 'tanh'],
#     'solver': ['adam', 'sgd'],
#     'alpha': [0.0001, 0.001, 0.01],
#     'learning_rate': ['constant', 'adaptive']
# }

# mlp = MLPRegressor(max_iter=1000, random_state=42)

# grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='r2', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

In [None]:
# mlp = MLPRegressor(hidden_layer_sizes=(100,30), 
#                    activation='relu',  
#                    solver='adam',      
#                    alpha=0.0001,       
#                    max_iter=1000,
#                    random_state=42,
#                    learning_rate= 'constant')


# mlp.fit(X_train, y_train)

# y_pred = mlp.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"MSE: {mse}")
# print(f"R² Score: {r2}")

# print("Actual vs Predicted:")
# results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print(results.head())