# Classifications

In [None]:
Special case has been take to find number of epochs where loss stabilizes

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

## 1. Are competitive elections governed by voter turnout and number of candidates?

In [18]:
def is_comp(min_turnout_perc = 30, competitive_cutoff = 3, num_epochs = 10000):
    required_columns = [
        'Constituency_Name',
        'N_Cand',
        'Turnout_Percentage',
        'Position',
        'Vote_Share_Percentage'
    ]

    df = pd.read_csv('./data/All_States_GE.csv', keep_default_na=False)[required_columns]
    cleaned_df = df[df['N_Cand'] > 1]
    cleaned_df = cleaned_df[cleaned_df['Constituency_Name'] != '']
    cleaned_df = cleaned_df[cleaned_df['Turnout_Percentage'] != '']
    cleaned_df = cleaned_df[cleaned_df['Vote_Share_Percentage'] != '']

    cleaned_df['Turnout_Percentage'] = cleaned_df['Turnout_Percentage'].astype(float)
    cleaned_df = cleaned_df[cleaned_df['Turnout_Percentage'] > min_turnout_perc]
    
    cleaned_df['Vote_Share_Percentage'] = cleaned_df['Vote_Share_Percentage'].astype(float)
    
    cleaned_df = cleaned_df[cleaned_df['Position'].isin([1, 2])]
    only_winner_df = cleaned_df[cleaned_df['Position'].isin([1])]

    df = cleaned_df.reset_index(drop=True)
    only_winner_df = only_winner_df.reset_index(drop=True)
    
    only_winner_df['Vote_Share_Difference'] = df[df['Position'] == 1]['Vote_Share_Percentage'].values - df[df['Position'] == 2]['Vote_Share_Percentage'].values
    
    only_winner_df['Competitive_Election'] = only_winner_df['Vote_Share_Difference'] >= competitive_cutoff
    only_winner_df['Competitive_Election'] = only_winner_df['Competitive_Election'].astype(float)
    df = only_winner_df.reset_index(drop=True)
    
    # print(df.head())
    
    X = df[['N_Cand', 'Turnout_Percentage']]
    y = df['Competitive_Election']

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

    class LogisticRegressionModel(nn.Module):
        def __init__(self, input_dim):
            super(LogisticRegressionModel, self).__init__()
            self.linear = nn.Linear(input_dim, 1)

        def forward(self, x):
            out = torch.sigmoid(self.linear(x))
            return out

    input_dim = X_train_tensor.shape[1]
    model = LogisticRegressionModel(input_dim)

    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % (num_epochs//5) == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        predicted = (test_outputs >= 0.5).float()

    accuracy = accuracy_score(y_test_tensor, predicted)
    precision = precision_score(y_test_tensor, predicted)
    recall = recall_score(y_test_tensor, predicted)
    f1 = f1_score(y_test_tensor, predicted)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    
    return float(accuracy), float(precision), float(recall), float(f1)

In [19]:
table_data = []
table_head = ["min_turnout_perc", "competitive_cutoff", "accuracy", "precision", "recall", "f1"]
for min_turnout_perc in [10, 15, 20, 30]:
    for competitive_cutoff in [0.5, 1, 1.5, 2, 3, 5]:
        print("\t", min_turnout_perc, competitive_cutoff)
        accuracy, precision, recall, f1 = is_comp(min_turnout_perc, competitive_cutoff)
        print("---**---\n")
        
        table_data.append([min_turnout_perc, competitive_cutoff, accuracy, precision, recall, f1])

	 10 0.5
Epoch [2000/10000], Loss: 0.1283
Epoch [4000/10000], Loss: 0.1133
Epoch [6000/10000], Loss: 0.1103
Epoch [8000/10000], Loss: 0.1095
Epoch [10000/10000], Loss: 0.1092
Accuracy: 0.9746
Precision: 0.9746
Recall: 1.0000
F1 Score: 0.9871
---**---

	 10 1
Epoch [2000/10000], Loss: 0.2022
Epoch [4000/10000], Loss: 0.1935
Epoch [6000/10000], Loss: 0.1926
Epoch [8000/10000], Loss: 0.1925
Epoch [10000/10000], Loss: 0.1925
Accuracy: 0.9625
Precision: 0.9625
Recall: 1.0000
F1 Score: 0.9809
---**---

	 10 1.5
Epoch [2000/10000], Loss: 0.2607
Epoch [4000/10000], Loss: 0.2570
Epoch [6000/10000], Loss: 0.2535
Epoch [8000/10000], Loss: 0.2522
Epoch [10000/10000], Loss: 0.2522
Accuracy: 0.9274
Precision: 0.9274
Recall: 1.0000
F1 Score: 0.9623
---**---

	 10 2
Epoch [2000/10000], Loss: 0.3153
Epoch [4000/10000], Loss: 0.3127
Epoch [6000/10000], Loss: 0.3127
Epoch [8000/10000], Loss: 0.3127
Epoch [10000/10000], Loss: 0.3127
Accuracy: 0.9105
Precision: 0.9105
Recall: 1.0000
F1 Score: 0.9531
---**-

In [20]:
def format_max(value, cutoff = 0.95):
    if value >= cutoff:
        return f"\033[1;31m{value}\033[0m"
    else:
        return value
    
table_data = [[min_turnout_perc, competitive_cutoff, format_max(accuracy), format_max(precision), format_max(recall), format_max(f1)] for [min_turnout_perc, competitive_cutoff, accuracy, precision, recall, f1] in table_data]

print(tabulate(table_data, headers=table_head, tablefmt="grid"))

+--------------------+----------------------+------------+-------------+----------+----------+
|   min_turnout_perc |   competitive_cutoff |   accuracy |   precision |   recall |       f1 |
|                 10 |                  0.5 |   [1;31m0.974592[0m |    [1;31m0.974592[0m | [1;31m1[0m        | [1;31m0.987132[0m |
+--------------------+----------------------+------------+-------------+----------+----------+
|                 10 |                  1   |   [1;31m0.962492[0m |    [1;31m0.962492[0m | [1;31m1[0m        | [1;31m0.980888[0m |
+--------------------+----------------------+------------+-------------+----------+----------+
|                 10 |                  1.5 |   0.927405 |    0.927405 | [1;31m1[0m        | [1;31m0.962335[0m |
+--------------------+----------------------+------------+-------------+----------+----------+
|                 10 |                  2   |   0.910466 |    0.910466 | [1;31m1[0m        | [1;31m0.953135[0m |
+------------

### Conclusions

We have aimed to predict whether an election was competitive using the number of candidates that were standing and the turnout percentage.
- Number of candidates as a higher number of candidates indicates that the constituency is important or large enough to warrant a good competitive election
- Turnout percentage as a a higher turnout percentage indicates that people have differing affiliations in the constituency and that each group would like for their supported candidate to win. If turnout percentage is low then it might be the case that almost everyone in the constituency already have an idea of who is going to win and hence, the election wouldn't be competitive.

We verify these claims by running logistic regression with the inputs as number of candidates and voter turnout percentage and attempt to classify an election as competitive or not.

We define a competitive election as one in which the first and second candidates differ by `competitive_cutoff` in their vote shares. 

Based on my results we can say with confidence that our initial hypotheses were correct as all considered measures report high scores especially in the case in which the cutoff for competitive elections are very stringent (0.5 and 1.0%)


## 2. How many terms has the MP won based on a few parameters

In [3]:
df = pd.read_csv('./data/All_States_GE.csv', keep_default_na=False)
df['No_Terms'].unique()

array(['1', '0', '', '2', '5', '4', '3', '6', '7', '8', '9', '10', '11',
       0, 3, 1, 2, 4, 5, 6, 7, 8, 10, 9], dtype=object)

In [3]:
def find_terms(required_columns = ['State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested', 'No_Terms', 'Turncoat'], num_epochs = 10000):
    major_columns = ['State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested', 'No_Terms', 'Turncoat']
    
    df = pd.read_csv('./data/All_States_GE.csv', keep_default_na=False)[major_columns]
    
    cleaned_df = df[df['Position'] != '']
    cleaned_df = cleaned_df[cleaned_df['State_Name'] != '']
    cleaned_df = cleaned_df[cleaned_df['Party'] != '']
    cleaned_df = cleaned_df[cleaned_df['Deposit_Lost'] != '']
    cleaned_df = cleaned_df[cleaned_df['Contested'] != '']
    cleaned_df = cleaned_df[cleaned_df['No_Terms'] != '']
    cleaned_df = cleaned_df[cleaned_df['Turncoat'] != '']
    
    cleaned_df['Position'] = cleaned_df['Position'].astype(int)
    cleaned_df['Deposit_Lost'] = cleaned_df['Deposit_Lost'].astype(str)
    cleaned_df['Contested'] = cleaned_df['Contested'].astype(int)
    cleaned_df['No_Terms'] = cleaned_df['No_Terms'].astype(int)
    cleaned_df['Turncoat'] = cleaned_df['Turncoat'].astype(bool)

    df = cleaned_df[required_columns]
    
    label_encoders = {}
    for column in df.columns:
        if df[column].dtype == 'object':
            label_encoders[column] = LabelEncoder()
            df[column] = label_encoders[column].fit_transform(df[column])
    
    # print(df.head())
    
    X = df.drop(columns=['No_Terms'])
    y = df['No_Terms']
    
    def map_to_categories(terms):
        return terms//3

    y = y.map(map_to_categories)
    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)

    X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)
    
    # return
    
    class LogisticRegressionModel(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(LogisticRegressionModel, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)

        def forward(self, x):
            out = self.linear(x)
            return out

    input_dim = X_train.shape[1]
    output_dim = len(y.unique())
    model = LogisticRegressionModel(input_dim, output_dim)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        outputs = model(X_train)
        loss = criterion(outputs, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % (num_epochs//5) == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        _, predicted = torch.max(test_outputs.data, 1)
        
    predicted_categories = [y.unique()[p] for p in predicted]

    accuracy = accuracy_score(y_test, predicted_categories)
    precision = precision_score(y_test, predicted_categories, average='micro')
    recall = recall_score(y_test, predicted_categories, average='micro')
    f1 = f1_score(y_test, predicted_categories, average='micro')

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    
    return float(accuracy), float(precision), float(recall), float(f1)

In [4]:
def find_subsets(input_list):
    all_subsets = []
    for subset_length in range(5, len(input_list) + 1):
        subsets = combinations(input_list, subset_length)
        all_subsets.extend(subsets)
    return all_subsets

all_labels = ['State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested', 'Turncoat']
considered_labels = find_subsets(all_labels)

table_data = []
table_head = ["Columns Considered", "accuracy", "precision", "recall", "f1"]

for i in considered_labels:
    cur_list = list(i)
    cur_list.append('No_Terms')
    print("\t", i)
    accuracy, precision, recall, f1 = find_terms(required_columns = cur_list, num_epochs = 20000)
    table_data.append([str(i), accuracy, precision, recall, f1])
    print()

	 ('State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested')
Epoch [4000/20000], Loss: 0.0757
Epoch [8000/20000], Loss: 0.0641
Epoch [12000/20000], Loss: 0.0602
Epoch [16000/20000], Loss: 0.0581
Epoch [20000/20000], Loss: 0.0568
Accuracy: 0.9768
Precision: 0.9768
Recall: 0.9768
F1 Score: 0.9768

	 ('State_Name', 'Position', 'Party', 'Deposit_Lost', 'Turncoat')
Epoch [4000/20000], Loss: 0.1174
Epoch [8000/20000], Loss: 0.1088
Epoch [12000/20000], Loss: 0.1063
Epoch [16000/20000], Loss: 0.1051
Epoch [20000/20000], Loss: 0.1045
Accuracy: 0.9689
Precision: 0.9689
Recall: 0.9689
F1 Score: 0.9689

	 ('State_Name', 'Position', 'Party', 'Contested', 'Turncoat')
Epoch [4000/20000], Loss: 0.0854
Epoch [8000/20000], Loss: 0.0760
Epoch [12000/20000], Loss: 0.0727
Epoch [16000/20000], Loss: 0.0706
Epoch [20000/20000], Loss: 0.0692
Accuracy: 0.9745
Precision: 0.9745
Recall: 0.9745
F1 Score: 0.9745

	 ('State_Name', 'Position', 'Deposit_Lost', 'Contested', 'Turncoat')
Epoch [4000/20000], Loss: 

In [5]:
print(tabulate(table_data, headers=table_head, tablefmt="grid"))

+------------------------------------------------------------------------------+------------+-------------+----------+----------+
| Columns Considered                                                           |   accuracy |   precision |   recall |       f1 |
| ('State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested')             |   0.976757 |    0.976757 | 0.976757 | 0.976757 |
+------------------------------------------------------------------------------+------------+-------------+----------+----------+
| ('State_Name', 'Position', 'Party', 'Deposit_Lost', 'Turncoat')              |   0.968917 |    0.968917 | 0.968917 | 0.968917 |
+------------------------------------------------------------------------------+------------+-------------+----------+----------+
| ('State_Name', 'Position', 'Party', 'Contested', 'Turncoat')                 |   0.974493 |    0.974493 | 0.974493 | 0.974493 |
+------------------------------------------------------------------------------+----------

### Conclusion

The number of terms a candidate has won (including the current election) may depend on
- State
- Party
- Turncoat (Whether the candidate has changed parties)
- Position in the current election
- Whether the candidate lost their deposit (received less than 1/6th total votes)
- Number of elections the candidate has contested in

We classify the number of elections won as:
$$ floor(\frac{Wins}{3}) $$


We also consider the accuracy of the model by leaving out each of the factors to notice that the accuracy is diminished only in the case of neglecting whether the candidate had contested in the previous election cycle.

We gain the following insights:
- Contesting in the previous might not matter as much as the other factors to determine how many elections the candidate has won. So, people vote more on the basis of the other parameters.
- Dropping other factors doesn't cause much change in the accuracy of the model.

Further study on being given access to more compute can be done for the other subsets of `['State_Name', 'Position', 'Party', 'Deposit_Lost', 'Contested', 'Turncoat']`.
