In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
df = pd.read_csv('/content/drive/MyDrive/Final Election Project/Complete 2002.csv')

In [29]:
df.head()

Unnamed: 0,election_date,election_type,contest_status,constituency_number,constituency_name,province,assembly,voter_reg,validated_votes,votes_disq,candidateID,candidate_name,candidate_party,candidate_votes,candidate_share,candidate_rank,outcome
0,2002-10-10,General Election,Contested,NA-1,Peshawar 1,Khyber Paktunkhwa (NWFP),National,233907.0,65642.0,1552.0,cxEaOeoEpx,Shabir Ahmad,Muttahidda Majlis-e-Amal Pakistan,37179.0,0.56639,1.0,Win
1,2002-10-10,General Election,Contested,NA-1,Peshawar 1,Khyber Paktunkhwa (NWFP),National,233907.0,65642.0,1552.0,JXBry680U4,Usman Bashir Bilour,Awami National Party,23002.0,0.350416,2.0,Loss
2,2002-10-10,General Election,Contested,NA-1,Peshawar 1,Khyber Paktunkhwa (NWFP),National,233907.0,65642.0,1552.0,20bRcNfCcx,Sajid Abdullah,Pakistan Tehreek-e-Insaf,2029.0,0.03091,3.0,Loss
3,2002-10-10,General Election,Contested,NA-1,Peshawar 1,Khyber Paktunkhwa (NWFP),National,233907.0,65642.0,1552.0,3d2MkDSVO4,"M. Khurshid Khan, Adv.",Independents,1537.0,0.023415,4.0,Loss
4,2002-10-10,General Election,Contested,NA-1,Peshawar 1,Khyber Paktunkhwa (NWFP),National,233907.0,65642.0,1552.0,GZMMicWhS6,Muhammad Muazzam Butt,Pakistan Muslim League (Qaid-e-Azam),1417.0,0.021587,5.0,Loss


In [5]:
desire_col = ['constituency_number','province','candidate_name','candidate_party','voter_reg','outcome']
data = df[desire_col]

In [6]:
df.isnull().sum()

election_date          0
election_type          0
contest_status         0
constituency_number    0
constituency_name      0
province               0
assembly               0
voter_reg              0
validated_votes        0
votes_disq             0
candidateID            0
candidate_name         0
candidate_party        0
candidate_votes        0
candidate_share        0
candidate_rank         0
outcome                0
dtype: int64

In [7]:
len(df['constituency_number'].unique())

272

In [30]:
max_candidate_name_length = 40
max_candidate_party_length = 40

In [31]:
def tokenize_and_pad(data, max_len):
    sequences = pad_sequences(data, maxlen=max_len, padding='post')
    return sequences.tolist()

In [32]:
data['candidate_name'] = data.groupby('constituency_number')['candidate_name'].transform(lambda x: x.astype(str).tolist())
data['candidate_party'] = data.groupby('constituency_number')['candidate_party'].transform(lambda x: x.astype(str).tolist())

# Extract unique constituencies
constituencies = data['constituency_number'].unique()

In [33]:
all_features, all_labels = [], []

for constituency_number in constituencies:
    group = data[data['constituency_number'] == constituency_number]

    features = group[['province', 'candidate_name', 'candidate_party', 'voter_reg' ]]
    labels = group['outcome'].astype('category').cat.codes

    all_features.append(features)
    all_labels.append(labels)

processed_data = pd.concat(all_features, ignore_index=True)
processed_data['outcome'] = pd.concat(all_labels, ignore_index=True)

In [34]:
# Convert text columns to sequences
tok = Tokenizer(num_words=10000)
tok.fit_on_texts(processed_data['candidate_name'].astype(str))
processed_data['candidate_name_padded'] = tok.texts_to_sequences(processed_data['candidate_name'].astype(str))

tok.fit_on_texts(processed_data['candidate_party'].astype(str))
processed_data['candidate_party_padded'] = tok.texts_to_sequences(processed_data['candidate_party'].astype(str))

In [36]:
X_train, X_test, y_train, y_test = train_test_split(processed_data[['province', 'candidate_name_padded', 'candidate_party_padded','voter_reg']], processed_data['outcome'], test_size=0.2, random_state=42)

In [37]:
from keras.layers import Bidirectional

# Model
model = Sequential()
model.add(Embedding(10000, 128, input_length=max_candidate_name_length))
model.add(LSTM(64, return_sequences=True))  # Return sequences to pass to the next LSTM layer
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))  # Return sequences again
model.add(LSTM(32))  # No need to return sequences for the last LSTM layer
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [38]:
X_train_padded = pad_sequences(X_train['candidate_name_padded'].tolist(), maxlen=max_candidate_name_length, padding='post', truncating='post', dtype='float32')

# Assuming max_candidate_name_length is the length of your padded sequences
timesteps = max_candidate_name_length
features = 1  # Assuming each element in the sequence is a single feature

X_train_padded = X_train_padded.reshape((X_train_padded.shape[0], timesteps, features))

# Train the model
model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7933b4172bf0>

In [43]:
model.save('/content/drive/MyDrive/Final Election Project/2002_model.h5')
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 40, 128)           1280000   
                                                                 
 lstm_9 (LSTM)               (None, 40, 64)            49408     
                                                                 
 dropout_3 (Dropout)         (None, 40, 64)            0         
                                                                 
 lstm_10 (LSTM)              (None, 40, 32)            12416     
                                                                 
 lstm_11 (LSTM)              (None, 32)                8320      
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 1)                

  saving_api.save_model(


In [40]:
X_test_padded = pad_sequences(X_test['candidate_name_padded'].tolist(), maxlen=max_candidate_name_length, padding='post', truncating='post', dtype='float32')
X_test_padded = X_test_padded.reshape((X_test_padded.shape[0], timesteps, features))

# Evaluate the model on the test set
evaluation = model.evaluate(X_test_padded, y_test)
print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

Test Loss: 0.40105652809143066
Test Accuracy: 0.8623188138008118


In [42]:
predictions_2002 = model.predict(X_test_padded)
predictions_2002



array([[0.1290478 ],
       [0.12904698],
       [0.12903276],
       [0.12902714],
       [0.12903674],
       [0.12903425],
       [0.1290258 ],
       [0.12902892],
       [0.12903126],
       [0.12902819],
       [0.12903018],
       [0.1290279 ],
       [0.12905715],
       [0.1290495 ],
       [0.12902795],
       [0.12903033],
       [0.12903528],
       [0.12902519],
       [0.12902784],
       [0.12906814],
       [0.1290326 ],
       [0.1290307 ],
       [0.1290263 ],
       [0.12903553],
       [0.12903868],
       [0.1290278 ],
       [0.1290407 ],
       [0.12903407],
       [0.1290266 ],
       [0.1290289 ],
       [0.12902428],
       [0.12904032],
       [0.12903105],
       [0.12903687],
       [0.12903354],
       [0.12902585],
       [0.12904193],
       [0.12902953],
       [0.12902823],
       [0.12903829],
       [0.12902682],
       [0.12903136],
       [0.12903795],
       [0.12903702],
       [0.12903406],
       [0.12903719],
       [0.12902725],
       [0.129