In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
df = pd.read_csv('cleaned_election.csv')
df.head()

Unnamed: 0,state,constituency,name,winner,party,symbol,gender,criminal cases,age,category,education,assets,liabilities,general votes,postal votes,total votes,percent of electors,percent of total votes,total electors
0,Telangana,ADILABAD,SOYAM BAPU RAO,1,BJP,Lotus,MALE,52,52.0,ST,12th Pass,3099414.0,231450.0,376892,482,377374,25.330684,35.468248,1489790
1,Telangana,ADILABAD,Godam Nagesh,0,TRS,Car,MALE,0,54.0,ST,Post Graduate,18477888.0,847000.0,318665,149,318814,21.399929,29.96437,1489790
2,Telangana,ADILABAD,RATHOD RAMESH,0,INC,Hand,MALE,3,52.0,ST,12th Pass,36491000.0,15300000.0,314057,181,314238,21.092771,29.534285,1489790
3,Uttar Pradesh,AGRA,Satyapal Singh Baghel,1,BJP,Lotus,MALE,5,58.0,SC,Doctorate,74274036.0,8606522.0,644459,2416,646875,33.383823,56.464615,1937690
4,Uttar Pradesh,AGRA,Manoj Kumar Soni,0,BSP,Elephant,MALE,0,47.0,SC,Post Graduate,133784385.0,22251891.0,434199,1130,435329,22.46639,37.999125,1937690


In [25]:
df.drop(columns=['name', 'symbol', 'general votes', 'postal votes', 'total votes', 'percent of electors', 'percent of total votes'], axis = 1, inplace=True)

In [26]:
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors
0,Telangana,ADILABAD,1,BJP,MALE,52,52.0,ST,12th Pass,3099414.0,231450.0,1489790
1,Telangana,ADILABAD,0,TRS,MALE,0,54.0,ST,Post Graduate,18477888.0,847000.0,1489790
2,Telangana,ADILABAD,0,INC,MALE,3,52.0,ST,12th Pass,36491000.0,15300000.0,1489790
3,Uttar Pradesh,AGRA,1,BJP,MALE,5,58.0,SC,Doctorate,74274036.0,8606522.0,1937690
4,Uttar Pradesh,AGRA,0,BSP,MALE,0,47.0,SC,Post Graduate,133784385.0,22251891.0,1937690


In [27]:
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['gender'].value_counts()

1    1760
0     258
Name: gender, dtype: int64

In [28]:
df['balance assets'] = df['assets'] - df['liabilities']
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,12th Pass,3099414.0,231450.0,1489790,2867964.0
1,Telangana,ADILABAD,0,TRS,1,0,54.0,ST,Post Graduate,18477888.0,847000.0,1489790,17630888.0
2,Telangana,ADILABAD,0,INC,1,3,52.0,ST,12th Pass,36491000.0,15300000.0,1489790,21191000.0
3,Uttar Pradesh,AGRA,1,BJP,1,5,58.0,SC,Doctorate,74274036.0,8606522.0,1937690,65667514.0
4,Uttar Pradesh,AGRA,0,BSP,1,0,47.0,SC,Post Graduate,133784385.0,22251891.0,1937690,111532494.0


In [29]:
edu_dict = {'Illiterate': 0, 'Literate': 1, '5th Pass': 2, '8th Pass': 3, '10th Pass': 4, 
 'Others': 5, '12th Pass': 6, 'Graduate': 7, 'Post Graduate': 8, 'Doctorate': 9}

In [30]:
df.replace({'education': edu_dict}, inplace=True)
df['education'].value_counts()

7    777
8    503
6    256
4    196
3     78
9     73
5     72
1     30
2     28
0      5
Name: education, dtype: int64

In [31]:
df.select_dtypes(include='object').columns

Index(['state', 'constituency', 'party', 'category'], dtype='object')

In [13]:
df = pd.get_dummies(columns=['state', 'constituency', 'party', 'category'], data = df)
df.head(2)

Unnamed: 0,winner,gender,criminal cases,age,education,assets,liabilities,total electors,balance assets,state_Andaman & Nicobar Islands,...,party_VPI,party_VSIP,party_WAP,party_WPOI,party_YKP,party_YSRCP,party_ravp,category_GENERAL,category_SC,category_ST
0,1,1,52,52.0,6,3099414.0,231450.0,1489790,2867964.0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,54.0,8,18477888.0,847000.0,1489790,17630888.0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
scale = StandardScaler()
columns_to_scale = ['criminal cases', 'age', 'assets', 'liabilities', 'total electors', 'balance assets']
df[columns_to_scale] = scale.fit_transform(df[columns_to_scale])
df[columns_to_scale].head(3)

Unnamed: 0,criminal cases,age,assets,liabilities,total electors,balance assets
0,6.620242,-0.023051,-0.311731,-0.218126,-0.538876,-0.295544
1,-0.190426,0.145491,-0.27442,-0.211243,-0.538876,-0.255507
2,0.202498,-0.023051,-0.230717,-0.049632,-0.538876,-0.245853


In [15]:
df.head()

Unnamed: 0,winner,gender,criminal cases,age,education,assets,liabilities,total electors,balance assets,state_Andaman & Nicobar Islands,...,party_VPI,party_VSIP,party_WAP,party_WPOI,party_YKP,party_YSRCP,party_ravp,category_GENERAL,category_SC,category_ST
0,1,1,6.620242,-0.023051,6,-0.311731,-0.218126,-0.538876,-0.295544,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,-0.190426,0.145491,8,-0.27442,-0.211243,-0.538876,-0.255507,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0.202498,-0.023051,6,-0.230717,-0.049632,-0.538876,-0.245853,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,0.464446,0.482577,9,-0.139047,-0.124477,0.876978,-0.125235,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,-0.190426,-0.444408,8,0.005336,0.028103,0.876978,-0.000851,0,...,0,0,0,0,0,0,0,0,1,0


In [16]:
X = df.drop('winner', axis = 1)
y = df['winner']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

In [18]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [19]:
rf_predict = rf_model.predict(X_test)

In [20]:
confusion_matrix(y_test, rf_predict)

array([[286,  10],
       [ 33,  75]], dtype=int64)

In [21]:
accuracy_score(y_test, rf_predict)

0.8935643564356436

In [22]:
cons_per_state = {}
voters_per_state = {}
voters_per_cons = {}

In [32]:
subset = df[['state', 'constituency', 'total electors']]
gk = subset.groupby('state')

In [33]:
# for each state
for name,group in gk:
    # total constituencies per state
    cons_per_state[name] = len(group)
    # total voters per state
    voters_per_state[name] = group['total electors'].sum()

In [34]:
# Total voters per constituency
subset = df[['constituency', 'total electors']]
gk2 = subset.groupby('constituency')

In [35]:
# for each constituency
for name,group in gk2:
    voters_per_cons[name] = len(group)

In [36]:
total_cons_per_state = []
total_voters_per_state = []
total_voters_per_cons = []

In [37]:
for row in df.itertuples():
    subkey = row.constituency + '_' + row.party
    total_cons_per_state.append(cons_per_state.get(row.state))
    total_voters_per_state.append(voters_per_state.get(row.state))
    total_voters_per_cons.append(voters_per_cons.get(row.constituency))

In [38]:
df['total_cons_per_state'] = total_cons_per_state #
df['total_voters_per_state'] = total_voters_per_state #
df['total_voters_per_cons'] = total_voters_per_cons #

In [39]:
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets,total_cons_per_state,total_voters_per_state,total_voters_per_cons
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0,74,127212819,3
1,Telangana,ADILABAD,0,TRS,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0,74,127212819,3
2,Telangana,ADILABAD,0,INC,1,3,52.0,ST,6,36491000.0,15300000.0,1489790,21191000.0,74,127212819,3
3,Uttar Pradesh,AGRA,1,BJP,1,5,58.0,SC,9,74274036.0,8606522.0,1937690,65667514.0,251,458868319,3
4,Uttar Pradesh,AGRA,0,BSP,1,0,47.0,SC,8,133784385.0,22251891.0,1937690,111532494.0,251,458868319,3


In [44]:
#df.to_csv('new_data.csv', index=False)

In [4]:
df = pd.read_csv('new_data.csv')
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets,total_cons_per_state,total_voters_per_state,total_voters_per_cons
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0,74,127212819,3
1,Telangana,ADILABAD,0,TRS,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0,74,127212819,3
2,Telangana,ADILABAD,0,INC,1,3,52.0,ST,6,36491000.0,15300000.0,1489790,21191000.0,74,127212819,3
3,Uttar Pradesh,AGRA,1,BJP,1,5,58.0,SC,9,74274036.0,8606522.0,1937690,65667514.0,251,458868319,3
4,Uttar Pradesh,AGRA,0,BSP,1,0,47.0,SC,8,133784385.0,22251891.0,1937690,111532494.0,251,458868319,3


In [5]:
df.drop(columns=['total_cons_per_state', 'total_voters_per_state', 'total_voters_per_cons'],axis = 1, inplace = True)

In [6]:
df.head(2)

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0
1,Telangana,ADILABAD,0,TRS,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0


In [61]:
df.select_dtypes(exclude='object').columns

Index(['winner', 'gender', 'criminal cases', 'age', 'education', 'assets',
       'liabilities', 'total electors', 'balance assets',
       'total_cons_per_state', 'total_voters_per_state',
       'total_voters_per_cons'],
      dtype='object')

In [62]:
new_to_scale = ['criminal cases', 'age', 'education', 'assets',
       'liabilities', 'total electors', 'balance assets',
       'total_cons_per_state', 'total_voters_per_state',
       'total_voters_per_cons']

In [63]:
df[new_to_scale] = scale.fit_transform(df[new_to_scale])

In [64]:
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets,total_cons_per_state,total_voters_per_state,total_voters_per_cons
0,Telangana,ADILABAD,1,BJP,1,6.620242,-0.023051,ST,-0.293924,-0.311731,-0.218126,-0.538876,-0.295544,-0.776598,-0.739586,-0.763522
1,Telangana,ADILABAD,0,TRS,1,-0.190426,0.145491,ST,0.878286,-0.27442,-0.211243,-0.538876,-0.255507,-0.776598,-0.739586,-0.763522
2,Telangana,ADILABAD,0,INC,1,0.202498,-0.023051,ST,-0.293924,-0.230717,-0.049632,-0.538876,-0.245853,-0.776598,-0.739586,-0.763522
3,Uttar Pradesh,AGRA,1,BJP,1,0.464446,0.482577,SC,1.464391,-0.139047,-0.124477,0.876978,-0.125235,1.52078,1.63344,-0.763522
4,Uttar Pradesh,AGRA,0,BSP,1,-0.190426,-0.444408,SC,0.878286,0.005336,0.028103,0.876978,-0.000851,1.52078,1.63344,-0.763522


In [65]:
df = pd.get_dummies(data=df)

In [66]:
df.head(3)

Unnamed: 0,winner,gender,criminal cases,age,education,assets,liabilities,total electors,balance assets,total_cons_per_state,...,party_VPI,party_VSIP,party_WAP,party_WPOI,party_YKP,party_YSRCP,party_ravp,category_GENERAL,category_SC,category_ST
0,1,1,6.620242,-0.023051,-0.293924,-0.311731,-0.218126,-0.538876,-0.295544,-0.776598,...,0,0,0,0,0,0,0,0,0,1
1,0,1,-0.190426,0.145491,0.878286,-0.27442,-0.211243,-0.538876,-0.255507,-0.776598,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0.202498,-0.023051,-0.293924,-0.230717,-0.049632,-0.538876,-0.245853,-0.776598,...,0,0,0,0,0,0,0,0,0,1


In [67]:
X = df.drop('winner', axis=1)
y = df['winner']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

In [69]:
new_rf_model = rf.fit(X_train, y_train)

In [70]:
new_rf_predict = new_rf_model.predict(X_test)

In [71]:
confusion_matrix(new_rf_predict, y_test)

array([[284,  33],
       [ 12,  75]], dtype=int64)

In [72]:
accuracy_score(y_test, new_rf_predict)

0.8886138613861386

In [27]:
df.groupby('party')['winner'].count().nlargest(15).index

Index(['BJP', 'INC', 'IND', 'BSP', 'CPI(M)', 'AITC', 'VBA', 'SP', 'NTK', 'MNM',
       'SHS', 'AAP', 'TDP', 'YSRCP', 'DMK'],
      dtype='object', name='party')

In [28]:
largest_parties = ['BJP', 'INC', 'IND', 'BSP', 'CPI(M)', 'AITC', 'VBA', 'SP', 'NTK', 'MNM',
       'SHS', 'AAP', 'TDP', 'YSRCP', 'DMK']

In [29]:
for idx, party in enumerate(df['party']):
    if party in largest_parties:
        continue
    else:
        df['party'][idx] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['party'][idx] = 'Other'


In [30]:
df['party'].value_counts()

BJP       420
INC       413
Other     390
IND       201
BSP       163
CPI(M)    100
VBA        47
AITC       47
SP         39
NTK        38
MNM        36
SHS        26
TDP        25
YSRCP      25
AAP        25
DMK        23
Name: party, dtype: int64

In [31]:
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0
1,Telangana,ADILABAD,0,Other,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0
2,Telangana,ADILABAD,0,INC,1,3,52.0,ST,6,36491000.0,15300000.0,1489790,21191000.0
3,Uttar Pradesh,AGRA,1,BJP,1,5,58.0,SC,9,74274036.0,8606522.0,1937690,65667514.0
4,Uttar Pradesh,AGRA,0,BSP,1,0,47.0,SC,8,133784385.0,22251891.0,1937690,111532494.0


In [35]:
#df.to_csv('new_cleaned.csv', index=False)

In [36]:
df = pd.read_csv('new_cleaned.csv')
df.head()

Unnamed: 0,state,constituency,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets
0,Telangana,ADILABAD,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0
1,Telangana,ADILABAD,0,Other,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0
2,Telangana,ADILABAD,0,INC,1,3,52.0,ST,6,36491000.0,15300000.0,1489790,21191000.0
3,Uttar Pradesh,AGRA,1,BJP,1,5,58.0,SC,9,74274036.0,8606522.0,1937690,65667514.0
4,Uttar Pradesh,AGRA,0,BSP,1,0,47.0,SC,8,133784385.0,22251891.0,1937690,111532494.0


In [37]:
df.drop('constituency', axis=1, inplace = True)

In [38]:
df.head()

Unnamed: 0,state,winner,party,gender,criminal cases,age,category,education,assets,liabilities,total electors,balance assets
0,Telangana,1,BJP,1,52,52.0,ST,6,3099414.0,231450.0,1489790,2867964.0
1,Telangana,0,Other,1,0,54.0,ST,8,18477888.0,847000.0,1489790,17630888.0
2,Telangana,0,INC,1,3,52.0,ST,6,36491000.0,15300000.0,1489790,21191000.0
3,Uttar Pradesh,1,BJP,1,5,58.0,SC,9,74274036.0,8606522.0,1937690,65667514.0
4,Uttar Pradesh,0,BSP,1,0,47.0,SC,8,133784385.0,22251891.0,1937690,111532494.0


In [41]:
df['state'].value_counts().index

Index(['Uttar Pradesh', 'Bihar', 'Tamil Nadu', 'Maharashtra', 'West Bengal',
       'Andhra Pradesh', 'Madhya Pradesh', 'Rajasthan', 'Telangana', 'Odisha',
       'Karnataka', 'Gujarat', 'Jharkhand', 'Kerala', 'Punjab', 'Assam',
       'Haryana', 'Chhattisgarh', 'Jammu & Kashmir', 'NCT OF Delhi',
       'Uttarakhand', 'Manipur', 'Arunachal Pradesh', 'Himachal Pradesh',
       'Tripura', 'Dadra & Nagar Haveli', 'Goa', 'Meghalaya',
       'Andaman & Nicobar Islands', 'Mizoram', 'Puducherry', 'Chandigarh',
       'Sikkim', 'Lakshadweep', 'Daman & Diu', 'Nagaland'],
      dtype='object')

In [42]:
list_of_states = ['Uttar Pradesh', 'Bihar', 'Tamil Nadu', 'Maharashtra', 'West Bengal',
       'Andhra Pradesh', 'Madhya Pradesh', 'Rajasthan', 'Telangana', 'Odisha',
       'Karnataka', 'Gujarat', 'Jharkhand', 'Kerala', 'Punjab', 'Assam',
       'Haryana', 'Chhattisgarh', 'Jammu & Kashmir', 'NCT OF Delhi',
       'Uttarakhand', 'Manipur', 'Arunachal Pradesh', 'Himachal Pradesh',
       'Tripura', 'Dadra & Nagar Haveli', 'Goa', 'Meghalaya',
       'Andaman & Nicobar Islands', 'Mizoram', 'Puducherry', 'Chandigarh',
       'Sikkim', 'Lakshadweep', 'Daman & Diu', 'Nagaland']

In [43]:
list_of_states.sort()

In [44]:
list_of_states

['Andaman & Nicobar Islands',
 'Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Dadra & Nagar Haveli',
 'Daman & Diu',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu & Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Lakshadweep',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'NCT OF Delhi',
 'Nagaland',
 'Odisha',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'West Bengal']

In [45]:
len(list_of_states)

36