In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
import warnings 
warnings.filterwarnings( "ignore")
import statsmodels.formula.api as sm
import scipy.stats as stats
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix

In [None]:
ele_df = pd.read_excel("Election_Data.xlsx",sheet_name ="Election_Dataset_Two Classes")
ele_df.head()

In [None]:
ele_df = ele_df.drop("Unnamed: 0",axis = 1)

In [None]:
ele_df.shape

In [None]:
ele_df.info()

In [None]:
ele_df.isnull().sum()


In [None]:
ele_df.describe(include = "all").T

In [None]:
# Skewness is only defined for numeric data. Restricting the frame explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.skew() no longer
# silently drops non-numeric (e.g. string-typed) columns.
ele_df.select_dtypes(include="number").skew()

In [None]:
dups = ele_df.duplicated().sum()
print('Number of duplicate rows = %d' % (dups.sum()))

In [None]:
print('Before',ele_df.shape)
ele_df.drop_duplicates(inplace=True) 
print('After',ele_df.shape)

In [None]:
for column in ele_df.columns:
    if ele_df[column].dtype == 'object':
        print(column.upper(),': ',ele_df[column].nunique())
        print(ele_df[column].value_counts().sort_values())
        print('\n')

1.2 Perform Univariate and Bivariate Analysis. Do exploratory data analysis. Check for Outliers.

In [None]:
# Univariate view of each numeric feature: distribution on the left and a
# box plot (outlier check) on the right, one row per feature.
numeric_features = ['age', 'economic.cond.national', 'economic.cond.household',
                    'Blair', 'Hague', 'Europe', 'political.knowledge']

fig, axes = plt.subplots(nrows=len(numeric_features), ncols=2)
fig.set_size_inches(15,18)

for row, feature_name in enumerate(numeric_features):
    a = sns.distplot(ele_df[feature_name], ax=axes[row][0])
    a.set_title("{} Distribution".format(feature_name), fontsize=10)
    # Pass the data by keyword so the call does not depend on the
    # positional-argument order of the installed seaborn version.
    a = sns.boxplot(y=ele_df[feature_name], orient="v", ax=axes[row][1])
    a.set_title("{} Boxplot".format(feature_name), fontsize=10)

plt.show()

In [None]:
# Class balance of the target. x/y are passed by keyword because seaborn
# >= 0.12 no longer accepts them positionally; counts are computed once.
vote_counts = ele_df.vote.value_counts()
sns.barplot(x=vote_counts.index, y=vote_counts.values)
plt.title("vote: Party choice: Conservative or Labour")
plt.show()
print(ele_df.vote.value_counts(normalize=True))

In [None]:
# Gender split of the respondents. x/y are passed by keyword because seaborn
# >= 0.12 no longer accepts them positionally; counts are computed once.
gender_counts = ele_df.gender.value_counts()
sns.barplot(x=gender_counts.index, y=gender_counts.values)
plt.title("Gender")
plt.show()
print(ele_df.gender.value_counts(normalize=True))

In [None]:
sns.pairplot(ele_df,hue="vote")

In [None]:
plt.figure(figsize=(20,10))
# Compute the correlation matrix once, on numeric columns only: pandas >= 2.0
# raises if non-numeric columns are passed to DataFrame.corr().
corr_matrix = ele_df.select_dtypes(include="number").corr()
# Mask the strict upper triangle so each pair is shown only once.
sns.heatmap(corr_matrix.round(2), annot=True, mask=np.triu(corr_matrix,+1));
plt.title('Correlation Heatmap Plot')
plt.show()

In [None]:
ele_df.columns


In [None]:
plt.figure(figsize=(5,5))
ax = sns.barplot(x='vote', y='economic.cond.national',data=ele_df)
plt.title("vote vs economic.cond.national")

In [None]:
plt.figure(figsize=(10,7))
# Age spread within each national-economy rating. x/y are given by keyword
# because seaborn >= 0.12 rejects them as positional arguments.
sns.stripplot(x=ele_df["economic.cond.national"], y=ele_df["age"], jitter=True)
plt.show()

In [None]:
plt.figure(figsize=(8,8))
# Age spread within each Hague rating. x/y are given by keyword because
# seaborn >= 0.12 rejects them as positional arguments.
sns.stripplot(x=ele_df["Hague"], y=ele_df['age'], jitter=True)
plt.show()

In [None]:
#Catplot Analysis - Hague(count) on economic.cond.household
sns.catplot(x="Hague", kind="count",hue='economic.cond.household',data=ele_df)

In [None]:
#Catplot Analysis - Blair(count) on economic.cond.national
sns.catplot(x="Blair", kind="count",hue='economic.cond.national',data=ele_df)

In [None]:
#Catplot Analysis - Hague(count) on economic.cond.national
sns.catplot(x="Hague", kind="count",hue='economic.cond.national',data=ele_df)

In [None]:
#Catplot Analysis - Hague(count) on Europe
sns.catplot(x="Hague", kind="count",hue='Europe',data=ele_df)

In [None]:
sns.catplot(x="Blair", kind="count",hue='Europe',data=ele_df)


In [None]:
plt.subplots(figsize=(15,10))

sns.boxplot(data=ele_df,orient="h")
plt.show()

In [None]:
def remove_outlier(col):
    """Return the lower and upper 1.5 * IQR fences for ``col``.

    Despite its name, this function does not modify or filter ``col``; it only
    computes the bounds. The caller decides how to treat values outside them.

    Parameters
    ----------
    col : array-like of numbers
        Values to compute the quartiles from. They need not be sorted,
        because ``np.percentile`` handles ordering itself.

    Returns
    -------
    (lower_range, upper_range) : tuple of float
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# Cap every non-object column at its 1.5 * IQR fences: values above the upper
# fence are set to the fence, and likewise for values below the lower fence.
# NOTE(review): this applies to all non-object columns, not only age - confirm
# that capping the other numeric columns is intended.
dtypes=ele_df.dtypes[(ele_df.dtypes!= 'object')].index
for column in ele_df[dtypes].columns:
    # remove_outlier only computes the bounds; the capping happens below.
    lr,ur=remove_outlier(ele_df[column])
    ele_df[column]=np.where(ele_df[column]>ur,ur,ele_df[column])
    ele_df[column]=np.where(ele_df[column]<lr,lr,ele_df[column])

In [None]:
plt.subplots(figsize=(15,10))

sns.boxplot(data=ele_df,orient="h")
plt.show()

In [None]:
# Cast the seven numeric feature columns to int64.
int_columns = ['age', 'economic.cond.national', 'economic.cond.household',
               'Blair', 'Hague', 'Europe', 'political.knowledge']
for int_column in int_columns:
    ele_df[int_column] = ele_df[int_column].astype('int64')

In [None]:
display(ele_df.dtypes)


1.3 Encode the data (having string values) for Modelling. Is Scaling necessary here or not? Data Split: Split the data into train and test (70:30).

In [None]:
# Replace every object-typed column by its integer category codes.
# For each such column, the distinct values and their codes are printed first
# so the label -> code mapping is visible in the output.
for feature in ele_df.columns: 
    if ele_df[feature].dtype == 'object': 
        print('\n')
        print('feature:',feature)
        print(pd.Categorical(ele_df[feature].unique()))
        print(pd.Categorical(ele_df[feature].unique()).codes)
        # Overwrites the column in place with the integer codes.
        ele_df[feature] = pd.Categorical(ele_df[feature]).codes

In [None]:
ele_df.vote.value_counts(normalize=True)


In [None]:
ele_df.info()


In [None]:
# Converting int8 to int64 variables
ele_df['vote'] = ele_df['vote'].astype('int64')
ele_df['gender'] = ele_df['gender'].astype('int64')

In [None]:
ele_df.info()


In [None]:
ele_df_copy = ele_df.copy()
ele_df_copy.head()

In [None]:
#Train-Test Split
X = ele_df.drop('vote', axis=1)

y = ele_df[['vote']]

In [None]:
# Z-score scaling of the 'age' column only; all other columns of X are copied
# unchanged into X_scaled. y_new is a plain copy of y.
# NOTE(review): the mean and std are computed on the full data set, before the
# train/test split of X_scaled, so test rows contribute to the scaling
# statistics - consider computing them on the training rows only.
age1=['age']
X_scaled=X.copy()
y_new = y.copy()
X_scaled[age1] = X_scaled[age1].apply(lambda x:(x-x.mean()) / (x.std()))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30 , random_state=1)

In [None]:
print('The training set for the independent variables:',X_train.shape)
print('The training set for the dependent variable:',y_train.shape)
print('The test set for the independent variables:',X_test.shape)
print('The test set for the dependent variable:',y_test.shape)

In [None]:
X_trains, X_tests, y_trains, y_tests = train_test_split(X_scaled, y_new, test_size=0.30 , random_state=1)


In [None]:
print('The training set for the independent variables:',X_trains.shape)
print('The training set for the dependent variable:',y_trains.shape)
print('The test set for the independent variables:',X_tests.shape)
print('The test set for the dependent variable:',y_tests.shape)

1.4 Apply Logistic Regression and LDA (linear discriminant analysis).


In [None]:
# Fit the Logistic Regression model
modellr = LogisticRegression()
modellr.fit(X_train, y_train)

In [None]:
ytrain_predictlr = modellr.predict(X_train)
ytest_predictlr = modellr.predict(X_test)

In [None]:
ytest_predict_problr=modellr.predict_proba(X_test)
pd.DataFrame(ytest_predict_problr).head()

In [None]:
modellr.score(X_train, y_train)


In [None]:
modellr.score(X_test, y_test)


In [None]:
grid1={'penalty':['l2','none'],
      'solver':['newton-cg',  'saga'],
      'max_iter':[10000,100000],
      'tol':[0.0001,0.001]
     }

In [None]:
modellr1 = LogisticRegression()


In [None]:
grid_searchlr = GridSearchCV(estimator = modellr1, param_grid = grid1, cv = 3,n_jobs=-1,scoring='accuracy')


In [None]:
grid_searchlr.fit(X_train, y_train)


In [None]:
print(grid_searchlr.best_params_,'\n')
print(grid_searchlr.best_estimator_)

In [None]:
best_modellr = grid_searchlr.best_estimator_


In [None]:
ytrain_predictlr = best_modellr.predict(X_train)
ytest_predictlr = best_modellr.predict(X_test)

In [None]:
ytrain_predict_problr=best_modellr.predict_proba(X_train)
pd.DataFrame(ytrain_predict_problr).head()

In [None]:
ytest_predict_problr=best_modellr.predict_proba(X_test)
pd.DataFrame(ytest_predict_problr).head()

In [None]:
accuracy_lr_train= best_modellr.score(X_train, y_train)
accuracy_lr_train

In [None]:
accuracy_lr_test= best_modellr.score(X_test, y_test)
accuracy_lr_test

In [None]:
print("The intercept for the model is :",best_modellr.intercept_)


In [None]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, best_modellr.coef_[0][idx]))

In [None]:
#Feature Importance Graphs
importance = best_modellr.coef_[0]
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
# concatenate X and y into a single dataframe
data_train1 = pd.concat([X_train, y_train], axis=1)
data_test1=pd.concat([X_test,y_test],axis=1)
data_train1.head()

In [None]:
# Fit the Logistic Regression model
model = LogisticRegression()
model.fit(X_trains, y_trains)

In [None]:
ytrain_predict = model.predict(X_trains)
ytest_predict = model.predict(X_tests)

In [None]:
ytest_predict_prob=model.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob).head()

In [None]:
model.score(X_trains, y_trains)


In [None]:
model.score(X_tests, y_tests)


In [None]:
grid={'penalty':['l2'],
      'solver':['saga'],
      'max_iter':[1000,100],
      'tol':[0.00001,0.001]
     }

In [None]:
model1 = LogisticRegression()


In [None]:
grid_search = GridSearchCV(estimator = model1, param_grid = grid, cv = 7,n_jobs=-1,scoring='accuracy')


In [None]:
grid_search.fit(X_trains, y_trains)


In [None]:
print(grid_search.best_params_,'\n')
print(grid_search.best_estimator_)

In [None]:
best_model = grid_search.best_estimator_


In [None]:
ytrain_predictlr1 = best_model.predict(X_trains)
ytest_predictlr1 = best_model.predict(X_tests)

In [None]:
ytrain_predict_prob1=best_model.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob1).head()

In [None]:
ytest_predict_prob1=best_model.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob1).head()

In [None]:
# Accuracy - Training Data
# Training Accuracy
accuracy_lr_train= best_model.score(X_trains, y_trains)
accuracy_lr_train

In [None]:
accuracy_lr_test= best_model.score(X_tests, y_tests)
accuracy_lr_test

In [None]:
print("The intercept for the model is :",best_model.intercept_)


In [None]:
for idx, col_name in enumerate(X_trains.columns):
    print("The coefficient for {} is {}".format(col_name, best_model.coef_[0][idx]))

In [None]:
#Feature Importance Graphs
importance = best_model.coef_[0]
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
# concatenate X and y into a single dataframe
data_train = pd.concat([X_trains, y_trains], axis=1)
data_test=pd.concat([X_tests,y_tests],axis=1)
data_train.head()

In [None]:
#Statsmodels is a Python module which provides various functions for estimating different statistical models and performing statistical tests
#first, we define the set of dependent(y) and independent(X) variables.  If the dependent variable is in non-numeric form, it is first converted to numeric using encoding
import statsmodels.api as sm 

In [None]:
# statsmodels does not add an intercept on its own, so add the constant column
# explicitly; without it the logit model is forced through the origin.
log_reg = sm.Logit(y_trains, sm.add_constant(X_trains)).fit()


In [None]:
print(log_reg.summary()) 


LDA

In [None]:
# Linear Discriminant Analysis(LDA) is a dimensionality reduction technique which is commonly used for the supervised classification problems. 
#It is used for modeling differences in groups i.e. separating two or more classes. It is used to project the features in higher dimension space into a lower dimension space.
# library used in LDA is sklearn

In [None]:
clf = LinearDiscriminantAnalysis()
model_lda=clf.fit(X_trains,y_trains)

In [None]:
# Training Data Class Prediction with a cut-off value of 0.5
pred_class_train_lda = model_lda.predict(X_trains)

# Test Data Class Prediction with a cut-off value of 0.5
pred_class_test_lda = model_lda.predict(X_tests)

In [None]:
ytrain_predict_problda1=model_lda.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_problda1).head()


In [None]:
ytest_predict_problda1=model_lda.predict_proba(X_tests)
pd.DataFrame(ytest_predict_problda1).head()


In [None]:
# Accuracy - Training Data
# Training Accuracy
accuracy_lda_train= model_lda.score(X_trains, y_trains)
accuracy_lda_train

In [None]:
accuracy_lda_test= model_lda.score(X_tests, y_tests)
accuracy_lda_test

In [None]:
grid_lda={
      'solver':['svd', 'lsqr', 'eigen'],
      
     }

In [None]:
model_lda = LinearDiscriminantAnalysis()


In [None]:
grid_search_lda = GridSearchCV(estimator = model_lda, param_grid = grid_lda, cv = 7,n_jobs=-1,scoring='accuracy')


In [None]:
grid_search_lda.fit(X_trains, y_trains)


In [None]:
print(grid_search_lda.best_params_,'\n')
print(grid_search_lda.best_estimator_)

In [None]:
best_model_lda = grid_search_lda.best_estimator_


In [None]:
ytrain_predictlda = best_model_lda.predict(X_trains)
ytest_predictlda = best_model_lda.predict(X_tests)

In [None]:
ytrain_predictlda


In [None]:
ytrain_predict_problda=best_model_lda.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_problda).head()

In [None]:
ytest_predict_problda=best_model_lda.predict_proba(X_tests)
pd.DataFrame(ytest_predict_problda).head()

In [None]:
# Accuracy - Training Data
# Training Accuracy
accuracy_lda_train= best_model_lda.score(X_trains, y_trains)
accuracy_lda_train

In [None]:
accuracy_lda_test= best_model_lda.score(X_tests, y_tests)
accuracy_lda_test

# 1.5 Apply KNN Model and Naïve Bayes Model. Interpret the results.


In [None]:
from sklearn.neighbors import KNeighborsClassifier

KNN_model=KNeighborsClassifier(n_neighbors = 15)
KNN_model.fit(X_trains,y_trains)

In [None]:
KNN_model.score(X_trains,y_trains)


In [None]:
KNN_model.score(X_tests,y_tests)


In [None]:
# empty list that will hold accuracy scores
ac_scores = []

# Score odd k = 1,3,5....19 with 5-fold cross-validation on the training split
# only, so that the held-out test set is not used to choose k.
for k in range(1,20,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    # ravel() turns the single-column target frame into the 1-D array
    # that scikit-learn expects.
    scores = cross_val_score(knn, X_trains, y_trains.values.ravel(), cv=5, scoring='accuracy')
    ac_scores.append(scores.mean())

# changing to misclassification error
MCE = [1 - x for x in ac_scores]
MCE

Plot misclassification error vs k (with k value on X-axis) using matplotlib


In [None]:
import matplotlib.pyplot as plt
# plot misclassification error vs k
plt.plot(range(1,20,2), MCE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

In [None]:
## Performance Matrix on train data set
y_train_predict_knn = KNN_model.predict(X_trains)
acc_train_knn = KNN_model.score(X_trains, y_trains)
print(acc_train_knn)
print(metrics.confusion_matrix(y_trains, y_train_predict_knn))
print(metrics.classification_report(y_trains, y_train_predict_knn))

In [None]:
## Performance Matrix on test data set
y_test_predict_knn = KNN_model.predict(X_tests)
acc_test_knn = KNN_model.score(X_tests, y_tests)
print(acc_test_knn)
print(metrics.confusion_matrix(y_tests, y_test_predict_knn))
print(metrics.classification_report(y_tests, y_test_predict_knn))

In [None]:
ytrain_predict_prob_knn=KNN_model.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_knn).head()

In [None]:
ytest_predict_prob_knn=KNN_model.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_knn).head()

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
grid_knn={'n_neighbors':[ 5,7,10,15],
      'weights': ['uniform','distance'],
      'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']      
      }

In [None]:
model_knn = KNeighborsClassifier()


In [None]:
grid_search_knn = GridSearchCV(estimator = model_knn, param_grid = grid_knn,n_jobs=-1, cv = 7,scoring='accuracy')


In [None]:
grid_search_knn.fit(X_trains, y_trains)


In [None]:
print(grid_search_knn.best_params_,'\n')


In [None]:
print(grid_search_knn.best_estimator_)


In [None]:
grid_search_knn.score(X_trains,y_trains)


In [None]:
grid_search_knn.score(X_tests,y_tests)


Gaussian Naive Bayes


In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
NB_model = GaussianNB()
NB_model.fit(X_trains, y_trains)

In [None]:
y_train_predict_nb = NB_model.predict(X_trains)
acc_train_nb = NB_model.score(X_trains, y_trains)                      ## Accuracy
print(acc_train_nb)
print(metrics.confusion_matrix(y_trains, y_train_predict_nb))          ## confusion_matrix
print(metrics.classification_report(y_trains, y_train_predict_nb))     ## classification_report


In [None]:
## Performance Matrix on test data set
y_test_predict_nb = NB_model.predict(X_tests)
acc_test_nb = NB_model.score(X_tests, y_tests)                    ## Accuracy
print(acc_test_nb)
print(metrics.confusion_matrix(y_tests, y_test_predict_nb))         ## confusion_matrix
print(metrics.classification_report(y_tests, y_test_predict_nb))    ## classification_report

In [None]:
ytrain_predict_prob_nb=NB_model.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_nb).head()

In [None]:
ytest_predict_prob_nb=NB_model.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_nb).head()

1.6 Model Tuning, Bagging (Random Forest should be applied for Bagging), and Boosting.

Bagging with Random Forest


In [None]:
# Bagging ensemble whose base learner is itself a random forest:
# 101 bagged copies of a 101-tree forest, fitted on the scaled training split.
# NOTE(review): random_state is passed only to BaggingClassifier, not to
# model_rf - confirm that this is enough for reproducible results.
model_rf=RandomForestClassifier(max_depth=4, max_features=5, min_samples_leaf=25,
                       min_samples_split=50, n_estimators=101)
model_bag=BaggingClassifier(base_estimator=model_rf,n_estimators=101,random_state=1)
model_bag.fit(X_trains, y_trains)

In [None]:
y_train_predict_bag = model_bag.predict(X_trains)


In [None]:
y_test_predict_bag = model_bag.predict(X_tests)


In [None]:
acc_bag_train =model_bag.score(X_trains, y_trains)
acc_bag_train

In [None]:
acc_bag_test =model_bag.score(X_tests, y_tests)
acc_bag_test

In [None]:
ytrain_predict_prob_bag=model_bag.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_bag).head()

In [None]:
ytest_predict_prob_bag=model_bag.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_bag).head()

AdaBoostClassifier


In [None]:
from sklearn.ensemble import AdaBoostClassifier


ADa_model = AdaBoostClassifier(n_estimators=100,random_state=1, learning_rate=1.0, algorithm='SAMME.R')
ADa_model.fit(X_trains,y_trains)

In [None]:
y_train_predict_Ada = ADa_model.predict(X_trains)


In [None]:
y_test_predict_Ada = ADa_model.predict(X_tests)


In [None]:
ADa_train =ADa_model.score(X_trains, y_trains)
ADa_train

In [None]:
ADa_test=ADa_model.score(X_tests, y_tests)
ADa_test

In [None]:
grid_ADa={'n_estimators':[51,101,151],"learning_rate":[1.0,0.001,0.01,0.2,0.3], "algorithm":["SAMME.R"]}


In [None]:
model_ADa = AdaBoostClassifier()


In [None]:
grid_search_ADa = GridSearchCV(estimator = model_ADa, param_grid = grid_ADa, cv = 7,n_jobs=-1,scoring='accuracy')


In [None]:
grid_search_ADa.fit(X_trains, y_trains)


In [None]:
print(grid_search_ADa.best_params_,'\n')
print(grid_search_ADa.best_estimator_)

In [None]:
best_model_ADa = grid_search_ADa.best_estimator_


In [None]:
ytrain_predict_ADa = best_model_ADa.predict(X_trains)
ytest_predict_ADa = best_model_ADa.predict(X_tests)

In [None]:
ytrain_predict_prob_ADa=best_model_ADa.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_ADa).head()

In [None]:
ytest_predict_prob_ADa=best_model_ADa.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_ADa).head()

In [None]:
# Accuracy - Training Data
# Training Accuracy
accuracy_ADa_train= best_model_ADa.score(X_trains, y_trains)
accuracy_ADa_train

In [None]:
accuracy_ADa_test= best_model_ADa.score(X_tests, y_tests)
accuracy_ADa_test

In [None]:
ytrain_predict_prob_ada=best_model_ADa.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_ada).head()

In [None]:
# Test-split class probabilities of the tuned AdaBoost model. They are stored
# under a "test" name so the training probabilities are not overwritten.
ytest_predict_prob_ada=best_model_ADa.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_ada).head()

Gradient Boosting


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(criterion = 'friedman_mse',loss='exponential',max_features=5,
                                min_samples_split=50,n_estimators=101,random_state=1)
gbcl = gbcl.fit(X_trains, y_trains)

In [None]:
y_train_predict_gbcl = gbcl.predict(X_trains)


In [None]:
y_test_predict_gbcl = gbcl.predict(X_tests)


In [None]:
gbcl.score(X_trains, y_trains)


In [None]:
gbcl.score(X_tests, y_tests)


In [None]:
grid_gbcl={"criterion" : ['friedman_mse',"mse"],"loss":['exponential'],"max_features":[5,6,7],
                                "min_samples_split":[50,40,30],"n_estimators":[101,51]}


In [None]:
model_gbcl = GradientBoostingClassifier()


In [None]:
grid_search_gbcl = GridSearchCV(estimator = model_gbcl, param_grid = grid_gbcl, cv = 7,n_jobs=-1,scoring='accuracy')


In [None]:
grid_search_gbcl.fit(X_trains, y_trains)


In [None]:

print(grid_search_gbcl.best_params_,'\n')
print(grid_search_gbcl.best_estimator_)

In [None]:
print(grid_search_gbcl.best_params_,'\n')
print(grid_search_gbcl.best_estimator_)

In [None]:
best_model_gbcl = grid_search_gbcl.best_estimator_


In [None]:
ytrain_predict_gbcl = best_model_gbcl.predict(X_trains)
ytest_predict_gbcl = best_model_gbcl.predict(X_tests)


In [None]:
ytrain_predict_prob_gbcl=best_model_gbcl.predict_proba(X_trains)
pd.DataFrame(ytrain_predict_prob_gbcl).head()

In [None]:
ytest_predict_prob_gbcl=best_model_gbcl.predict_proba(X_tests)
pd.DataFrame(ytest_predict_prob_gbcl).head()

In [None]:
# Accuracy - Training Data
# Training Accuracy
accuracy_gbcl_train= best_model_gbcl.score(X_trains, y_trains)
accuracy_gbcl_train

In [None]:
accuracy_gbcl_test= best_model_gbcl.score(X_tests, y_tests)
accuracy_gbcl_test

1.7 Performance Metrics: Check the performance of Predictions on Train and Test sets using Accuracy, Confusion Matrix, Plot ROC curve and get ROC_AUC score for each model. Final Model: Compare the models and write inference which model is best/optimized.

Logistic Regression Model


In [None]:
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix

In [None]:
from sklearn import metrics,model_selection
from sklearn.preprocessing import scale

In [None]:
f,a =  plt.subplots(1,2,sharex=True,sharey=True,squeeze=False)

# Confusion matrices for the tuned logistic regression fitted on the scaled
# data (best_model): training split on the left, test split on the right.

plot_0 = sns.heatmap((metrics.confusion_matrix(y_trains,ytrain_predictlr1)),annot=True,fmt='.5g',cmap='RdYlGn',ax=a[0][0]);
a[0][0].set_title('Training Data')

plot_1 = sns.heatmap((metrics.confusion_matrix(y_tests,ytest_predictlr1)),annot=True,fmt='.5g',cmap='RdYlGn',ax=a[0][1]);
a[0][1].set_title('Test Data');

In [None]:
print(classification_report(y_trains, ytrain_predictlr1),'\n');
print(classification_report(y_tests, ytest_predictlr1),'\n');

In [None]:
# AUC and ROC for the training data

# calculate AUC
train_auc_lr = metrics.roc_auc_score(y_trains,ytrain_predict_prob1[:,1])
print('AUC for the Training Data: %.3f' % train_auc_lr)

#  calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(y_trains,ytrain_predict_prob1[:,1])
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.',label = 'Training Data')


# AUC and ROC for the test data

# calculate AUC
test_auc_lr = metrics.roc_auc_score(y_tests,ytest_predict_prob1[:,1])
print('AUC for the Test Data: %.3f' %test_auc_lr )

#  calculate roc curve
fpr, tpr, thresholds = metrics.roc_curve(y_tests,ytest_predict_prob1[:,1])
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.',label='Test Data')
# show the plot
plt.legend(loc='best')
plt.show()

In [None]:
# Class "1" precision / recall / F1 of the logistic regression on the training
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_lr_train=classification_report(y_trains, ytrain_predictlr1,output_dict=True)
df=pd.DataFrame(report_lr_train).transpose()
train_precision_lr=round(df.loc["1", "precision"],2)
train_recall_lr=round(df.loc["1", "recall"],2)
train_f1_lr=round(df.loc["1", "f1-score"],2)
print ('LR_train_precision ',train_precision_lr)
print ('LR_train_recall ',train_recall_lr)
print ('LR_train_f1 ',train_f1_lr)

In [None]:
# Class "1" precision / recall / F1 of the logistic regression on the test
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_lr_test=classification_report(y_tests, ytest_predictlr1,output_dict=True)
df=pd.DataFrame(report_lr_test).transpose()
test_precision_lr=round(df.loc["1", "precision"],2)
test_recall_lr=round(df.loc["1", "recall"],2)
test_f1_lr=round(df.loc["1", "f1-score"],2)
print ('LR_test_precision ',test_precision_lr)
print ('LR_test_recall ',test_recall_lr)
print ('LR_test_f1 ',test_f1_lr)

LinearDiscriminantAnalysis


In [None]:
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix
from sklearn import metrics,model_selection
from sklearn.preprocessing import scale

In [None]:
# Class "1" precision / recall / F1 of the LDA model on the training split.
# The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_lda_train=classification_report(y_trains,pred_class_train_lda,output_dict=True)
df=pd.DataFrame(report_lda_train).transpose()
train_precision_lda=round(df.loc["1", "precision"],2)
train_recall_lda=round(df.loc["1", "recall"],2)
train_f1_lda=round(df.loc["1", "f1-score"],2)
print ('LDA_train_precision ',train_precision_lda)
print ('LDA_train_recall ',train_recall_lda)
print ('LDA_train_f1 ',train_f1_lda)

In [None]:
# Class "1" precision / recall / F1 of the LDA model on the test split.
# The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_lda_test=classification_report(y_tests,pred_class_test_lda,output_dict=True)
df=pd.DataFrame(report_lda_test).transpose()
test_precision_lda=round(df.loc["1", "precision"],2)
test_recall_lda=round(df.loc["1", "recall"],2)
test_f1_lda=round(df.loc["1", "f1-score"],2)
print ('LDA_test_precision ',test_precision_lda)
print ('LDA_test_recall ',test_recall_lda)
print ('LDA_test_f1 ',test_f1_lda)

In [None]:
confusion_matrix(y_trains,pred_class_train_lda)
sns.heatmap(confusion_matrix(y_trains,pred_class_train_lda),annot=True, fmt='d',cbar=False, cmap='cividis')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
confusion_matrix(y_tests,pred_class_test_lda)
sns.heatmap(confusion_matrix(y_tests,pred_class_test_lda),annot=True, fmt='d',cbar=False, cmap='cividis')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
print(classification_report(y_trains, pred_class_train_lda),'\n');
print(classification_report(y_tests,pred_class_test_lda),'\n');

KNN Model


In [None]:
# Class "1" precision / recall / F1 of the KNN model on the training split.
# The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_knn_train=classification_report(y_trains,y_train_predict_knn,output_dict=True)
df=pd.DataFrame(report_knn_train).transpose()
train_precision_knn=round(df.loc["1", "precision"],2)
train_recall_knn=round(df.loc["1", "recall"],2)
train_f1_knn=round(df.loc["1", "f1-score"],2)
print ('knn_train_precision ',train_precision_knn)
print ('knn_train_recall ',train_recall_knn)
print ('knn_train_f1 ',train_f1_knn)

In [None]:
# Class "1" precision / recall / F1 of the KNN model on the test split.
# The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_knn_test=classification_report(y_tests,y_test_predict_knn,output_dict=True)
df=pd.DataFrame(report_knn_test).transpose()
test_precision_knn=round(df.loc["1", "precision"],2)
test_recall_knn=round(df.loc["1", "recall"],2)
test_f1_knn=round(df.loc["1", "f1-score"],2)
print ('knn_test_precision ',test_precision_knn)
print ('knn_test_recall ',test_recall_knn)
print ('knn_test_f1 ',test_f1_knn)

In [None]:
print(classification_report(y_trains,y_train_predict_knn),'\n');
print(classification_report(y_tests,y_test_predict_knn),'\n');

Naive Bayes Model


In [None]:
# Class "1" precision / recall / F1 of Gaussian Naive Bayes on the training
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_nb_train=classification_report(y_trains,y_train_predict_nb,output_dict=True)
df=pd.DataFrame(report_nb_train).transpose()
train_precision_nb=round(df.loc["1", "precision"],2)
train_recall_nb=round(df.loc["1", "recall"],2)
train_f1_nb=round(df.loc["1", "f1-score"],2)
print ('nb_train_precision ',train_precision_nb)
print ('nb_train_recall ',train_recall_nb)
print ('nb_train_f1 ',train_f1_nb)

In [None]:
# Class "1" precision / recall / F1 of Gaussian Naive Bayes on the test
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_nb_test=classification_report(y_tests,y_test_predict_nb,output_dict=True)
df=pd.DataFrame(report_nb_test).transpose()
test_precision_nb=round(df.loc["1", "precision"],2)
test_recall_nb=round(df.loc["1", "recall"],2)
test_f1_nb=round(df.loc["1", "f1-score"],2)
print ('nb_test_precision ',test_precision_nb)
print ('nb_test_recall ',test_recall_nb)
print ('nb_test_f1 ',test_f1_nb)

In [None]:
print(classification_report(y_trains,y_train_predict_nb),'\n');
print(classification_report(y_tests,y_test_predict_nb),'\n');

Bagging with Random Forest


In [None]:
# Class "1" precision / recall / F1 of the bagging model on the training
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_bag_train=classification_report(y_trains,y_train_predict_bag,output_dict=True)
df=pd.DataFrame(report_bag_train).transpose()
train_precision_bag=round(df.loc["1", "precision"],2)
train_recall_bag=round(df.loc["1", "recall"],2)
train_f1_bag=round(df.loc["1", "f1-score"],2)
print ('bag_train_precision ',train_precision_bag)
print ('bag_train_recall ',train_recall_bag)
print ('bag_train_f1 ',train_f1_bag)

In [None]:
# Class "1" precision / recall / F1 of the bagging model on the test split.
# The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_bag_test=classification_report(y_tests,y_test_predict_bag,output_dict=True)
df=pd.DataFrame(report_bag_test).transpose()
test_precision_bag=round(df.loc["1", "precision"],2)
test_recall_bag=round(df.loc["1", "recall"],2)
test_f1_bag=round(df.loc["1", "f1-score"],2)
print ('bag_test_precision ',test_precision_bag)
print ('bag_test_recall ',test_recall_bag)
print ('bag_test_f1 ',test_f1_bag)

In [None]:
print(classification_report(y_trains,y_train_predict_bag),'\n');
print(classification_report(y_tests,y_test_predict_bag),'\n');

AdaBoostClassifier


In [None]:
print(classification_report(y_trains, ytrain_predict_ADa),'\n');
print(classification_report(y_tests, ytest_predict_ADa),'\n');

In [None]:
# Class "1" precision / recall / F1 of the tuned AdaBoost model on the
# training split. The report gets its own name so the sklearn `metrics` module
# is not shadowed, and the values are read by column label, not by position.
report_ADa_train=classification_report(y_trains, ytrain_predict_ADa,output_dict=True)
df=pd.DataFrame(report_ADa_train).transpose()
train_precision_ADa=round(df.loc["1", "precision"],2)
train_recall_ADa=round(df.loc["1", "recall"],2)
train_f1_ADa=round(df.loc["1", "f1-score"],2)
print ('ADa_train_precision ',train_precision_ADa)
print ('ADa_train_recall ',train_recall_ADa)
print ('ADa_train_f1 ',train_f1_ADa)

In [None]:
# Class "1" precision / recall / F1 of the tuned AdaBoost model on the test
# split. The report gets its own name so the sklearn `metrics` module is not
# shadowed, and the values are read by column label rather than by position.
report_ADa_test=classification_report(y_tests, ytest_predict_ADa,output_dict=True)
df=pd.DataFrame(report_ADa_test).transpose()
test_precision_ADa=round(df.loc["1", "precision"],2)
test_recall_ADa=round(df.loc["1", "recall"],2)
test_f1_ADa=round(df.loc["1", "f1-score"],2)
print ('ADa_test_precision ',test_precision_ADa)
print ('ADa_test_recall ',test_recall_ADa)
# Label corrected from 'ADatest_f1 ' to match the other printed labels.
print ('ADa_test_f1 ',test_f1_ADa)

Gradient Boosting


In [None]:
print(classification_report(y_trains, ytrain_predict_gbcl),'\n');
print(classification_report(y_tests, ytest_predict_gbcl),'\n');

In [None]:
# Class "1" precision / recall / F1 of the tuned gradient boosting model on
# the training split. The report gets its own name so the sklearn `metrics`
# module is not shadowed, and the values are read by column label.
report_gbcl_train=classification_report(y_trains, ytrain_predict_gbcl,output_dict=True)
df=pd.DataFrame(report_gbcl_train).transpose()
train_precision_gbcl=round(df.loc["1", "precision"],2)
train_recall_gbcl=round(df.loc["1", "recall"],2)
train_f1_gbcl=round(df.loc["1", "f1-score"],2)
print ('gbcl_train_precision ',train_precision_gbcl)
print ('gbcl_train_recall ',train_recall_gbcl)
print ('gbcl_train_f1 ',train_f1_gbcl)

In [None]:
# Class "1" precision / recall / F1 of the tuned gradient boosting model on
# the test split. The report gets its own name so the sklearn `metrics`
# module is not shadowed, and the values are read by column label.
report_gbcl_test=classification_report(y_tests, ytest_predict_gbcl,output_dict=True)
df=pd.DataFrame(report_gbcl_test).transpose()
test_precision_gbcl=round(df.loc["1", "precision"],2)
test_recall_gbcl=round(df.loc["1", "recall"],2)
test_f1_gbcl=round(df.loc["1", "f1-score"],2)
print ('gbcl_test_precision ',test_precision_gbcl)
print ('gbcl_test_recall ',test_recall_gbcl)
print ('gbcl_test_f1 ',test_f1_gbcl)

In [None]:
# Sample voter information
# NOTE(review): presumably placeholder values. 'age' is passed unscaled here,
# while the models were fitted on X_trains, where age is z-scored - confirm
# that the sample should be transformed the same way before predicting.
test_sample = [(4,6,10,8,20,11,7,21)]
#Create a DataFrame object
# Column names must match the training features exactly; the training column
# is 'political.knowledge' (with a dot), not 'political_knowledge'.
test_df = pd.DataFrame(test_sample, columns = ['age', 'economic.cond.national', 'economic.cond.household',
       'Blair', 'Hague', 'Europe', 'political.knowledge', 'gender'])

In [None]:
# Label encoding: Labour = 1, Conservative = 0

# Score the one-row sample frame with each classifier (presumably fitted in
# earlier cells — verify they are defined before running this cell).
test_predict_lr =  best_model.predict(test_df)
test_predict_lda = best_model_lda.predict(test_df)
test_predict_knn = KNN_model.predict(test_df) 
test_predict_nb = NB_model.predict(test_df)
test_predict_bag = model_bag.predict(test_df)
test_predict_ada = ADa_model.predict(test_df)
# NOTE(review): the gradient-boosting prediction is produced by `best_model_ADa`,
# which by its name looks like an AdaBoost estimator — probably a copy-paste slip.
# Confirm which fitted gradient boosting model should be used here.
test_predict_gb = best_model_ADa.predict(test_df)

In [None]:
def _party_label_frame(predictions):
    """Return a one-column DataFrame (column 0) of party names for 0/1 predictions.

    A prediction equal to 1 becomes "labour party"; anything else becomes
    "conservative party".
    """
    frame = pd.DataFrame(predictions)
    frame[0] = np.where(frame[0] == 1, "labour party", "conservative party")
    return frame


# One labelled frame per model. The logistic regression and LDA labels used to
# read " labour party" (leading space), unlike the other five models; every
# model now shares the same label text.
outcome_lr = _party_label_frame(test_predict_lr)
outcome_lda = _party_label_frame(test_predict_lda)
outcome_knn = _party_label_frame(test_predict_knn)
outcome_nb = _party_label_frame(test_predict_nb)
outcome_bag = _party_label_frame(test_predict_bag)
outcome_ada = _party_label_frame(test_predict_ada)
outcome_gb = _party_label_frame(test_predict_gb)

In [None]:
# Per-model outcome frames, in the same order as the labels in `Models`.
outcome = [outcome_lr,outcome_lda,outcome_knn,outcome_nb,outcome_bag,
               outcome_ada,outcome_gb]
# Display labels (typo "Analaysis" and stray trailing spaces removed).
Models = ['Logistic Regression', 'Linear Discriminant Analysis', 'K-Nearest Neighbour',
              'Naive Bayes','Bagging(with Random Forest)', 'Adaptive Boosting',
              'Gradient Boosting']
# Stack the frames, rename column 0 to "Prediction" (previously " Prediction"
# with a leading space) and index the rows by model name. Built as one chain so
# re-running the cell always yields the same table.
outcome1 = (
    pd.concat(outcome, ignore_index=True)
    .rename(columns={0: "Prediction"})
    .assign(Model=Models)
    .set_index("Model")
)

In [None]:
# Print a header line followed by the per-model prediction table.
header = "For the sample voter details provided, the following conclusions are made by the models:\n"
print(header, outcome1, sep=" ")

In [None]:
# Bare expression: renders the per-model prediction table as the cell output.
outcome1


1.8 Based on these predictions, what are the insights?


In [None]:
#Explained in the report

# Problem 2:
In this project, we work with the inaugural corpus from NLTK in Python. We will look at the following speeches of Presidents of the United States of America:
President Franklin D. Roosevelt in 1941
President John F. Kennedy in 1961
President Richard Nixon in 1973

In [None]:
import nltk
# Download the inaugural-address corpus data used by the reader imported below.
nltk.download('inaugural')
from nltk.corpus import inaugural
# List the file ids in the corpus (shown as the cell output).
inaugural.fileids()

2.1 Find the number of characters, words, and sentences for the mentioned documents.

In [None]:
# Raw text of each of the three speeches, read from the corpus by file id.
Roosevelt_raw, Kennedy_raw, Nixon_raw = (
    inaugural.raw(fileid)
    for fileid in ('1941-Roosevelt.txt', '1961-Kennedy.txt', '1973-Nixon.txt')
)

In [None]:
# Character count of each speech = length of its raw text.
for caption, speech_text in (
    ("The number of characters in Roosevelt speech are:", Roosevelt_raw),
    ("The number of characters in Kennedy speech are:", Kennedy_raw),
    ("The number of characters in Nixon speech are:", Nixon_raw),
):
    print(caption, len(speech_text))

In [None]:
# Token sequence of each of the three speeches, as returned by the corpus reader.
Roosevelt_words, Kennedy_words, Nixon_words = (
    inaugural.words(fileid)
    for fileid in ('1941-Roosevelt.txt', '1961-Kennedy.txt', '1973-Nixon.txt')
)

In [None]:
# Token count per speech, taken from the word sequences loaded in the previous cell.
for caption, tokens in (
    ("The number of Words in Roosevelt speech are:", Roosevelt_words),
    ("The number of Words in Kennedy speech are:", Kennedy_words),
    ("The number of Words in Nixon speech are:", Nixon_words),
):
    print(caption, len(tokens))

In [None]:
# Ten most frequent tokens in Roosevelt_words; no stop-word or punctuation filtering is applied in this cell.
print("Most common words in Roosevelt speech:",nltk.FreqDist(Roosevelt_words).most_common(10))


In [None]:
# Ten most frequent tokens in Kennedy_words; no stop-word or punctuation filtering is applied in this cell.
print("Most common words in Kennedy speech:",nltk.FreqDist(Kennedy_words).most_common(10))


In [None]:
# Ten most frequent tokens in Nixon_words; no stop-word or punctuation filtering is applied in this cell.
print("Most common words in Nixon speech:",nltk.FreqDist(Nixon_words).most_common(10))


In [None]:
# nltk is already imported in an earlier cell; this repeat import is harmless.
import nltk
# Download the 'punkt' tokenizer data — presumably needed by inaugural.sents() in the next cell; TODO confirm.
nltk.download('punkt')

In [None]:
# Sentence count per speech, using the corpus reader's sentence segmentation.
for caption, fileid in (
    ("The number of sentences in Roosevelt speech are:", '1941-Roosevelt.txt'),
    ("The number of sentences in Kennedy speech are:", '1961-Kennedy.txt'),
    ("The number of sentences in Nixon speech are:", '1973-Nixon.txt'),
):
    print(caption, len(inaugural.sents(fileid)))

2.2 Remove all the stopwords from all three speeches


In [None]:
import random
import string
from nltk.corpus import stopwords
# Download the stop-word lists that later cells read via stopwords.words('english').
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
# Stemmer instance shared by the per-speech cleaning cells below.
st = PorterStemmer()

# NOTE(review): `string` is imported twice in this cell and `pandas` is already
# imported at the top of the notebook; the repeats are harmless but redundant.
import json
import re
import string
import pandas as pd

In [None]:
# Roosevelt speech: lower-case, drop stop words / punctuation, stem, count.
# The removal set gets its own name so the imported `stopwords` corpus reader is
# not overwritten by a plain list (which breaks later `stopwords.words(...)`
# calls). A set also makes the per-token membership test constant-time.
stop_tokens = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {"--"}

# Lower-case every token so it compares equal to the lower-case stop-word entries.
all_words = (w.lower() for w in inaugural.words('1941-Roosevelt.txt'))
all_words_clean = [word for word in all_words if word not in stop_tokens]

# Reduce each remaining word to its Porter stem before counting.
clean_stem = [st.stem(word) for word in all_words_clean]

Roosevelt_words_freq = nltk.FreqDist(clean_stem)

# The 50 most frequent stems, most frequent first.
word_features1 = [stem for stem, _count in Roosevelt_words_freq.most_common(50)]

In [None]:
# Heading line followed by the list of Roosevelt's most frequent stems.
print("Most common words in Roosevelt speech after removing stopwords", word_features1, sep="\n")

In [None]:
# Kennedy speech: lower-case, drop stop words / punctuation, stem, count.
# The removal set gets its own name so the imported `stopwords` corpus reader is
# not overwritten by a plain list (which breaks later `stopwords.words(...)`
# calls). A set also makes the per-token membership test constant-time.
stop_tokens = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {"--"}

# Lower-case every token so it compares equal to the lower-case stop-word entries.
all_words = (w.lower() for w in inaugural.words('1961-Kennedy.txt'))
all_words_clean = [word for word in all_words if word not in stop_tokens]
# Reduce each remaining word to its Porter stem before counting.
clean_stem = [st.stem(word) for word in all_words_clean]

kennedy_words_freq = nltk.FreqDist(clean_stem)

# The 50 most frequent stems, most frequent first.
word_features2 = [stem for stem, _count in kennedy_words_freq.most_common(50)]

In [None]:
# Bare expression: shows the Kennedy stem list as the cell output (the next cell prints the same list).
word_features2


In [None]:
# Heading line followed by the list of Kennedy's most frequent stems.
print("Most common words in Kennedy speech after removing stopwords", word_features2, sep="\n")

In [None]:
# Nixon speech: lower-case, drop stop words / punctuation, stem, count.
# The removal set gets its own name so the imported `stopwords` corpus reader is
# not overwritten by a plain list (which breaks later `stopwords.words(...)`
# calls). A set also makes the per-token membership test constant-time.
stop_tokens = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {"--"}

# Lower-case every token so it compares equal to the lower-case stop-word entries.
all_words = (w.lower() for w in inaugural.words('1973-Nixon.txt'))
all_words_clean = [word for word in all_words if word not in stop_tokens]
# Reduce each remaining word to its Porter stem before counting.
clean_stem = [st.stem(word) for word in all_words_clean]

Nixon_words_freq = nltk.FreqDist(clean_stem)

# The 50 most frequent stems, most frequent first.
word_features3 = [stem for stem, _count in Nixon_words_freq.most_common(50)]

In [None]:
# Heading line followed by the list of Nixon's most frequent stems.
print("Most common words in Nixon speech after removing stopwords", word_features3, sep="\n")

2.3 Which word occurs the most number of times in his inaugural address for each president? Mention the top three words. (after removing the stopwords)

In [None]:
# Roosevelt_words_freq is already a frequency distribution, so query it directly.
print("Top three words in Roosevelt's speech(after removing the stopwords):", Roosevelt_words_freq.most_common(3))


In [None]:
# kennedy_words_freq is already a frequency distribution, so query it directly.
print("Top three words in Kennedy's speech(after removing the stopwords):", kennedy_words_freq.most_common(3))


In [None]:
# Nixon_words_freq is already a frequency distribution, so query it directly.
print("Top three words in Nixon's speech(after removing the stopwords):", Nixon_words_freq.most_common(3))


2.4 Plot the word cloud of each of the speeches of the variable. (after removing the stopwords)

In [None]:
# One-row, one-column frames holding the raw text of each speech;
# the column is named after the president.
Roosevelt_df = pd.DataFrame(data=[[inaugural.raw('1941-Roosevelt.txt')]], columns=["Roosevelt"])
Kennedy_df = pd.DataFrame(data=[[inaugural.raw('1961-Kennedy.txt')]], columns=["Kennedy"])
Nixon_df = pd.DataFrame(data=[[inaugural.raw('1973-Nixon.txt')]], columns=["Nixon"])
import matplotlib.pyplot as plt

In [None]:
from nltk.corpus import stopwords

Roosevelt speech

In [None]:
# English stop-word list plus the "--" token, shared by the word-cloud cells below.
stop_words = stopwords.words('english') + ["--"]

In [None]:

# Remove stop words from the Roosevelt speech text.
# Tokens are lower-cased for the comparison only: the stop-word entries are lower
# case, so capitalised words such as "The" or "We" previously slipped through.
# The surviving tokens keep their original casing.
corpus = Roosevelt_df['Roosevelt'].apply(
    lambda x: ' '.join([z for z in x.split() if z.lower() not in stop_words])
)
corpus

In [None]:
# Join the filtered Roosevelt text into one space-separated string for the word cloud.
wc_a = ' '.join(text for text in corpus)


In [None]:
conda install -c conda-forge word cloud

In [None]:
from wordcloud import WordCloud

In [None]:
# Build the Roosevelt word cloud from the filtered speech text.
cloud_options = dict(width = 3000, height = 3000,
                     background_color ='skyblue',
                     min_font_size = 10, random_state=100)
wordcloud = WordCloud(**cloud_options).generate(wc_a)

# Draw the cloud on an explicit figure/axes pair with the axes hidden.
fig, ax = plt.subplots(figsize = (8, 8), facecolor = None)
ax.imshow(wordcloud)
ax.axis("off")
ax.set_xlabel('Word Cloud')
fig.tight_layout(pad = 0)

print("Word Cloud  (after cleaning)!!")
plt.show()

Kennedy speech


In [None]:
# Remove stop words from the Kennedy speech text.
# Tokens are lower-cased for the comparison only: the stop-word entries are lower
# case, so capitalised words such as "The" or "We" previously slipped through.
# The surviving tokens keep their original casing.
corpus2 = Kennedy_df['Kennedy'].apply(
    lambda x: ' '.join([z for z in x.split() if z.lower() not in stop_words])
)
corpus2

In [None]:
# Join the filtered Kennedy text into one space-separated string for the word cloud.
wc_a1 = ' '.join(text for text in corpus2)


In [None]:
# Build the Kennedy word cloud from the filtered speech text.
from wordcloud import WordCloud
cloud_options = dict(width = 3000, height = 3000,
                     background_color ='black',
                     min_font_size = 10, random_state=100)
wordcloud = WordCloud(**cloud_options).generate(wc_a1)

# Draw the cloud on an explicit figure/axes pair with the axes hidden.
fig, ax = plt.subplots(figsize = (8, 8), facecolor = None)
ax.imshow(wordcloud)
ax.axis("off")
ax.set_xlabel('Word Cloud')
fig.tight_layout(pad = 0)

print("Word Cloud  (after cleaning)!!")
plt.show()

In [None]:
# Nixon speech


In [None]:
# Remove stop words from the Nixon speech text.
# Tokens are lower-cased for the comparison only: the stop-word entries are lower
# case, so capitalised words such as "The" or "We" previously slipped through.
# The surviving tokens keep their original casing.
corpus3 = Nixon_df['Nixon'].apply(
    lambda x: ' '.join([z for z in x.split() if z.lower() not in stop_words])
)
corpus3

In [None]:
# Word Cloud for the Nixon speech
from wordcloud import WordCloud

# `wc_a3` was never built for Nixon (unlike `wc_a` and `wc_a1` for the other two
# speeches), so this cell raised NameError. Join the filtered text here.
wc_a3 = ' '.join(corpus3)

wordcloud = WordCloud(width = 3000, height = 3000, 
                background_color ='pink', 
                min_font_size = 10, random_state=100).generate(wc_a3) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off")
plt.xlabel('Word Cloud')
plt.tight_layout(pad = 0) 

print("Word Cloud  (after cleaning)!!")
plt.show()