In [None]:
# Imports and global plotting configuration for the whole notebook.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the dataset from projo.csv (path is relative to the working directory)
# and preview its first five rows.
data = pd.read_csv('projo.csv')
data.head(5)

In [None]:
# List every column name of the loaded frame.
print([column_name for column_name in data.columns])

In [None]:
# Number of rows per value of the target column SATISFIED.
data.SATISFIED.value_counts()

In [None]:
# Bar chart of the number of rows in each SATISFIED class, saved to disk.
sns.countplot(x='SATISFIED',data=data,palette='hls')
# pyplot has no `save_fig`; the function is `savefig`.
plt.savefig('count_plot')



In [None]:
# Percentage of non-satisfied vs. satisfied students, computed over the rows whose
# SATISFIED value is 'YES' or 'NO'.
# 'NO' rows are the non-satisfied students and 'YES' rows the satisfied ones
# (the two filters were previously swapped relative to the printed labels).
count_no_sub = len(data[data['SATISFIED']=='NO'])
count_sub = len(data[data['SATISFIED']=='YES'])
pct_of_no_sub = count_no_sub/(count_no_sub+count_sub)
print("percentage of non-satisfied students is", pct_of_no_sub*100)
pct_of_sub = count_sub/(count_no_sub+count_sub)
print("percentage of satisfied students", pct_of_sub*100)

In [None]:
# Mean of every numeric column within each SATISFIED class.
# numeric_only=True keeps non-numeric columns out of the aggregation; pandas >= 2.0
# raises a TypeError instead of silently dropping them.
data.groupby('SATISFIED').mean(numeric_only=True)

In [None]:
#Visualizations

%matplotlib inline
pd.crosstab(data.gender,data.SATISFIED).plot(kind='bar')
plt.title('satisfaction frequency by gender')
plt.xlabel('gender')
plt.ylabel('satisfaction level')
plt.savefig('association of gender and satisfaction')

In [None]:
# Share of each SATISFIED class within every course.
# Each row of the contingency table is divided by its row total, so every stacked
# bar sums to 1 and courses with different row counts can be compared directly.

table=pd.crosstab(data.course,data.SATISFIED)
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of course vs satisfaction')
plt.xlabel('course')
plt.ylabel('proportion of students')
plt.savefig('course vs satisfaction')

In [None]:
# Share of each SATISFIED class for every value of L_explain.
# Rows of the contingency table are normalised to sum to 1 before plotting.
# The title and output file name now say L_explain, which is what is plotted
# (they previously said "gender").

table=pd.crosstab(data.L_explain,data.SATISFIED)
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of L_explain vs satisfaction')
plt.xlabel('L_explain')
plt.ylabel('proportion of students')
plt.savefig('L_explain_vs_satisfaction')

In [None]:
# Distribution of the age column.
# NOTE(review): the labels used to describe a lecturer / L_explain histogram while the
# code plots age. The labels and file name now describe what is actually drawn; if the
# lecturer histogram was intended, plot data.L_explain instead and relabel accordingly.
data.age.hist()
plt.title('Histogram of age')
plt.xlabel('age')
plt.ylabel('Frequency')
plt.savefig('hist_age')

In [None]:
# Feature matrix: every column except the target and gender / course / age.
# `columns != (tuple)` is not a membership test, so use isin() and negate the mask.
X = data.loc[:, ~data.columns.isin(['SATISFIED', 'gender', 'course', 'age'])]
# Target, kept as a one-column DataFrame.
y = data.loc[:, data.columns == 'SATISFIED']



In [None]:
# Preview the feature matrix (DataFrame.head takes a row count, not a frame).
X.head()

In [None]:
# Preview the target (head is a DataFrame method, not a free function).
y.head()

In [None]:

from imblearn.over_sampling import SMOTE

# Hold out 30% of the rows, then oversample the training part only with SMOTE.
# NOTE: the name `os` would shadow the stdlib module if it were ever imported here.
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

# fit_sample was removed from imbalanced-learn; fit_resample is its replacement.
# The target is flattened to 1-D because y_train is a one-column DataFrame.
os_data_X,os_data_y=os.fit_resample(X_train, np.ravel(y_train))
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
# Build the 'y' column from the raw values: pd.DataFrame(obj, columns=['y']) would
# reindex a pandas object and yield NaN if its column were not already named 'y'.
os_data_y= pd.DataFrame({'y': np.ravel(os_data_y)})
# we can Check the numbers of our data

print("length of oversampled data is ",len(os_data_X))
# Report every class actually present instead of assuming a 0/1 encoding, and read the
# 'y' column created above (there is no 'SATISFIED' column in os_data_y).
for label, count in os_data_y['y'].value_counts().items():
    print("Number of rows with SATISFIED ==", label, "in oversampled data", count)
    print("Proportion of SATISFIED ==", label, "in oversampled data is ", count/len(os_data_X))



In [None]:
#Recursive Feature Elimination

# Recursive Feature Elimination (RFE) repeatedly fits a model, sets aside the best or
# worst performing feature, and repeats the process with the remaining features until
# all features are exhausted. The goal is to select features by recursively considering
# smaller and smaller sets of features.

# Candidate features are the columns of the oversampled training frame.
data_final_vars=os_data_X.columns.values.tolist()

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# n_features_to_select must be passed by keyword in current scikit-learn; it is capped
# at the number of available columns so small datasets do not request more than exist.
rfe = RFE(logreg, n_features_to_select=min(20, len(data_final_vars)))
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)


# Keep exactly the columns RFE marked as selected, rather than a hand-written list
# that has to be kept in sync with the dataset.
cols=[name for name, selected in zip(data_final_vars, rfe.support_) if selected]
X=os_data_X[cols]
y=os_data_y['y']


In [None]:
#Implementing the model

import statsmodels.api as sm

# Fit a statsmodels logit of y on X (no constant term is added here) and
# print the summary table of the fit.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Refit the logit using only the terms whose p-value in the previous fit is below 0.05,
# derived from `result` instead of a hand-written column list.
# NOTE: re-running this cell prunes again, because `result` is overwritten below.
significance_level = 0.05
significant_cols = result.pvalues[result.pvalues < significance_level].index.tolist()
# If nothing is significant, keep the current columns rather than fitting on no features.
cols = significant_cols if significant_cols else list(X.columns)
X=os_data_X[cols]
y=os_data_y['y']

logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())



In [None]:
#Logistic Regression Model Fitting

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train on the SMOTE-oversampled training rows (X, y) and evaluate on the hold-out
# split made before oversampling. Re-splitting the oversampled data would put
# synthetic rows in the test set and overwrite the genuine hold-out rows.
X_train, y_train = X, y
# Restrict the hold-out features to the selected columns; flatten the hold-out target.
X_test = X_test[cols]
y_test = np.ravel(y_test)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



In [None]:
#Predicting the test set results and calculating the accuracy

# Hard class predictions for the test rows; reused by the later evaluation cells.
y_pred = logreg.predict(X_test)
# score() is the estimator's mean accuracy on the given rows and labels.
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))



In [None]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix
# Store the result under its own name so the imported confusion_matrix function is not
# shadowed by the array it returns.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)



In [None]:
# Compute precision, recall, F-measure and support

# Definitions, quoting Scikit Learn:

# Precision is tp / (tp + fp), with tp the number of true positives and fp the number of
# false positives: the ability of the classifier not to label a negative sample as positive.

# Recall is tp / (tp + fn), with fn the number of false negatives: the ability of the
# classifier to find all the positive samples.

# The F-beta score is a weighted harmonic mean of precision and recall; its best value
# is 1 and its worst is 0.

# It weights recall more than precision by a factor of beta; beta = 1.0 means recall and
# precision are equally important.

# Support is the number of occurrences of each class in y_test.

from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred))


# Interpretation (the 74% figure was quoted originally; re-read it from the report above):
# of the students the model predicted as satisfied, 74% were actually satisfied (precision),
# and of the students who were actually satisfied, 74% were identified by the model (recall).



In [None]:


# ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score each test row with the predicted probability of the positive class, taken as
# logreg.classes_[1] (the column predict_proba reports at index 1).
positive_class = logreg.classes_[1]
test_scores = logreg.predict_proba(X_test)[:,1]
# AUC is computed from the same probability scores as the curve. Using hard predict()
# labels collapses the ROC to a single operating point and misreports the area.
logit_roc_auc = roc_auc_score(y_test, test_scores)
# pos_label is explicit so the call also works when labels are not encoded as 0/1.
fpr, tpr, thresholds = roc_curve(y_test, test_scores, pos_label=positive_class)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')

plt.show()

