In [1]:
import warnings
# NOTE: with no category/module filter this suppresses every warning for the
# rest of the session, including deprecation warnings raised by the libraries
# used in the cells below.
warnings.filterwarnings('ignore')

# 1. Data Preprocessing

In [2]:
import pandas as pd
# Read the raw review lines (one tagged review per line).
# The `with` blocks guarantee each file handle is closed even if reading fails.
with open('ProsCons/IntegratedPros.txt') as pro:
    pro_data = pro.readlines()

with open('ProsCons/IntegratedCons.txt') as con:
    con_data = con.readlines()

In [3]:
# Strip the <Pros>...</Pros> / <Cons>...</Cons> wrapper tags from every line.
# The closing tag and the line ending are removed separately so that a final
# line without a trailing newline, or a line ending in '\r\n', is cleaned too.
pro_data = [w.replace('<Pros>', '').replace('</Pros>', '').rstrip('\r\n') for w in pro_data]

con_data = [w.replace('<Cons>', '').replace('</Cons>', '').rstrip('\r\n') for w in con_data]

In [4]:
# One frame per class: the scalar label is broadcast to every row
# (1 for the "pros" reviews, 0 for the "cons" reviews).
df_pro = pd.DataFrame(data = {'Review': pro_data, 'Label': 1})
df_con = pd.DataFrame(data = {'Review': con_data, 'Label': 0})

In [5]:
# Stack the two frames row-wise; ignore_index renumbers the rows 0..n-1 so the
# combined frame has a single, gap-free integer index.
dataset = pd.concat((df_pro, df_con), axis = 0, ignore_index = True)

In [6]:
# Preview the first rows of the combined frame to check the columns and labels.
dataset.head()

Unnamed: 0,Label,Review
0,1,"Easy to use, economical!"
1,1,Digital is where it's at...down with developin...
2,1,"Good image quality, 3x optical zoom, macro mod..."
3,1,Awesome features/easy to use/fun/versatile/low...
4,1,"intuitive, user friendly"


### 1.1 Cleaning the Texts

In [7]:
# Text cleaning: remove punctuation and digits, lowercase, tokenize,
# drop irrelevant words (English stop words such as "the", "on") and stem.

import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stop-word set and the stemmer once, outside the loop: both are
# loop-invariant, and rebuilding the set for every token is needlessly slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
for raw_review in dataset['Review']:
    # keep only the letters a-z / A-Z; everything else becomes a space
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)

    # lowercase, then split on whitespace into tokens
    tokens = letters_only.lower().split()

    # drop stop words, then stem the remaining tokens
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]

    # store the cleaned review as a single space-separated string
    corpus.append(' '.join(stemmed))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ishwor.Bhatta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1.2 Creating a Bag of Words Model

In [8]:
# Build the bag-of-words matrix: one column per token, limited to the 2000
# most frequent tokens in the corpus, then converted to a dense array.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(corpus).toarray()
# Select the target by name rather than by position: the column order of a
# DataFrame built from a dict depends on the pandas version, so column 0 is
# not guaranteed to be 'Label' (it can be the 'Review' text instead).
y = dataset['Label'].values

### 1.3 Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 0
)

# 2. Using Random Forest Classifier

### 2.1 Fitting Random Forest Classification to the Training set

In [11]:
from sklearn.ensemble import RandomForestClassifier
# 50 trees, splits chosen by information gain (entropy); fixed seed for repeatability.
rcf_classifier = RandomForestClassifier(
    n_estimators = 50,
    criterion = 'entropy',
    random_state = 0
)
rcf_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

### 2.2 Predicting the Test set results

In [12]:
# Predict a class label for every row of the held-out test set.
y_pred_rcf = rcf_classifier.predict(X_test)

### 2.3 Confusion Matrix

In [13]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels.
cm_rfc = confusion_matrix(y_true = y_test, y_pred = y_pred_rcf)

### 2.4 Performance Measure of Random Forest Classifier

In [14]:
# Flatten the 2x2 confusion matrix row by row: with labels 0/1 the cells are
# true negatives, false positives, false negatives, true positives.
tn_rcf, fp_rcf, fn_rcf, tp_rcf = cm_rfc.ravel()

# accuracy = correct predictions / all predictions
accuracy_rcf = (tn_rcf + tp_rcf) / cm_rfc.sum()
# precision = true positives / everything predicted positive
precision_rcf = tp_rcf / (tp_rcf + fp_rcf)

print('Random Forest Performance -----------------')
for report_line, metric_value in (('Accuracy: {0:.2f}%', accuracy_rcf),
                                  ('Precision: {0:.2f}%', precision_rcf)):
    print(report_line.format(metric_value * 100))

Random Forest Performance -----------------
Accuracy: 89.63%
Precision: 88.94%


# 3. Using Artificial Neural Net Classification Approach

### 3.1 Setting Up ANN Model

In [15]:
# Importing Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initializing the ANN as a linear stack of layers
ann_classifier = Sequential()

# Adding the input layer and the first hidden layer.
# The number of input nodes is read from the training matrix instead of being
# hard-coded: CountVectorizer(max_features = 2000) yields at most 2000 columns
# and fewer when the vocabulary is smaller.
# The rectifier (ReLU) function is used for the hidden layers and the sigmoid
# function for the output layer, so the network output can be read as the
# probability that a review belongs to class 1 (a "pro").
# The layer arguments use the Keras 2 names (`units`, `kernel_initializer`);
# the Keras 1 names `output_dim` and `init` are no longer accepted.
ann_classifier.add(Dense(
    units = 1000, # number of nodes in this hidden layer; a common rule of thumb is the average of the number of nodes in the i/p and o/p layers, otherwise tune it with k-fold cross validation
    kernel_initializer = 'uniform', # initialize the weights to small numbers close to 0
    activation = 'relu', # rectifier activation function for the hidden layer
    input_dim = X_train.shape[1] # number of nodes in the i/p layer (number of independent variables)
))

# Adding a second hidden layer.
# input_dim is required only for the first layer; every later layer infers its
# input size from the output size of the layer before it.
ann_classifier.add(Dense(units = 1000, kernel_initializer = 'uniform', activation = 'relu'))

# Adding a third hidden layer
ann_classifier.add(Dense(units = 500, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
ann_classifier.add(Dense(
    units = 1, # binary output, so a single output node is enough
    kernel_initializer = 'uniform',
    activation = 'sigmoid' # sigmoid activation function for the output layer
    # for multi-class output we would need units = number of categories and activation = 'softmax'
    # softmax is the generalization of sigmoid to a dependent variable with more than 2 categories
))

Using TensorFlow backend.


### 3.2 Compiling the ANN

In [16]:
# Configure training for the whole network:
#   optimizer - 'adam', the stochastic-gradient-descent variant used to search for the weights
#   loss      - 'binary_crossentropy' for a binary o/p
#               (categorical_crossentropy would be used for a categorical o/p)
#   metrics   - accuracy is the criterion reported while evaluating the model
ann_classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

### 3.3 Fitting the ANN to the Training set

In [17]:
# Train the network on the training set.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`,
# which current Keras versions no longer accept.
# Neither batch_size nor epochs has an optimal default; good values have to be
# found by experimentation or performance tuning.
ann_classifier.fit(X_train, y_train, 
batch_size = 10, # number of observations processed before each weight update (backpropagation step)
epochs = 50 # number of full passes over the training set
)



<keras.callbacks.History at 0x186f35c0>

### 3.4 Predicting the Test set results

In [18]:
y_pred_ann = ann_classifier.predict(X_test)

# The network outputs probabilities rather than class labels, so we need to set
# a threshold to distinguish between 1 and 0.
# For sensitive applications a higher threshold would be appropriate;
# here we choose 50%.
# The comparison yields an (n, 1) True/False column (one output node per row);
# cast it to 0/1 integers and flatten it so the predictions have the same dtype
# and 1-D shape as y_test.
y_pred_ann = (y_pred_ann > 0.5).astype(int).ravel()

### 3.5 Confusion Matrix

In [19]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels.
cm_ann = confusion_matrix(y_true = y_test, y_pred = y_pred_ann)

### 3.6 Performance Measure of ANN Classification Model

In [20]:
# Flatten the 2x2 confusion matrix row by row: with labels 0/1 the cells are
# true negatives, false positives, false negatives, true positives.
tn_ann, fp_ann, fn_ann, tp_ann = cm_ann.ravel()

# accuracy = correct predictions / all predictions
accuracy_ann = (tn_ann + tp_ann) / cm_ann.sum()
# precision = true positives / everything predicted positive
precision_ann = tp_ann / (tp_ann + fp_ann)

print('Deep Learning Performance -----------------')
for report_line, metric_value in (('Accuracy: {0:.2f}%', accuracy_ann),
                                  ('Precision: {0:.2f}%', precision_ann)):
    print(report_line.format(metric_value * 100))

Deep Learning Performance -----------------
Accuracy: 88.56%
Precision: 84.81%


# 4. Conclusion

The deep learning approach did not perform better than the Random Forest model when both were run with hyperparameters chosen without tuning. We could probably achieve better performance with some hyperparameter tuning. However, tuning the ANN used here over different parameter settings is highly computation-intensive. Also, the RF classifier already provides satisfactory performance. Hence, the Random Forest classifier approach could be implemented in production. 