# <center>CNN With and Without Data Augmentation</center>

In [28]:
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px


In [29]:
def avgNestedLists(nested_vals):
    """
    Compute the position-wise mean of a collection of (possibly ragged) lists.

    Entry ``k`` of the result is the NaN-ignoring mean of the ``k``-th
    element of every inner list that is long enough to have one; shorter
    lists simply do not contribute to the later positions.

    Parameters
    ----------
    nested_vals : sequence of sequences of numbers
        The rows to average. They may have different lengths.

    Returns
    -------
    list
        One ``np.nanmean`` result per position, as long as the longest row
        (an empty list when there are no rows).
    """

    # The result has one entry per position of the longest row.
    width = max((len(row) for row in nested_vals), default=0)

    return [
        # Rows too short to have this position are left out of its mean.
        np.nanmean([row[pos] for row in nested_vals if pos < len(row)])
        for pos in range(width)
    ]

In [30]:
# Read the history csv files of training without data augmentation for the CNN model

# Each history file is read through the column named after the matching
# entry of `x` (e.g. '0.01'); presumably these are the dataset fractions.
x = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
CNN_NoAug_Results = []
# os.listdir returns names in arbitrary order, so sort them to keep the
# i-th file paired with the i-th entry of `x` on every platform.
# NOTE(review): assumes the file names sort in the same order as `x` - confirm.
for i, item in enumerate(sorted(os.listdir('./CNN_NOAUG/'))):
    _temp = pd.read_csv('./CNN_NOAUG/' + str(item))
    # Keep only the last value recorded in that column.
    CNN_NoAug_Results.append(_temp[str(x[i])].tolist()[-1])

CNN_NoAug_Results

[0.5174000263214111,
 0.7728999853134155,
 0.815166711807251,
 0.8565999865531921,
 0.8726332783699036,
 0.8759999871253967,
 0.8808333277702332,
 0.883899986743927,
 0.8869000077247621,
 0.8876500129699707,
 0.8957999944686891,
 0.90259999]

In [31]:
# Read the history csv files of training with data augmentation for the CNN model.
# We are reading the first iteration that was run.

CNN_Aug_Results_1 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./CNN_AUG/1/')):
    _temp = pd.read_csv('./CNN_AUG/1/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    CNN_Aug_Results_1.append(_temp['val_accuracy'].tolist()[-1])

CNN_Aug_Results_1

[0.5947999954223633,
 0.8061000108718872,
 0.8432999849319458,
 0.8589000105857849,
 0.8730000257492065,
 0.8769000172615051,
 0.8790000081062317,
 0.880299985408783,
 0.8765000104904175,
 0.8812000155448914,
 0.8827000260353088,
 0.8914999961853027]

In [32]:
# Read the history csv files of training with data augmentation for the CNN model.
# We are reading the second iteration that was run.

CNN_Aug_Results_2 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./CNN_AUG/2/')):
    _temp = pd.read_csv('./CNN_AUG/2/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    CNN_Aug_Results_2.append(_temp['val_accuracy'].tolist()[-1])

CNN_Aug_Results_2

[0.5097000002861023,
 0.7986999750137329,
 0.8334000110626221,
 0.8605999946594238,
 0.8705000281333923,
 0.8744999766349792,
 0.8715999722480774,
 0.8787999749183655,
 0.8802000284194946,
 0.8841000199317932,
 0.8869000077247621,
 0.8914999961853027]

In [33]:
# Read the history csv files of training with data augmentation for the CNN model.
# We are reading the third iteration that was run.

CNN_Aug_Results_3 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./CNN_AUG/3/')):
    _temp = pd.read_csv('./CNN_AUG/3/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    CNN_Aug_Results_3.append(_temp['val_accuracy'].tolist()[-1])

CNN_Aug_Results_3

[0.5281000137329102,
 0.8294000029563904,
 0.8120999932289124,
 0.8594999909400941,
 0.866100013256073,
 0.8776999711990356,
 0.8769000172615051,
 0.8805999755859375,
 0.878600001335144,
 0.8848000168800354,
 0.88919997215271,
 0.890999972820282]

In [34]:
# Combine the three augmented CNN runs into one list by averaging them
# position by position.

CNN_Aug_Results = avgNestedLists(
    [CNN_Aug_Results_1, CNN_Aug_Results_2, CNN_Aug_Results_3]
)
CNN_Aug_Results

[0.5442000031471252,
 0.8113999962806702,
 0.8295999964078268,
 0.859666665395101,
 0.869866689046224,
 0.87636665503184,
 0.8758333325386047,
 0.8798999786376953,
 0.878433346748352,
 0.8833666841189066,
 0.8862666686375936,
 0.8913333217302958]

In [36]:
# Draw the CNN validation accuracy for every dataset subset, one line for the
# runs with data augmentation and one for the run without it.

fig = go.Figure(data=[
    go.Scatter(x=x, y=CNN_Aug_Results, name='Validation Acc w Data Augmentation'),
    go.Scatter(x=x, y=CNN_NoAug_Results, name='Validation Acc w/o Data Augmentation'),
])

# Anchor the legend's top-left corner at (0.65, 0.15) and centre the title.
fig.update_layout(
    legend={'yanchor': "top", 'y': 0.15, 'xanchor': "left", 'x': 0.65},
    title={
        'text': "Validation Accuracy of CNN With Data Augmentation",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)
fig.show()

# Save the figure both as an HTML page and as a JPEG image.
fig.write_html('CNN_DataAugment_Acc.html')
fig.write_image('CNN_DataAugment_Acc.jpeg')

In [37]:
# Load the full dataset and count how many rows carry each sentiment label.

data = pd.read_csv('IMDB Dataset.csv')
sentiment_df = data.loc[:, 'sentiment'].value_counts()
sentiment_df

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [38]:
# EDA: bar chart of the number of positive and negative reviews.

fig = px.bar(
    sentiment_df,
    x='sentiment',
    title='Number of Positive and Negative Reviews',
)
# The counts are passed as x, so the x axis is labelled 'Count'.
fig.update_layout(xaxis={'title': 'Count'}, yaxis={'title': 'Sentiment'})
fig.show()
fig.write_image('Sentiment_EDA.jpeg')

In [39]:
# Read the history of the RNN model trained without data augmentation

# Each history file is read through the column named after the matching
# entry of `x` (e.g. '0.01'); presumably these are the dataset fractions.
x = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
RNN_NoAug_Results = []
# os.listdir returns names in arbitrary order, so sort them to keep the
# i-th file paired with the i-th entry of `x` on every platform.
# NOTE(review): assumes the file names sort in the same order as `x` - confirm.
for i, item in enumerate(sorted(os.listdir('./RNN_NOAUG/'))):
    _temp = pd.read_csv('./RNN_NOAUG/' + str(item))
    # Keep only the last value recorded in that column.
    RNN_NoAug_Results.append(_temp[str(x[i])].tolist()[-1])

RNN_NoAug_Results

[0.5036000013351439,
 0.4973999857902527,
 0.5051000118255615,
 0.5040000081062317,
 0.4846999943256378,
 0.8314999938011169,
 0.8574000000953674,
 0.8633000254631042,
 0.8528000116348267,
 0.8723000288009644,
 0.8715000152587891,
 0.8830000159999999]

In [40]:
# Read the history of the RNN model trained with data augmentation.
# We are reading the first iteration.

RNN_Aug_Results_1 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./RNN_AUG/1/')):
    _temp = pd.read_csv('./RNN_AUG/1/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    RNN_Aug_Results_1.append(_temp['val_accuracy'].tolist()[-1])

RNN_Aug_Results_1

[0.5049999952316284,
 0.7465999722480774,
 0.8177000284194946,
 0.8348000049591064,
 0.8597999811172485,
 0.8639000058174133,
 0.86080002784729,
 0.8695999979972839,
 0.8823999762535095,
 0.8812000155448914,
 0.8784000277519226,
 0.8823000192642212]

In [41]:
# Read the history of the RNN model trained with data augmentation.
# We are reading the second iteration.

RNN_Aug_Results_2 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./RNN_AUG/2/')):
    _temp = pd.read_csv('./RNN_AUG/2/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    RNN_Aug_Results_2.append(_temp['val_accuracy'].tolist()[-1])

RNN_Aug_Results_2

[0.5023999810218811,
 0.5138000249862671,
 0.79830002784729,
 0.8424999713897705,
 0.8521000146865845,
 0.8657000064849854,
 0.8669999837875366,
 0.8658000230789185,
 0.8823000192642212,
 0.8787999749183655,
 0.8834999799728394,
 0.8866999745368958]

In [42]:
# Read the history of the RNN model trained with data augmentation.
# We are reading the third iteration.

RNN_Aug_Results_3 = []
# os.listdir makes no ordering guarantee, so sort the names to get a
# reproducible order that lines up with the other result lists.
# NOTE(review): assumes the file names sort in order of subset size - confirm.
for item in sorted(os.listdir('./RNN_AUG/3/')):
    _temp = pd.read_csv('./RNN_AUG/3/' + str(item))
    # Keep the validation accuracy from the last row of the history.
    RNN_Aug_Results_3.append(_temp['val_accuracy'].tolist()[-1])

RNN_Aug_Results_3

[0.5020999908447266,
 0.7559999823570251,
 0.8266000151634216,
 0.8357999920845032,
 0.8565000295639038,
 0.5164999961853027,
 0.8708000183105469,
 0.8783000111579895,
 0.8745999932289124,
 0.8776999711990356,
 0.8862000107765198,
 0.881600022315979]

In [43]:
# Combine the three augmented RNN runs into one list by averaging them
# position by position.

RNN_Aug_Results = avgNestedLists(
    [RNN_Aug_Results_1, RNN_Aug_Results_2, RNN_Aug_Results_3]
)
RNN_Aug_Results

[0.503166655699412,
 0.6721333265304565,
 0.8142000238100687,
 0.8376999894777933,
 0.8561333417892456,
 0.7487000028292338,
 0.8662000099817911,
 0.871233344078064,
 0.8797666629155477,
 0.8792333205540975,
 0.8827000061670939,
 0.8835333387056986]

In [44]:
# Draw the RNN validation accuracy, one line for the runs with data
# augmentation and one for the run without it.

fig = go.Figure(data=[
    go.Scatter(x=x, y=RNN_Aug_Results, name='Validation Acc w Data Augmentation'),
    go.Scatter(x=x, y=RNN_NoAug_Results, name='Validation Acc w/o Data Augmentation'),
])

# Anchor the legend's top-left corner at (0.65, 0.15) and centre the title.
fig.update_layout(
    legend={'yanchor': "top", 'y': 0.15, 'xanchor': "left", 'x': 0.65},
    title={
        'text': "Validation Accuracy of RNN With Data Augmentation",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()

# Save the figure both as an HTML page and as a JPEG image.
fig.write_html('RNN_DataAugment_Acc.html')
fig.write_image('RNN_DataAugment_Acc.jpeg')

In [47]:
# Load the BERT history from ./BERT.csv into `df`, reading the file as UTF-8.
df = pd.read_csv("./BERT.csv", encoding='UTF-8')

In [103]:
# We are interested in the validation accuracy only
BERT_Results = df['val_acc'].tolist()

In [49]:
# Overlay the averaged RNN and CNN results (with data augmentation) and the
# BERT validation accuracies on the shared `x` values.
# NOTE(review): the earlier comment said "over epochs", but `x` is the list
# defined with the no-augmentation results - confirm what each BERT.csv row is.

fig = go.Figure(data=[
    go.Scatter(x=x, y=RNN_Aug_Results, name='RNN w Data Augmentation'),
    go.Scatter(x=x, y=CNN_Aug_Results, name='CNN w Data Augmentation'),
    go.Scatter(x=x, y=BERT_Results, name='BERT w/o Data Augmentation'),
])

# Anchor the legend's top-left corner at (0.72, 0.2) and centre the title.
fig.update_layout(
    legend={'yanchor': "top", 'y': 0.2, 'xanchor': "left", 'x': 0.72},
    title={
        'text': "Validation Accuracy of RNN, CNN w Data Augmentation vs BERT",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()

# Save the figure both as an HTML page and as a JPEG image.
fig.write_html('BERTvsCNNRNN.html')
fig.write_image('BERTvsCNNRNN.jpeg')