### Sentiment Analysis using a Fully Connected Neural Network

In [126]:
import numpy as np
import pandas as pd

In [127]:
# Load the pre-split training and validation tables from the parent directory.
train, val = (pd.read_csv(csv_path) for csv_path in ("../train.csv", "../val.csv"))

In [128]:
from sklearn.feature_extraction.text import CountVectorizer

In [129]:
# Bag-of-words token counter with default settings; its vocabulary is learned
# later from the training text and then reused unchanged for the validation text.
vectorizer = CountVectorizer() 

In [130]:
# Columns are picked by position: column 1 is the score that gets binned into
# classes later, column 2 is the text that gets vectorized.
train_labels, train_words = train.iloc[:, 1], train.iloc[:, 2]

In [131]:
def to_vec(arr):
    """One-hot encode sentiment scores into five ordered classes.

    Each score is assigned to the first bin whose inclusive upper edge it does
    not exceed: ``<= 0.2``, ``<= 0.4``, ``<= 0.6``, ``<= 0.8``, ``<= 1``.
    Scores below 0.2 (including negative ones) fall into the first bin.

    Parameters
    ----------
    arr : pandas.Series or 1-D array-like of numbers
        Scores to encode. Values are read positionally; any index is ignored.

    Returns
    -------
    pandas.DataFrame
        Integer frame of shape ``(len(arr), 5)`` with a default RangeIndex and
        columns ``0..4``; exactly one ``1`` per row. Empty input yields a
        ``(0, 5)`` frame.

    Raises
    ------
    ValueError
        If ``arr`` is not one-dimensional, or if any score is NaN or greater
        than 1 (the message names the first offending value and its position).
    """
    # Inclusive upper edge of each class, written as literals so comparisons
    # agree exactly with `x <= 0.2`, `x <= 0.4`, ... on float input.
    upper_edges = np.array([0.2, 0.4, 0.6, 0.8, 1.0])

    scores = np.asarray(arr, dtype=float)
    if scores.ndim != 1:
        raise ValueError(
            f"to_vec expects one-dimensional scores, got shape {scores.shape}"
        )

    # side="left" returns i with edges[i-1] < x <= edges[i], i.e. the same
    # inclusive-upper-edge rule as an if/elif chain of `x <= edge` tests.
    # NaN and anything above the last edge come back as len(upper_edges).
    class_idx = np.searchsorted(upper_edges, scores, side="left")

    out_of_range = class_idx >= len(upper_edges)
    if out_of_range.any():
        first_bad = int(np.argmax(out_of_range))
        raise ValueError(
            f"score {scores[first_bad]!r} at position {first_bad} "
            "is NaN or greater than 1"
        )

    # Row i of the identity matrix is the one-hot vector for class i.
    one_hot = np.eye(len(upper_edges), dtype=np.int64)[class_idx]
    return pd.DataFrame(one_hot)

In [132]:
# Build the targets straight from `train` rather than from the previous value
# of `train_labels`: rebinding a name to a transform of itself means a second
# run of this cell would pass the already-encoded frame back into to_vec.
train_labels = to_vec(train.iloc[:, 1])
# Learn the vocabulary on the training text only and produce its count matrix.
train_features = vectorizer.fit_transform(train_words)
print(train_features.shape)
print(train_labels.shape)

(191385, 18018)
(191385, 5)


In [133]:
# Validation split: same column positions as the training split. The text is
# encoded with transform() only, so it reuses the vocabulary already fitted.
val_words, val_labels = val.iloc[:, 2], to_vec(val.iloc[:, 1])
val_features = vectorizer.transform(val_words)
print(val_labels.shape, val_features.shape, sep="\n")

(23923, 5)
(23923, 18018)


In [134]:
from sklearn import preprocessing 
# Rescale every row (document) to unit L2 norm. "l2" is normalize()'s default,
# spelled out here so the choice is visible.
train_features = preprocessing.normalize(train_features, norm="l2")
val_features = preprocessing.normalize(val_features, norm="l2")

### Fully Connected Neural Network

In [135]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

In [136]:
# Fixed seed so weight initialisation and mini-batch shuffling are repeatable
# when the notebook is restarted and run from the top.
RANDOM_SEED = 42

# verbose=True prints the loss after every iteration.
# warm_start=True makes a repeated fit() call continue from the current weights
# instead of re-initialising, so re-running the training cell resumes training.
neural_net = MLPClassifier(verbose=True, warm_start=True, random_state=RANDOM_SEED)

In [None]:
# Train on the normalised count matrix; the targets are the five-column
# one-hot frame built by to_vec.
neural_net.fit(X=train_features, y=train_labels)

Iteration 1, loss = 1.90334580
Iteration 2, loss = 1.49005899
Iteration 3, loss = 1.37995925
Iteration 4, loss = 1.30796301
Iteration 5, loss = 1.24974313
Iteration 6, loss = 1.19978869
Iteration 7, loss = 1.15564687
Iteration 8, loss = 1.11591824
Iteration 9, loss = 1.07987237
Iteration 10, loss = 1.04707728
Iteration 11, loss = 1.01605750
Iteration 12, loss = 0.98722461
Iteration 13, loss = 0.96028024
Iteration 14, loss = 0.93520971
Iteration 15, loss = 0.91120840
Iteration 16, loss = 0.88824172
Iteration 17, loss = 0.86688686
Iteration 18, loss = 0.84633486
Iteration 19, loss = 0.82729952
Iteration 20, loss = 0.80832612
Iteration 21, loss = 0.79124250
Iteration 22, loss = 0.77518092
Iteration 23, loss = 0.75939748
Iteration 24, loss = 0.74438497
Iteration 25, loss = 0.73087342
Iteration 26, loss = 0.71744769
Iteration 27, loss = 0.70529355
Iteration 28, loss = 0.69295867


### Metrics

In [None]:
# Score the held-out validation matrix. Predicting on `train_features` here
# would give one prediction per *training* row, which cannot be compared with
# the validation labels used in the cells below.
#
# The model was fitted on one-hot targets, so predict() returns a hard 0/1
# indicator row per sample that may contain no 1 at all (argmax would then
# silently pick class 1) or several. Taking the argmax of the per-class
# probabilities always selects the single most likely class instead.
val_scores = neural_net.predict_proba(val_features)

# Column index 0..4 -> class number 1..5.
val_pred = np.argmax(val_scores, axis=1) + 1

In [None]:
# Collapse the one-hot validation targets to class numbers: the position of
# the 1 in each row (0..4) shifted to 1..5.
val_labels = np.argmax(val_labels.to_numpy(), axis=1) + 1

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of validation rows whose predicted class equals the true class.
# Accuracy is symmetric in its two arguments; keywords just make the roles explicit.
accuracy_score(y_true=val_labels, y_pred=val_pred)

### Graphing

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import rc
# Plain white seaborn theme, and route all figure text through LaTeX
# (this needs a working TeX installation on the machine running the notebook).
sns.set_style("white")
plt.rcParams["text.usetex"] = True

In [None]:
# Distribution of the true class (y) for each predicted class (x).
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises a TypeError there (and warned before that).
fig, ax = plt.subplots()
sns.boxplot(x=val_pred, y=val_labels, ax=ax)
ax.set_xlabel("Predicted Value", fontsize="medium")
ax.set_ylabel("Actual Value", fontsize="medium")
ax.set_title("Neural Net Performance")
fig.tight_layout()
fig.savefig("neural net box.png", bbox_inches="tight", dpi=400)

In [None]:
# Same comparison as a violin plot: density of the true class (y) within each
# predicted class (x).
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises a TypeError there (and warned before that).
fig, ax = plt.subplots()
sns.violinplot(x=val_pred, y=val_labels, ax=ax)
ax.set_xlabel("Predicted Value", fontsize="medium")
ax.set_ylabel("Actual Value", fontsize="medium")
ax.set_title("Neural Net Performance")
fig.tight_layout()
# NOTE(review): the target directory "visualization/" must already exist,
# savefig does not create it.
fig.savefig("visualization/neural net violin.png", bbox_inches="tight", dpi=400)