In [1]:
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np

In [2]:
data = pd.read_json("./data/3xNCS.json")

In [3]:
data.head()

Unnamed: 0,sentence_id,label,text
0,8967,1,"In other words, I have seen his program costed..."
1,27385,1,"Our Navy is old -- excuse me, our Navy is smal..."
2,9818,1,"The unemployment, the number of people who are..."
3,16794,1,Mr. Ford uh - actually has fewer people now in...
4,17588,1,"Today it is up to about $38,000 of earnings th..."


In [4]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(data.text, data.label, test_size = 0.20, random_state = 0)

In [5]:
tokenizer = Tokenizer(
    num_words = 1000,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    oov_token = '<UNK',
    char_level=False
)
tokenizer.fit_on_texts(x_train)

In [6]:
# Get our training data word index
word_index = tokenizer.word_index
print("Word index:\n", word_index)

Word index:


In [None]:
# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(x_train)

# Get max training sequence length
maxlen = max([len(x) for x in train_sequences])

# Pad the training sequences
pad_type = trunc_type = 'pre'
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen, value=0.0)

# Output the results of our work

print("\nTraining sequences:\n", train_sequences)



Training sequences:
 [[22, 75, 452, 17, 2, 215, 5, 30, 10, 11, 500, 15], [27, 288, 27, 1, 12, 416, 46, 401, 119, 165, 33, 2, 1, 5, 1, 1, 2, 417, 5, 24, 46, 3, 19, 283, 4, 46, 401, 119, 165, 33, 30, 2, 108, 94, 24, 469, 4, 478, 1, 39, 26, 9, 2, 1, 46, 89, 64, 1, 57, 2, 779, 3, 115, 106, 1], [4, 276, 115, 367, 15, 69], [81, 7, 359, 380, 20, 6], [51, 14, 21, 501, 4, 1, 5, 110, 341, 14, 26, 23, 56, 111, 3, 2, 653, 5, 99, 1, 4, 96, 1], [40, 20, 654, 3, 760, 131, 65, 63, 44, 171, 447], [131, 131, 226], [41, 8, 170, 780, 15, 65, 3, 19, 121, 947, 121, 27, 2, 326, 1], [4, 1, 2, 175, 339, 781, 1, 54, 6, 85, 5, 84, 1, 432, 11, 1, 566, 6, 2, 1, 12, 1, 15, 237, 4, 124, 528, 852, 207, 630, 2, 1, 5, 2, 566, 4, 1, 6, 191, 1, 1, 1, 3, 237], [36, 52, 30, 276, 26, 27, 34], [35, 14, 63, 59, 83, 583, 167, 4, 70, 3, 567, 716, 250, 56, 8, 1, 22, 23, 27, 1], [7, 25, 10, 50, 153, 631, 248, 20, 203, 272, 9, 2, 479, 273, 222, 606, 4, 1, 4, 86, 226, 11, 1, 948, 1], [7, 28, 9, 1, 1, 4, 8, 676, 1, 48, 1, 4, 263, 7

In [None]:
print("maxlen: ", maxlen)
print("\nPadded training sequences:\n", train_padded)
print("\nPadded training shape:", train_padded.shape)
print("Training sequences data type:", type(train_sequences))
print("Padded Training sequences data type:", type(train_padded))

maxlen:  151

Padded training sequences:
 [[  0   0   0 ...  11 500  15]
 [  0   0   0 ... 115 106   1]
 [  0   0   0 ... 367  15  69]
 ...
 [  0   0   0 ...   6  10 100]
 [  0   0   0 ...   2 218   1]
 [  0   0   0 ...  17 197   6]]

Padded training shape: (8844, 151)
Training sequences data type: <class 'list'>
Padded Training sequences data type: <class 'numpy.ndarray'>


In [9]:
test_sequences = tokenizer.texts_to_sequences(x_valid)
test_padded = pad_sequences(test_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen, value=0.0)

print("Testing sequences:\n", test_sequences)
print("\nPadded testing sequences:\n", test_padded)
print("\nPadded testing shape:",test_padded.shape)

#x_valid = pad_sequences(x_valid, maxlen=max_review_length, padding=pad_type, truncating=trunc_type, value=0)

Testing sequences:
 [[62, 1, 377, 109, 5, 2, 1, 1], [10, 11, 874, 130, 24, 390, 119, 245, 42, 37, 4, 7, 146, 69, 37, 2, 87], [9, 2, 523, 1, 10, 68, 1, 13, 111, 3, 33, 1], [4, 75, 31, 3, 181, 17, 1, 69, 45, 7, 53, 25, 62, 1, 264, 299], [13, 1, 1, 2, 348, 432, 6, 1, 11, 1, 130, 1, 6, 161, 23, 777, 6, 656, 14, 274, 69, 58, 14, 726, 69], [10, 53, 151, 391, 18, 613, 174, 10, 729, 111, 42, 2, 1, 829], [13, 1, 320, 9, 195, 2, 320, 7, 61, 773, 190, 2, 320, 12, 1, 42, 1, 130, 15, 16, 548, 102, 1, 130], [22, 30, 263, 321, 86, 12, 6, 34, 384, 47, 1, 665, 4, 665, 5, 24, 61, 2, 101, 13, 12], [370, 41, 8, 443, 240, 27, 99, 278, 247, 22, 1, 23, 64, 1, 240], [49, 47, 55, 2, 354, 49, 47, 570, 3, 181, 182, 159, 301], [40, 13, 12, 593, 6, 48, 701, 4, 1, 1, 1, 15, 331, 153, 491, 3, 189, 8, 299, 59, 9, 164, 9, 476], [81, 7, 77, 8, 462, 1, 217, 1, 6, 14, 153, 1, 2, 525, 5, 327, 4, 1, 4, 685, 127, 42, 2, 191, 87], [4, 74, 145, 13, 447, 6, 224, 20, 18, 669, 4, 503, 6, 10, 21, 1, 3, 1, 119, 1, 1, 88, 392, 1, 1

In [10]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(train_padded, y_train)

In [11]:
testdata = pd.read_csv("./data/groundtruth.csv")

In [12]:
testdata.head()

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict
0,26,"You know, I saw a movie - ""Crocodile Dundee.""",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,9,26,0.0,0
1,80,We're consuming 50 percent of the world's coca...,Michael Dukakis,Governor,DEMOCRAT,1988-09-25.txt,8,80,-0.740979,1
2,129,That answer was about as clear as Boston harbor.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,9,129,0.0,-1
3,131,Let me help the governor.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,5,131,0.212987,-1
4,172,We've run up more debt in the last eight years...,Michael Dukakis,Governor,DEMOCRAT,1988-09-25.txt,22,172,-0.268506,1


In [13]:
x_test = testdata['Text']
test_xsequences = tokenizer.texts_to_sequences(x_test)
test_xpadded = pad_sequences(test_xsequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen, value=0.0)

In [14]:
# combining UFS sentences and CFS sentences into FC sentences
testdata.loc[testdata['Verdict'] == 1, 'label'] = 1
testdata.loc[testdata['Verdict'] == 0, 'label'] = 1
testdata.loc[testdata['Verdict'] == -1, 'label'] = 0

y = testdata['label'].astype(np.int64)

In [15]:
y_hat = classifier.predict(test_xpadded)

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(test_xpadded)
cm = confusion_matrix(y, y_pred)
print(cm)
accuracy_score(y, y_pred)

[[722   9]
 [234  67]]


0.7645348837209303