In [374]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from keras.layers import Embedding, Flatten
from bs4 import BeautifulSoup
import re
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [375]:
#dataset from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
#too big to upload from browser into github
df = pd.read_csv('IMDB Dataset.csv',encoding='latin-1')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [376]:
df.shape

(50000, 2)

In [377]:
sentences = df['review'].values
y = df['sentiment'].values

In [378]:
# loop through every sentence and filter our capitalizations,periods,double spaces, etc. 
for i, s in enumerate(sentences):
    soup = BeautifulSoup(sentences[i], "html.parser")
    sentences[i] = soup.get_text()
    sentences[i]= re.sub('\[[^]]*\]', ' ', sentences[i])
    sentences[i] = re.sub('[^a-zA-Z]', ' ', sentences[i])
    sentences[i] = re.sub(' +', ' ', sentences[i])
    sentences[i] = sentences[i].lower()

In [379]:
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)

In [380]:
X = tokenizer.texts_to_sequences(sentences)
print(X[0])


[28, 4, 1, 77, 46, 1051, 11, 100, 149, 41, 394, 20, 230, 29, 32, 25, 204, 14, 10, 6, 612, 47, 590, 17, 68, 1, 88, 148, 11, 68, 44, 13, 91, 2, 135, 4, 559, 61, 265, 8, 204, 37, 1, 647, 141, 1721, 68, 10, 6, 23, 3, 116, 16, 1, 40, 10, 116, 56, 17, 5, 1455, 371, 40, 559, 91, 6, 8, 1, 355, 356, 4, 1, 647, 7, 6, 432, 14, 11, 6, 1, 357, 5, 1, 1030, 7, 1399, 22, 518, 34, 4, 1, 1183, 115, 30, 1, 27, 2, 385, 36, 6, 23, 297, 22, 1, 518, 6, 341, 5, 107, 2, 52, 36, 324, 2, 25, 111, 223, 240, 9, 60, 132, 1, 280, 1315, 4, 1, 116, 6, 677, 5, 1, 192, 11, 7, 266, 115, 77, 273, 570, 21, 819, 182, 1292, 16, 1214, 819, 1420, 819, 865, 152, 21, 939, 184, 1, 88, 394, 9, 123, 210, 68, 14, 36, 1606, 7, 13, 9, 411, 21, 132, 9, 13, 1568, 16, 7, 18, 14, 9, 290, 52, 9, 1403, 3, 1255, 16, 2, 190, 5, 1, 297, 4, 559, 23, 41, 559, 18, 35, 230, 29, 43, 16, 3, 35, 230, 494, 22, 627, 2, 75, 240, 17, 7, 69, 638, 694, 109, 649, 83, 1183, 677, 5, 66, 564, 4, 891, 1999, 40, 1183, 549, 149, 20, 197, 425, 17, 47, 6, 795, 1582

In [381]:
X = pad_sequences(X) 
print('X.shape = ', X.shape)

X.shape =  (50000, 1934)


In [382]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [383]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle = True, random_state=42)
X_train.shape

(37500, 1934)

In [384]:
#model with embedding layer
#flatten to fix the shape error
#output is sigmoid with 1 neuron since it is binary classification

#embedding help the accuracy reach about 87%, with a dense layer the model stayed at 50-51% the whole time
vocab_size = len(tokenizer.word_index)+1

model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=1934))
model.add(Flatten())
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])

In [385]:
model.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 1934, 50)          5069450   
                                                                 
 flatten_22 (Flatten)        (None, 96700)             0         
                                                                 
 dense_25 (Dense)            (None, 1)                 96701     
                                                                 
Total params: 5,166,151
Trainable params: 5,166,151
Non-trainable params: 0
_________________________________________________________________


In [386]:

# reduce = ReduceLROnPlateau(monitor='val_loss', mode='min',factor=0.2,patience=3, min_lr=0.001)

# my_callbacks = [reduce]

In [387]:
#model overfits after a couple epochs because training accuracy gets to 1.0 while val_acc still has ways to go.
#A way to fix overfitting would be to reduce learning rate but in about 2 epochs it was capping out so it went up too fast to stabalize
with tf.device('/GPU:0'):
    history=model.fit(X_train,y_train, epochs=4, verbose=True, validation_data=(X_test,y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [388]:

twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

#import 20 newgroup into a dataframe with the correct target column
df = pd.DataFrame([twenty_train.data, twenty_train.target.tolist()]).T
df.columns = ['text', 'target']

df2 = pd.DataFrame([twenty_test.data, twenty_test.target.tolist()]).T
df2.columns = ['text', 'target']

concat = [df,df2]

df3 = pd.concat(concat,ignore_index=True)

In [389]:
df3

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
18841,From: richmond@spiff.Princeton.EDU (Stupendous...,14
18842,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4
18843,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9
18844,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6


In [390]:
#get the texts into a list and set y to the targers
sentences = df3['text'].values
y = df3['target'].values

In [391]:
#filter out the capitals, periods, etc. 
for i, s in enumerate(sentences):
    soup = BeautifulSoup(sentences[i], "html.parser")
    sentences[i] = soup.get_text()
    sentences[i]= re.sub('\[[^]]*\]', ' ', sentences[i])
    sentences[i] = re.sub('[^a-zA-Z]', ' ', sentences[i])
    sentences[i] = re.sub(' +', ' ', sentences[i])
    sentences[i] = sentences[i].lower()

In [392]:
#tokenize the texts
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)

In [393]:
#set the tokens to the X
X = tokenizer.texts_to_sequences(sentences)
print(X[0])

[13, 2890, 1103, 17, 144, 14, 41, 231, 31, 39, 262, 8, 18, 85, 74, 80, 2890, 1103, 17, 34, 69, 3, 2761, 509, 1595, 32, 6, 27, 1120, 28, 149, 64, 43, 105, 60, 16, 18, 262, 6, 641, 1, 79, 236, 10, 27, 4, 1166, 2226, 262, 1096, 2, 20, 13, 1, 1273, 14, 798, 14, 10, 27, 307, 4, 1, 3650, 73, 165, 447, 7, 1346, 1, 722, 27, 1775, 13, 1, 737, 3, 1, 609, 18, 8, 45, 6, 77, 28, 149, 35, 4, 742, 280, 965, 3031, 193, 3, 2775, 144, 18, 262, 8, 216, 523, 26, 728, 351, 12, 22, 16, 18, 333, 262, 167, 92, 174, 199, 1363, 1282, 2, 12, 37, 56]


In [394]:
#make them all the same length
X = pad_sequences(X) 
print('X.shape = ', X.shape)

X.shape =  (18846, 14111)


In [395]:
#makes sure the inputs are floats
X = np.asarray(X).astype('float32')
y = np.asarray(y).astype('float32')

In [396]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle = True, random_state=42)
X_train.shape

(14134, 14111)

In [397]:
#same model but with softmax output for each target, and sparse categorical loss
vocab_size = len(tokenizer.word_index)+1

model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=14111))
model.add(Flatten())
model.add(layers.Dense(20, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['acc'])

In [398]:
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, 14111, 50)         5248150   
                                                                 
 flatten_23 (Flatten)        (None, 705550)            0         
                                                                 
 dense_26 (Dense)            (None, 20)                14111020  
                                                                 
Total params: 19,359,170
Trainable params: 19,359,170
Non-trainable params: 0
_________________________________________________________________


In [399]:
with tf.device('/GPU:0'):
    history=model.fit(X_train,y_train, epochs=15, verbose=True, validation_data=(X_test,y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [400]:
#running the predction function
y_pred = model.predict(X_test[0:1])

#printing the prediction at the index and using argmax to print out the highest predicted value of the output to get result
print("max: ",np.argmax(y_pred[0]))
print("actual: ",y_test[0])
print("pred: ",y_pred[0])

max:  7
actual:  7.0
pred:  [5.8729828e-08 3.5482334e-04 1.0748244e-05 1.5417066e-04 3.9424252e-05
 3.5082982e-07 2.1597023e-05 9.9813831e-01 4.2570074e-04 1.7996042e-07
 4.3919854e-09 5.6350050e-08 2.9821193e-04 9.6691590e-05 4.5782377e-04
 1.7474353e-08 1.1354124e-06 1.3581185e-07 4.5496367e-07 2.1249367e-08]
