In [None]:
import collections

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text as tf_text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Print NumPy floats with 3 digits of precision and suppress scientific
# notation for small values.
np.set_printoptions(precision=3, suppress=True)


In [None]:
# Relative path of the CSV file that the later cells load with both
# tf.data and pandas.
fileName = '../Data/Reddit_News_DJIA.csv'

# Load CSV data with tf.data

In [None]:
# Stream the CSV as a dataset of raw text lines and display its element spec.
rawData = tf.data.TextLineDataset(filenames=fileName)
rawData.element_spec

# Load CSV data with pandas

In [None]:
# Read the whole CSV into a DataFrame and preview its first row.
redditNewsDf = pd.read_csv(filepath_or_buffer=fileName)
redditNewsDf.head(n=1)

In [None]:
# Every column except the date and the label is counted as a document column.
rows = redditNewsDf.shape[0]
cols = redditNewsDf.shape[1] - 2  # subtract date and label
N = rows * cols  # N = Number of documents
print('Rows =', rows)
print('Columns =', cols)
print('N: number of docs = ', N)

In [None]:
# Labels as a column vector (one row per DataFrame row); preview the first five.
dfLabels = redditNewsDf['Label'].values[:, np.newaxis]
dfLabels[:5]

In [None]:

# Take every column after the first two as text.
# NOTE(review): the cells presumably look like Python bytes literals
# (b'...' or b"...") -- confirm against the CSV.
rawHeadlines = np.asarray(redditNewsDf.iloc[:, 2:].values, dtype=str)

# Treat a leading "b" as a prefix only when the cell really is wrapped like a
# bytes literal. Stripping the character set {b, ', "} from both ends would
# also eat genuine leading/trailing "b" letters (e.g. "club" -> "clu").
singleQuoted = np.char.startswith(rawHeadlines, "b'") & np.char.endswith(rawHeadlines, "'")
doubleQuoted = np.char.startswith(rawHeadlines, 'b"') & np.char.endswith(rawHeadlines, '"')
isBytesLiteral = singleQuoted | doubleQuoted

# In a wrapped cell the first "b" is the prefix, so count=1 removes only that.
withoutPrefix = np.char.replace(rawHeadlines, 'b', '', count=1)
dfFeatures = np.where(isBytesLiteral, withoutPrefix, rawHeadlines)

# Surrounding quote characters are still trimmed from every headline.
dfFeatures = np.char.strip(dfFeatures, chars='\'"')

print(dfFeatures[0])

In [None]:
# Flatten the headline array into a 1-D array with one document per entry.
dfFeatures = dfFeatures.ravel()
dfFeatures.shape

In [None]:
# Give every headline the label of its row: each label is repeated `cols`
# times, which matches the row-major order used when flattening the headlines.
# The labels are rebuilt from the DataFrame so this cell can be re-run safely;
# multiplying an already-flattened `dfLabels` by np.ones((rows, cols)) would
# fail to broadcast. The float cast keeps the dtype that multiplication gave.
dfLabels = np.repeat(redditNewsDf.Label.values.astype(float), cols)
dfLabels.shape

# TensorFlow: tokenize the headlines and build a vocabulary

In [None]:
# Build one dataset of headline strings and one of int64 labels, then zip
# them into (headline, label) pairs.
headlineValues = list(dfFeatures)
labelValues = tf.cast(list(dfLabels), tf.int64)

featuresDS = tf.data.Dataset.from_tensor_slices(headlineValues)
labelDS = tf.data.Dataset.from_tensor_slices(labelValues)
labeledDS = tf.data.Dataset.zip((featuresDS, labelDS))


In [None]:
# Shared tokenizer instance; the `tokenize` function below calls its
# `tokenize` method on case-folded text.
tokenizer = tf_text.UnicodeScriptTokenizer()

In [None]:
# Tunable constants for the TensorFlow pipeline.
# NOTE(review): only VOCAB_SIZE is referenced in the cells visible here.
BUFFER_SIZE = 50_000
BATCH_SIZE = 64
VALIDATION_SIZE = 5_000
VOCAB_SIZE = 10_000  # maximum number of tokens kept in `vocab`

In [None]:
def tokenize(text):
  """Case-fold `text` and split it into tokens with the shared `tokenizer`."""
  return tokenizer.tokenize(tf_text.case_fold_utf8(text))

In [None]:
# Tokenize every headline in the features dataset.
tokenized_ds = featuresDS.map(map_func=tokenize)

In [None]:
# Show the tokens of the first five headlines.
for tokens in tokenized_ds.take(5):
  print("Tokens: ", tokens.numpy())

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  """Return `dataset` cached and then prefetched with an AUTOTUNE buffer size."""
  cached = dataset.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)

In [None]:
tokenized_ds = configure_dataset(tokenized_ds)

# Count how often each token occurs across all tokenized headlines.
# collections.Counter is the standard-library tally for this.
vocab_dict = collections.Counter()
for toks in tokenized_ds.as_numpy_iterator():
  vocab_dict.update(toks)

# Keep the VOCAB_SIZE most frequent tokens, most frequent first.
# most_common(n) is documented as equivalent to sorting by count in
# descending order and taking the first n items.
vocab = [token for token, count in vocab_dict.most_common(VOCAB_SIZE)]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

# scikit-learn: bag-of-words features and logistic regression

In [None]:
# Split without shuffling: the first `trainPercent` of the flattened documents
# are used for training and the remainder for testing.
trainPercent = 0.8
splitAt = int(trainPercent*N)  # index of the first test document

dfFeatures_train, dfFeatures_test = dfFeatures[:splitAt], dfFeatures[splitAt:]
dfLabels_train, dfLabels_test = dfLabels[:splitAt], dfLabels[splitAt:]

for description, subset in (
    ('Training features =', dfFeatures_train),
    ('Training labels =', dfLabels_train),
    ('Testing features =', dfFeatures_test),
    ('Testing labels =', dfLabels_test),
):
  print(description, subset.shape)

In [None]:
# Display the container type and the shape together. Only a cell's last
# expression is shown, so a bare type(...) on its own line is discarded.
type(dfFeatures), dfFeatures.shape

In [None]:
# Fit the bag-of-words vocabulary on the training documents only, limited to
# at most 10000 features.
vectorizer = CountVectorizer(max_features=10000).fit(dfFeatures_train)

# (word, column index) pairs of the fitted vocabulary.
vocabulary = np.array(list(vectorizer.vocabulary_.items()))
M = len(vocabulary)
print('First 5 indexed vocab words:\n', vocabulary[:5])
print('\nNumber of words =', M)

In [None]:
# Keep the document-term matrix sparse. Densifying the full
# (documents x vocabulary) count matrix is needless memory pressure, and
# LogisticRegression.fit accepts sparse input directly.
docMatrix_train = vectorizer.transform(dfFeatures_train)
print(dfFeatures_train[0])
# Densify only the five preview rows for display.
print('First 5 rows of Doc Matrix:\n', docMatrix_train[:5].toarray())

print('\nDoc Matrix Shape =', docMatrix_train.shape)

In [None]:
# Vectorize the test documents with the vocabulary fitted on the training set.
# The matrix is left sparse: the classifier's score/predict accept sparse input,
# so a dense copy is unnecessary.
# NOTE(review): `docMatric_test` looks like a typo for `docMatrix_test`; the
# name is kept because the scoring cell refers to it.
docMatric_test = vectorizer.transform(dfFeatures_test)

In [None]:
# LogisticRegression is imported with the other dependencies in the first cell.
# Train on the training document-term matrix and report mean accuracy on the
# held-out documents.
# NOTE(review): tol=0.1 is far looser than scikit-learn's default (1e-4);
# confirm this trade-off is intentional.
classifier = LogisticRegression(tol=0.1, max_iter=500)
classifier.fit(docMatrix_train, dfLabels_train)
score = classifier.score(docMatric_test, dfLabels_test)

print("Accuracy:", score)