In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os

In [2]:
def load_data(training_arr, testing_arr, path):
    # Load training set
    for folder in os.listdir(path):
        extended_path = path+"/"+folder
        if folder != 'fold5':
            for filename in os.listdir(extended_path):
                if filename.endswith('.txt'):
                    with open(os.path.join(extended_path, filename), encoding='utf-8') as f:
                        training_arr.append(f.read())
        else:
            for filename in os.listdir(extended_path):
                if filename.endswith('.txt'):
                    with open(os.path.join(extended_path, filename), encoding='utf-8') as f:
                        testing_arr.append(f.read())

In [3]:
negative_real_path = 'op_spam_v1.4/negative_polarity/deceptive_from_MTurk'

# negative_false_path = 'op_spam_v1.4/negative_polarity/deceptive_from_MTurk'

print ("TEST:", os.listdir(negative_real_path))

neg_real_train = []
neg_real_test = []
neg_fake_train = []
neg_fake_test = []

neg_real_path = 'op_spam_v1.4/negative_polarity/truthful_from_Web'
neg_fake_path = 'op_spam_v1.4/negative_polarity/deceptive_from_MTurk'



TEST: ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']


In [4]:
load_data(neg_real_train, neg_real_test, neg_real_path)
load_data(neg_fake_train, neg_fake_test, neg_fake_path)

#corpus = np.concatenate((neg_real_train, neg_real_test, neg_fake_train, neg_fake_test))

corpus = np.concatenate((neg_real_train,neg_fake_train))

test_corpus = np.concatenate((neg_real_test,neg_fake_test))

vectorizer = CountVectorizer()

# Fit and transform the corpus
X = vectorizer.fit_transform(corpus)

# Display the vocabulary
print("Vocabulary:", vectorizer.vocabulary_)

# Display the Bag of Words frequency matrix
print("Bag of Words (as array):\n", X.toarray())

Bag of Words (as array):
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
vocab = vectorizer.get_feature_names_out()
word_counts = np.asarray(X.sum(axis=0)).flatten()
word_freq = list(zip(vocab, word_counts))
sorted_word_freq = sorted(word_freq, key=lambda x: x[1], reverse=True)
for word, count in sorted_word_freq:
    print(f"{word}: {count}")

the: 7542
to: 3525
and: 3340
was: 2812
in: 1945
of: 1592
we: 1529
room: 1490
for: 1459
it: 1405
hotel: 1378
that: 1218
not: 1174
my: 1155
at: 1099
had: 919
on: 850
they: 850
were: 811
but: 782
this: 782
with: 773
is: 766
our: 658
there: 611
have: 590
when: 587
be: 586
you: 557
chicago: 535
stay: 531
as: 487
me: 471
would: 467
very: 436
so: 419
no: 412
all: 407
from: 394
one: 388
service: 387
up: 371
out: 349
an: 313
are: 311
desk: 311
get: 305
us: 304
did: 303
night: 301
staff: 299
after: 296
if: 288
like: 279
about: 265
rooms: 258
even: 257
stayed: 253
or: 248
front: 244
could: 239
time: 235
just: 233
will: 232
again: 230
bed: 227
what: 227
only: 225
been: 219
by: 218
which: 211
back: 206
got: 201
more: 189
check: 188
never: 187
their: 185
first: 183
called: 178
because: 177
two: 177
didn: 176
good: 176
here: 174
nice: 174
some: 174
other: 173
than: 173
also: 171
day: 171
bathroom: 170
down: 169
told: 168
experience: 166
them: 166
do: 164
he: 164
next: 163
great: 161
made: 157
better:

In [6]:
neg_real_train[0]


'My $200 Gucci sunglasses were stolen out of my bag on the 16th. I filed a report with the hotel security and am anxious to hear back from them. This was such a disappointment, as we liked the hotel and were having a great time in Chicago. Our room was really nice, with 2 bathrooms. We had 2 double beds and a comfortable hideaway bed. We had a great view of the lake and park. The hotel charged us $25 to check in early (10am).\n'

In [7]:
import pandas as pd
entries = neg_real_train + neg_fake_train
labels = ['real'] * len(neg_real_train) + ['fake'] * len(neg_fake_train)

# Create DataFrame
df = pd.DataFrame({
    'text': entries,
    'label': labels
})

print(df)

                                                  text label
0    My $200 Gucci sunglasses were stolen out of my...  real
1    This was a gorgeous hotel from the outside and...  real
2    The hotel is very impressive upon entering and...  real
3    Going to the Internet Retailer 2010 at the las...  real
4    I checked into this hotel, Rm 1760 on 11/13/20...  real
..                                                 ...   ...
635  Grant it, this hotel seems very nice, but I wa...  fake
636  I booked a room at Swissotel Chicago because I...  fake
637  My husband and I arrived at the Swissotel Chic...  fake
638  I had really high hopes for this hotel. The lo...  fake
639  The Swissotel Chicago is a very mediocre hotel...  fake

[640 rows x 2 columns]


In [8]:
entries_test = neg_real_test + neg_fake_test
labels_test = ['real'] * len(neg_real_test) + ['fake'] * len(neg_fake_test)

# Create DataFrame
df_test = pd.DataFrame({
    'text': entries_test,
    'label': labels_test
})

print(df_test)

                                                  text label
0    This hotel must have originally been an ordina...  real
1    My Husband and I did not enjoy stay at Hotel A...  real
2    I stayed at the Allegro Hotel while interviewi...  real
3    This is about the fourth time we stayed at the...  real
4    This hotel is rather stupid to be frank. It's ...  real
..                                                 ...   ...
155  Overall, the hotel was okay. Though I did have...  fake
156  I was not pleased with my recent stay at the P...  fake
157  Our visit started off on the wrong foot when w...  fake
158  Though grand and having a brand, this hotel se...  fake
159  The Palmer House Hilton, while it looks good i...  fake

[160 rows x 2 columns]


In [13]:
df.to_pickle("untracked_data/data_prepro_train_01.pkl")
df_test.to_pickle("untracked_data/data_prepro_test_01.pkl")

In [12]:
print("hey")

hey
