In [1]:
from data_pipeline import Pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import pandas as pd
df = pd.read_csv('amazon_movie_reviews.csv')

KeyboardInterrupt: 

In [None]:
text_data = df['text'].iloc[:100]

In [None]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from nltk.tokenize import word_tokenize
import nltk

In [None]:
def preprocess(corpus):
    tokens = corpus.str.lower().str.split().explode().reset_index(name='token')

    vocab = {word: idx for idx, word in enumerate(tokens['token'].unique())}

    tokens['token_index'] = tokens['token'].map(vocab)
    
    preprocessed_corpus = tokens.groupby('index')['token_index'].agg(list)

    return vocab, preprocessed_corpus

def encode(tokens, method='bow'):
    if isinstance(tokens.iloc[0], list):  # Check if the first element is a list (indicative of token indices)
        str_corpus = tokens.apply(lambda x: ' '.join(map(str, x)))
    else:
        str_corpus = tokens

    if method == 'bow':
        # Bag of Words Encoding
        vectorizer = CountVectorizer()
        encoded_data = vectorizer.fit_transform(str_corpus)
        return encoded_data, vectorizer.get_feature_names_out()

    elif method == 'tfidf':
        # TF-IDF Encoding
        vectorizer = TfidfVectorizer()
        encoded_data = vectorizer.fit_transform(str_corpus)
        return encoded_data, vectorizer.get_feature_names_out()

    elif method == 'word2vec':
        #word2vec encoding model
        tokenized_docs = [word_tokenize(doc.lower()) for doc in str_corpus]
        word2vec_model = Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=2, workers=4, epochs=10)
        return word2vec_model

    else:
        raise ValueError("Unsupported encoding method")

In [9]:
# pipeline = Pipeline()


vocabulary, processed_tokens = preprocess(text_data)

print(vocabulary, processed_tokens)

{'amazon,': 0, 'please': 1, 'buy': 2, 'the': 3, 'show!': 4, "i'm": 5, 'hooked!': 6, 'my': 7, 'kiddos': 8, 'love': 9, 'this': 10, 'show!!': 11, 'annabella': 12, 'sciorra': 13, 'did': 14, 'her': 15, 'character': 16, 'justice': 17, 'with': 18, 'portrayal': 19, 'of': 20, 'a': 21, 'mentally': 22, 'ill,': 23, 'depressed': 24, 'and': 25, 'traumatized': 26, 'individual': 27, 'who': 28, 'projects': 29, 'much': 30, 'inner': 31, 'wounds': 32, 'onto': 33, 'others.': 34, 'challenges': 35, 'she': 36, 'faces': 37, 'father': 38, 'were': 39, 'sensitively': 40, 'portrayed': 41, 'resonate': 42, 'understanding': 43, 'love.': 44, 'ending': 45, 'really': 46, "isn't": 47, 'an': 48, 'ending,': 49, 'though': 50, 'feels': 51, 'like': 52, 'it': 53, 'was': 54, 'abandoned': 55, 'not': 56, 'enough': 57, 'closure': 58, 'but': 59, 'other': 60, 'than': 61, 'that,': 62, 'its': 63, 'decent': 64, 'movie': 65, 'to': 66, 'sit': 67, 'through': 68, 'if': 69, "you're": 70, 'type': 71, 'person': 72, 'likes': 73, 'people-watch'

In [10]:
encoded, feature_names = encode(processed_tokens, method='bow')
print(encoded, '\n', feature_names)

  (1, 0)	1
  (1, 111)	1
  (2, 221)	1
  (2, 232)	1
  (2, 243)	1
  (2, 254)	4
  (2, 265)	1
  (2, 276)	1
  (2, 287)	4
  (2, 298)	1
  (2, 309)	5
  (2, 320)	3
  (2, 331)	1
  (2, 342)	1
  (2, 353)	1
  (2, 364)	4
  (2, 375)	1
  (2, 386)	1
  (2, 397)	2
  (2, 408)	1
  (2, 419)	1
  (2, 430)	1
  (2, 441)	1
  (2, 452)	1
  (2, 463)	2
  :	:
  (99, 323)	1
  (99, 344)	1
  (99, 396)	1
  (99, 428)	1
  (99, 504)	1
  (99, 523)	1
  (99, 929)	1
  (99, 962)	1
  (99, 995)	1
  (99, 61)	1
  (99, 205)	1
  (99, 206)	1
  (99, 207)	1
  (99, 208)	1
  (99, 209)	1
  (99, 210)	1
  (99, 212)	1
  (99, 213)	1
  (99, 214)	1
  (99, 215)	1
  (99, 216)	1
  (99, 217)	1
  (99, 218)	1
  (99, 219)	1
  (99, 220)	1 
 ['10' '100' '1000' ... '997' '998' '999']


In [11]:
encoded, feature_names = encode(text_data, method='bow')
print(encoded, '\n', feature_names)

  (0, 44)	1
  (0, 636)	1
  (0, 134)	1
  (0, 816)	1
  (0, 752)	1
  (0, 398)	1
  (1, 752)	1
  (1, 560)	1
  (1, 460)	1
  (1, 503)	1
  (1, 828)	1
  (2, 816)	4
  (2, 503)	1
  (2, 53)	1
  (2, 726)	1
  (2, 217)	1
  (2, 383)	4
  (2, 151)	1
  (2, 453)	1
  (2, 930)	4
  (2, 640)	1
  (2, 589)	5
  (2, 528)	1
  (2, 411)	1
  (2, 209)	1
  :	:
  (99, 545)	1
  (99, 152)	1
  (99, 233)	1
  (99, 315)	1
  (99, 764)	1
  (99, 238)	1
  (99, 337)	1
  (99, 525)	1
  (99, 17)	1
  (99, 593)	1
  (99, 547)	1
  (99, 718)	1
  (99, 522)	1
  (99, 859)	1
  (99, 400)	1
  (99, 471)	1
  (99, 855)	1
  (99, 702)	1
  (99, 479)	1
  (99, 286)	1
  (99, 473)	1
  (99, 848)	1
  (99, 359)	1
  (99, 626)	1
  (99, 566)	1 
 ['10' '15' '16th' '1940' '34' '480p' '62' 'abandoned' 'able' 'about'
 'absolutely' 'absurd' 'abusive' 'accent' 'accurate' 'act' 'acted'
 'acting' 'action' 'actions' 'actors' 'actually' 'add' 'addiction' 'after'
 'again' 'against' 'age' 'ahead' 'air' 'aired' 'airs' 'airwolf' 'alittle'
 'all' 'alls' 'almost' 'alot' 'also

In [12]:
encoded, feature_names = encode(text_data, method='tfidf')
print(encoded, '\n', feature_names)

  (0, 398)	0.5075127547381553
  (0, 752)	0.2753783597774882
  (0, 816)	0.1696009788135251
  (0, 134)	0.43604121049961136
  (0, 636)	0.5075127547381553
  (0, 44)	0.43604121049961136
  (1, 828)	0.2418939435569401
  (1, 503)	0.41289632458447334
  (1, 460)	0.6990765545816118
  (1, 560)	0.3720363324057197
  (1, 752)	0.3793216094813328
  (2, 173)	0.10864539579717775
  (2, 769)	0.10864539579717775
  (2, 38)	0.08439512892755482
  (2, 437)	0.04886913009321406
  (2, 919)	0.08841961186293022
  (2, 293)	0.09334518089708448
  (2, 419)	0.10864539579717775
  (2, 370)	0.07804496599699122
  (2, 19)	0.10864539579717775
  (2, 47)	0.10864539579717775
  (2, 603)	0.0809924770889551
  (2, 902)	0.06014486205793193
  (2, 620)	0.08841961186293022
  (2, 485)	0.10864539579717775
  :	:
  (99, 17)	0.12272956973430107
  (99, 525)	0.1130089632586866
  (99, 337)	0.11026325993100881
  (99, 238)	0.1266331810468259
  (99, 764)	0.11597454877436761
  (99, 315)	0.08328199441122482
  (99, 233)	0.13099704722137054
  (99, 152)

In [13]:
encoded, feature_names = encode(processed_tokens, method='tfidf')
print(encoded, '\n', feature_names)

  (1, 111)	0.9425837889433633
  (1, 0)	0.3339697603394254
  (2, 1024)	0.10886151347263386
  (2, 1013)	0.10886151347263386
  (2, 1002)	0.08859549636167348
  (2, 991)	0.04896634068110703
  (2, 980)	0.09353086334927205
  (2, 969)	0.09353086334927205
  (2, 958)	0.10886151347263386
  (2, 947)	0.07820021322591024
  (2, 936)	0.10886151347263386
  (2, 925)	0.10886151347263386
  (2, 914)	0.08115358751383288
  (2, 903)	0.10886151347263386
  (2, 892)	0.10886151347263386
  (2, 881)	0.09353086334927205
  (2, 870)	0.09989365803879104
  (2, 859)	0.10886151347263386
  (2, 848)	0.07326484623831167
  (2, 837)	0.10886151347263386
  (2, 826)	0.10886151347263386
  (2, 815)	0.08200799432985624
  (2, 804)	0.06582293739047107
  (2, 793)	0.09989365803879104
  (2, 782)	0.0755951524815864
  :	:
  (99, 929)	0.13308951797479568
  (99, 523)	0.1230754314483612
  (99, 504)	0.15721766674167728
  (99, 428)	0.10359545178128121
  (99, 396)	0.12772360054816284
  (99, 344)	0.10896136920791404
  (99, 323)	0.0794673030143995

In [14]:
text_data = df['text'].iloc[:5000]

model = encode(text_data, method='word2vec')
print(f"Encoding for wonder:\n", model.wv['wonder'])
similar_words = model.wv.most_similar('wonder', topn=5)
print(f"Similar words to wonder:\n", similar_words)

Encoding for wonder:
 [-0.2682781   0.05656989 -0.24797943 -0.37317818  0.2778695  -0.46968704
  0.05414895  0.3541814   0.04163877 -0.39789274 -0.48068455 -0.33329538
 -0.10688213  0.1490041   0.23736641 -0.04677843 -0.1049867  -0.15110363
 -0.63354564 -0.5101781  -0.21992087  0.16236828 -0.30687767 -0.73253125
 -0.1027553   0.18832017 -0.15864213 -0.43566254 -0.27945444  0.13407333
  0.44579896  0.07903901 -0.12663959  0.12217898  0.19222319  0.7648557
  0.05564063 -0.11666182  0.24248631 -0.15704098 -0.03921103 -0.17637302
 -0.1900275  -0.25876853  0.7257035  -0.10659317 -0.42954487  0.1401092
  0.43662208 -0.32747096  0.25003207 -0.04918035  0.2782437   0.27853683
  0.1721012   0.36672452  0.27644622  0.06565977  0.27892622  0.5040151
  0.24384251 -0.06523824 -0.0947395   0.03707081 -0.33635426 -0.14368303
 -0.13097407 -0.19247366 -0.30029452  0.5511172  -0.51381344 -0.22787963
  0.16376664  0.18174072  0.37627915  0.03374196 -0.06209064  0.08206993
 -0.25402302  0.40802222 -0.0364

In [15]:
vocabulary, processed_tokens = preprocess(text_data)

model = encode(processed_tokens, method='word2vec')


def create_reverse_vocab(vocabulary):
    return {idx: word for word, idx in vocabulary.items()}

rev_vocab = create_reverse_vocab(vocabulary)

index_to_check = 748
word_to_check = rev_vocab[index_to_check]

print(f"Encoding for {word_to_check}:\n", model.wv[str(index_to_check)])
similar_indices = model.wv.most_similar(str(index_to_check), topn=5)
similar_words = [(rev_vocab[int(idx)], similarity) for idx, similarity in similar_indices]
print(f"Similar words to {word_to_check}:\n", similar_words)

Encoding for wonder:
 [ 0.16086692  0.30598474 -0.17497046 -0.20681019  0.2925442  -0.64241797
  0.44970492  0.6764204  -0.16334963 -0.52434754 -0.30554605 -0.5019915
  0.34917587  0.02098608  0.28995657 -0.31806248  0.11464727 -0.43861538
 -0.04815066 -0.7191057   0.17865695  0.00530987 -0.13059686 -0.31731272
 -0.03845894 -0.07759701 -0.334159   -0.3255919  -0.10026959  0.08671506
 -0.02669155  0.21938714 -0.10444628 -0.02559016 -0.43327513  0.59971434
 -0.1710534  -0.27486742 -0.04487635 -0.60378796  0.4245313  -0.16068615
  0.35787144  0.16793169  0.24154028 -0.0855519  -0.2070801  -0.270541
 -0.0449572   0.17332053  0.14195317 -0.24066351 -0.24884634 -0.12448142
 -0.12402733 -0.06964699  0.2039691   0.34167814 -0.04012671  0.1758891
  0.14964083 -0.04900362  0.07123719 -0.04504792 -0.08571136  0.44717497
  0.11712946  0.04788008 -0.5379231   0.24338299 -0.43478352  0.4496781
  0.27436784 -0.16529989  0.23626308  0.01201144  0.01900251 -0.00990616
 -0.35267198  0.05730054 -0.173125