In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
stopwords = set(stopwords.words('english'))

In [4]:
df=pd.read_csv('EcomReviews_8k.csv')

In [5]:
df.columns

Index(['labels', 'text'], dtype='object')

In [6]:
df.sample(1)

Unnamed: 0,labels,text
5566,__label__2,"Wow.: Unlike most albums, this CD does not hav..."


In [7]:
df['labels'].value_counts()

labels
__label__1    4002
__label__2    3998
Name: count, dtype: int64

In [8]:
df.isnull().sum()

labels    0
text      0
dtype: int64

In [36]:
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    n=3
    sep=' '
    text = text.lower() # Lower case
    tokens = re.findall(r'\w+', text) # Extract tokens using regex
#     tokens =  word_tokenize(text) # Extract tokens using nltk
#     tokens = [ lemmatizer.lemmatize(word) for word in tokens] # Lammatization
#     tokens = [stemmer.stem(word) for word in tokens] # Stemming
    tokens = [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)]) if len([t for t in ngram if t in stopwords])==0]   
    return tokens

In [37]:
df['tokens'] = df['text'].apply(lambda x: preprocess(x))

In [38]:
# Clean data using the built in cleaner in gensim
df['text_clean'] = df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
df.head()

Unnamed: 0,labels,text,tokens,text_clean
0,1,Rose Dawson Malaysia: All must buy the album o...,"[rose dawson malaysia, 566 times believe]","[rose, dawson, malaysia, all, must, buy, the, ..."
1,1,a precious resource for Christian contemplatio...,"[love without soft, without soft pedalling, bu...","[precious, resource, for, christian, contempla..."
2,0,"Rename it to ""RedSox, Yankees and Giants"": I'v...","[always trusted ken, trusted ken burns, good f...","[rename, it, to, redsox, yankees, and, giants,..."
3,0,Boring!: This game is based on a great concept...,"[control options pass, roller coaster tycoon, ...","[boring, this, game, is, based, on, great, con..."
4,0,Lost in the 70's: The Eurovision song contest ...,"[eurovision song contest, song everyone knows,...","[lost, in, the, the, eurovision, song, contest..."


In [12]:
# Encoding the label column
df['labels']=df['labels'].replace({'__label__2':1,'__label__1':0})
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'], df['labels'] , test_size=0.2)

In [13]:
df['labels']

0       1
1       1
2       0
3       0
4       0
       ..
7995    1
7996    0
7997    1
7998    0
7999    1
Name: labels, Length: 8000, dtype: int64

In [39]:
# Train the word2vec model from scratch
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2,sg=0)#cbow

In [58]:
len(w2v_model.wv.index_to_key)
## Accessing the index_to_key attribute

13509

In [37]:
# w2v_model.wv['light']

In [40]:
words = set(w2v_model.wv.index_to_key )
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming X_train and X_test are lists of sequences of words

# Convert words to word vectors and pad sequences
X_train_vect = pad_sequences([[w2v_model.wv[i] for i in ls if i in words] for ls in X_train], dtype='float32', padding='post')
X_test_vect = pad_sequences([[w2v_model.wv[i] for i in ls if i in words] for ls in X_test], dtype='float32', padding='post')


In [42]:
X_test_vect[0].shape

(184, 100)

In [19]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

61 192
149 192
29 192
98 192
64 192
68 192
107 192
29 192
95 192
37 192
123 192
36 192
128 192
64 192
53 192
92 192
123 192
43 192
68 192
19 192
18 192
87 192
96 192
49 192
72 192
48 192
88 192
51 192
121 192
47 192
37 192
27 192
75 192
122 192
70 192
37 192
65 192
46 192
57 192
104 192
174 192
37 192
168 192
50 192
39 192
100 192
62 192
31 192
28 192
159 192
75 192
106 192
54 192
21 192
39 192
130 192
75 192
110 192
66 192
155 192
89 192
143 192
31 192
72 192
105 192
153 192
24 192
52 192
155 192
63 192
112 192
24 192
67 192
42 192
133 192
66 192
75 192
38 192
53 192
33 192
42 192
94 192
24 192
29 192
51 192
33 192
47 192
75 192
78 192
119 192
148 192
22 192
16 192
50 192
121 192
137 192
36 192
36 192
84 192
35 192
52 192
28 192
89 192
94 192
69 192
26 192
27 192
37 192
117 192
96 192
20 192
27 192
136 192
81 192
53 192
76 192
42 192
111 192
103 192
41 192
30 192
26 192
55 192
30 192
144 192
65 192
74 192
146 192
145 192
68 192
95 192
37 192
142 192
43 192
42 192
61 192
28 192
23 192


If you're using word embeddings like Word2Vec or GloVe, each word in the sentence is typically represented by a fixed-length vector. When these vectors are combined to represent a sentence, they might be aggregated in various ways (e.g., averaging, summing, or concatenating), resulting in a single vector representation for the entire sentence. Consequently, the length of the sentence vector will not be the same as the number of words in the sentence.

In [22]:
for i, (sentence, vector) in enumerate(zip(X_train, X_train_vect)):
    print("Original Sentence:", X_train.iloc[i])
    print("Sentence Vector:", vector)
    print("Lengths:", len(X_train.iloc[i]), len(vector))
#relationship between the original sentences and their vector representations.

Original Sentence: ['boring', 'movie', 'what', 'disappointment', 'this', 'movie', 'was', 'it', 'had', 'really', 'good', 'actors', 'but', 'it', 'was', 'slow', 'uninteresting', 'and', 'just', 'waste', 'of', 'time']
Sentence Vector: [[ 1.82722986e-01  6.70467496e-01 -3.48889480e-05 ... -2.33811557e-01
   1.24643810e-01  6.56836480e-02]
 [ 2.73288488e-01  1.24225736e+00  4.63161230e-01 ... -6.33891761e-01
   1.76727027e-01 -2.16413200e-01]
 [-5.31239688e-01  1.72056055e+00 -3.42630707e-02 ... -5.69946527e-01
   3.03166807e-01 -1.12148261e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
Lengths: 22 192
Original Sentence: ['pretty', 'dog', 'gone', 'good', 'dennis', 'lehane', 'has', 'another', 'good', 'one', 'in', 'scare

Sentence Vector: [[-0.04301374 -0.01508791  0.19039413 ... -0.8777833   0.39390951
   0.38942823]
 [-1.04545164  0.92503554  0.1600565  ... -1.45900106  0.3191568
  -0.32224166]
 [ 1.01599658  0.26523092  0.35013568 ... -0.30728203 -0.31977603
   0.60800707]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
Lengths: 72 192
Original Sentence: ['not', 'like', 'it', 'used', 'to', 'be', 'boy', 'nestle', 'chocolate', 'is', 'not', 'at', 'all', 'as', 'it', 'used', 'to', 'be', 'used', 'to', 'love', 'it', 'you', 'can', 'find', 'it', 'in', 'store', 'and', 'now', 'know', 'why', 'it', 'awful', 'hadn', 'seen', 'it', 'for', 'so', 'long', 'then', 'saw', 'it', 'on', 'line', 'ate', 'few', 'the', 'last', 'one', 'couldn', 'even', 'finish', 'it', 'was', 'so', 'awful', 'to', 'me', 'gave', 'my', 'friend', 'the', 'box', 'told', 'her', '

Sentence Vector: [[ 7.36526623e-02  4.34450537e-01  1.64103642e-01 ... -2.49380767e-01
   1.01230249e-01  1.28858462e-01]
 [ 2.61411513e-03  1.28805032e-02  8.61986820e-03 ...  3.53197800e-04
  -6.63644169e-04  4.71613184e-03]
 [-1.04545164e+00  9.25035536e-01  1.60056502e-01 ... -1.45900106e+00
   3.19156796e-01 -3.22241664e-01]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
Lengths: 24 192
Original Sentence: ['watch', 'it', 'often', 'have', 'brothers', 'they', 'need', 'this', 'movie', 'also', 'have', 'foster', 'brothers', 'has', 'childrenand', 'used', 'it', 'for', 'visual', 'reality', 'check', 'on', 'his', 'behaviorto', 'improve', 'hisself', 'and', 'heal', 'his', 'heart', 'for', 'godits', 'fanulous']
Sentence Vecto

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [44]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

##### computes sentence vectors by averaging the word vectors for the words contained in each sentence. 
- X_train_vect_avg and X_test_vect_avg are initialized as empty lists to store the averaged sentence vectors for the training and test datasets, respectively.
- The code iterates over each sentence vector in X_train_vect and X_test_vect. For each sentence vector v, it checks if v is not empty (i.e., it contains at least one word vector). If v is not empty, it computes the mean (average) of the word vectors along the first axis (axis 0), which corresponds to averaging the word vectors for each dimension. The resulting mean vector represents the averaged sentence vector for the current sentence. If v is empty (i.e., all word vectors are zero vectors), it appends a zero vector of the same dimensionality (100 in this case) to the list as a placeholder.
- After iterating over all sentence vectors, X_train_vect_avg and X_test_vect_avg contain the averaged sentence vectors for the training and test datasets, respectively.
- This approach of averaging word vectors to obtain sentence vectors is a common technique in natural language processing (NLP) tasks. It allows you to capture the overall semantic meaning of a sentence based on the meanings of its constituent words.

- The np.zeros(100, dtype=float) part in the code creates a zero vector of length 100, which matches the dimensionality of the word vectors. This zero vector is used as a placeholder for sentences with no words or out-of-vocabulary words, ensuring that all sentence vectors have the same dimensionality.

In [45]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

112 100
91 100
71 100
53 100
73 100
45 100
79 100
62 100
97 100
52 100
19 100
35 100
93 100
33 100
53 100
24 100
74 100
38 100
20 100
150 100
78 100
31 100
55 100
22 100
26 100
63 100
56 100
70 100
35 100
97 100
98 100
39 100
93 100
85 100
94 100
55 100
133 100
23 100
140 100
171 100
39 100
90 100
106 100
68 100
26 100
113 100
103 100
99 100
94 100
54 100
115 100
52 100
83 100
78 100
101 100
112 100
27 100
119 100
95 100
64 100
97 100
70 100
35 100
23 100
106 100
25 100
102 100
119 100
130 100
45 100
40 100
51 100
49 100
61 100
132 100
95 100
72 100
113 100
46 100
37 100
45 100
27 100
30 100
150 100
141 100
56 100
45 100
119 100
119 100
117 100
79 100
123 100
48 100
29 100
47 100
132 100
42 100
37 100
172 100
77 100
102 100
38 100
81 100
45 100
26 100
151 100
62 100
75 100
96 100
44 100
153 100
21 100
103 100
154 100
33 100
107 100
53 100
47 100
21 100
43 100
161 100
170 100
160 100
100 100
102 100
67 100
29 100
112 100
101 100
58 100
75 100
65 100
55 100
45 100
108 100
87 100
100 100


In [26]:
for i, (sentence, vector) in enumerate(zip(X_train, X_train_vect_avg)):
    print("Original Sentence Length:", len(sentence))
    print("Averaged Sentence Vector Length:", len(vector))


Original Sentence Length: 79
Averaged Sentence Vector Length: 100
Original Sentence Length: 95
Averaged Sentence Vector Length: 100
Original Sentence Length: 140
Averaged Sentence Vector Length: 100
Original Sentence Length: 102
Averaged Sentence Vector Length: 100
Original Sentence Length: 31
Averaged Sentence Vector Length: 100
Original Sentence Length: 33
Averaged Sentence Vector Length: 100
Original Sentence Length: 37
Averaged Sentence Vector Length: 100
Original Sentence Length: 107
Averaged Sentence Vector Length: 100
Original Sentence Length: 47
Averaged Sentence Vector Length: 100
Original Sentence Length: 43
Averaged Sentence Vector Length: 100
Original Sentence Length: 63
Averaged Sentence Vector Length: 100
Original Sentence Length: 89
Averaged Sentence Vector Length: 100
Original Sentence Length: 58
Averaged Sentence Vector Length: 100
Original Sentence Length: 49
Averaged Sentence Vector Length: 100
Original Sentence Length: 55
Averaged Sentence Vector Length: 100
Origina

In [46]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(max_iter=400)                                                 
regressor = regressor.fit(X_train_vect_avg, y_train.values)

In [28]:
y_train.values

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [47]:
# Use the trained model to make predictions on the test data
y_pred = regressor.predict(X_test_vect_avg)

In [48]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.695 / Recall: 0.696 / Accuracy: 0.688


In [31]:
y_test

2141    0
5616    0
1978    1
3112    1
6530    0
       ..
4600    0
313     1
6400    1
4250    0
7429    1
Name: labels, Length: 1600, dtype: int64

In [52]:
w2v_model.wv.doesnt_match(['breakfast','cereal','dinner','lunch'])

'lunch'

In [55]:
w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('moving', 0.983867347240448),
 ('practically', 0.9784531593322754),
 ('stone', 0.9775813817977905),
 ('fashion', 0.9772758483886719),
 ('henry', 0.9767299890518188),
 ('filled', 0.9760727286338806),
 ('fish', 0.974877655506134),
 ('vol', 0.9747515320777893),
 ('blu', 0.9746339917182922),
 ('humour', 0.9744937419891357)]

### Glove

In [56]:
model = gensim.models.KeyedVectors.load_word2vec_format('glove.txt', binary=False,no_header=True)

In [57]:
model.get_vector('light')

array([ 0.0062958,  0.47249  , -0.073297 , -0.0060334,  0.36752  ,
       -0.22067  ,  0.47872  , -0.33874  ,  0.091716 ,  0.09293  ,
        0.40365  ,  0.030591 ,  0.29251  ,  0.30817  , -0.78066  ,
        0.32136  , -0.69529  ,  0.27155  , -1.5156   , -1.7707   ,
        0.35877  , -0.11012  ,  0.40589  , -0.7165   , -0.066692 ,
       -1.0795   , -0.67795  ,  1.0202   ,  1.0186   ,  0.29357  ,
        3.059    , -0.023482 , -0.10365  , -0.82797  ,  0.28177  ,
       -0.16825  ,  0.20761  , -0.085368 , -0.46009  ,  0.057375 ,
        0.33407  ,  0.23124  ,  0.054707 , -0.34894  ,  0.075528 ,
        0.53281  ,  0.22283  , -0.95259  , -0.028099 , -0.054563 ],
      dtype=float32)

In [20]:
len(model.index_to_key)

400000

In [59]:
words = set(model.index_to_key )
#For each sequence of words in X_train, the list comprehension [model[i] for i in ls if i in words] iterates through each word (i) in the sequence ls and retrieves its word vector using model[i].
X_train_vect = pad_sequences([[model[i] for i in ls if i in words] for ls in X_train], dtype='float32', padding='post')
X_test_vect = pad_sequences([[model[i] for i in ls if i in words] for ls in X_test], dtype='float32', padding='post')

In [60]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [61]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(max_iter=10000)                               
regressor = regressor.fit(X_train_vect_avg, y_train.values)

In [62]:
# Use the trained model to make predictions on the test data
y_pred = regressor.predict(X_test_vect_avg)

In [63]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.746 / Recall: 0.732 / Accuracy: 0.735


#### word2vec

In [65]:
file_path = "GoogleNews-vectors-negative300.bin"

# Load into gensim
w2vec = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)

In [66]:
words = set(w2vec.index_to_key )
#For each sequence of words in X_train, the list comprehension [model[i] for i in ls if i in words] iterates through each word (i) in the sequence ls and retrieves its word vector using model[i].
X_train_vect = pad_sequences([[w2vec[i] for i in ls if i in words] for ls in X_train], dtype='float32', padding='post')
X_test_vect = pad_sequences([[w2vec[i] for i in ls if i in words] for ls in X_test], dtype='float32', padding='post')

In [67]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [68]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(max_iter=10000)                               
regressor = regressor.fit(X_train_vect_avg, y_train.values)

In [69]:
# Use the trained model to make predictions on the test data
y_pred = regressor.predict(X_test_vect_avg)

In [70]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.804 / Recall: 0.793 / Accuracy: 0.795
