In [4]:
from gensim.models import word2vec
import nltk
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
def preprocessor(doc):
    doc = doc.lower().strip()
    try:
        doc = doc.replace('.', '. ')
        doc = doc.replace(',', ', ')
        doc = doc.replace('!', '! ')
    except:
        pass
    doc = re.sub(r"[^a-z\s]", "", doc)
    doc = [d for d in doc.split() if d not in stop_words]
    doc = ' '.join(doc)
    return doc

In [11]:
path = r'C:\Users\HP\Documents\sentiment_labelled_sentences'
websites = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']
df = pd.DataFrame()
for website in websites:
    website_df = pd.read_csv(f'{path}\{website}', delimiter='\t', header = None)
    df = pd.concat([df, website_df], axis=0)

In [13]:
stop_words = nltk.corpus.stopwords.words('english')

In [14]:
df.columns = ['Review', 'Sentiment']
df['cleaned_review'] = df['Review'].apply(preprocessor)

In [15]:
df

Unnamed: 0,Review,Sentiment,cleaned_review
0,So there is no way for me to plug it in here i...,0,way plug us unless go converter
1,"Good case, Excellent value.",1,good case excellent value
2,Great for the jawbone.,1,great jawbone
3,Tied to charger for conversations lasting more...,0,tied charger conversations lasting minutes maj...
4,The mic is great.,1,mic great
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor texture lacking
996,Appetite instantly gone.,0,appetite instantly gone
997,Overall I was not impressed and would not go b...,0,overall impressed would go back
998,"The whole experience was underwhelming, and I ...",0,whole experience underwhelming think well go n...


In [39]:
sentences = [ review.split() for review in df['cleaned_review']]
sentences

[['way', 'plug', 'us', 'unless', 'go', 'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'jawbone'],
 ['tied',
  'charger',
  'conversations',
  'lasting',
  'minutes',
  'major',
  'problems'],
 ['mic', 'great'],
 ['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume'],
 ['several',
  'dozen',
  'several',
  'hundred',
  'contacts',
  'imagine',
  'fun',
  'sending',
  'one',
  'one'],
 ['razr', 'owner', 'must'],
 ['needless', 'say', 'wasted', 'money'],
 ['waste', 'money', 'time'],
 ['sound', 'quality', 'great'],
 ['impressed', 'going', 'original', 'battery', 'extended', 'battery'],
 ['two',
  'seperated',
  'mere',
  'ft',
  'started',
  'notice',
  'excessive',
  'static',
  'garbled',
  'sound',
  'headset'],
 ['good', 'quality', 'though'],
 ['design', 'odd', 'ear', 'clip', 'comfortable'],
 ['highly', 'recommend', 'one', 'blue', 'tooth', 'phone'],
 ['advise', 'everyone', 'fooled'],
 ['far', 'good'],
 ['works', 'great'],
 ['clicks',
  'place',
  'way',


In [40]:
word_vec = word2vec.Word2Vec(sentences, epochs=10, sg=1, vector_size=100, min_count=1, window=5, workers=1)

In [41]:
words = word_vec.wv.index_to_key

In [42]:
word_vectors = word_vec.wv.vectors

In [43]:
word_vec.wv['good']

array([-0.1787139 ,  0.3056883 , -0.04254503, -0.07712426, -0.01111287,
       -0.38944712,  0.05141916,  0.34297985, -0.10486433, -0.05746172,
       -0.10857052, -0.33200043, -0.16236775, -0.00448773,  0.0596169 ,
       -0.27017242,  0.10576506, -0.19540456,  0.021986  , -0.39688575,
        0.14277123,  0.06689044,  0.2781318 , -0.06398668, -0.10474209,
       -0.07686445, -0.09879038, -0.2059773 , -0.18512084, -0.11451481,
        0.25658348,  0.00189977,  0.1015505 , -0.24428739, -0.02582659,
        0.21442822, -0.00852771, -0.23596604, -0.1708101 , -0.31888554,
       -0.00376306, -0.11855454,  0.0277666 ,  0.04667661,  0.11895036,
       -0.28573948, -0.27838933, -0.01558106,  0.12706248,  0.15734819,
        0.16904913, -0.11107089, -0.05790695, -0.04600906, -0.10796509,
        0.10193103,  0.25038445, -0.06972845, -0.12999079,  0.08578619,
        0.18825158,  0.0494577 , -0.1255798 ,  0.02284877, -0.14551763,
        0.1372221 ,  0.04829791,  0.22278912, -0.33159915,  0.35

In [44]:
vocab_size, vec_size = word_vectors.shape

In [45]:
word_vec.wv.most_similar('good')

[('terrible', 0.9982855319976807),
 ('amazing', 0.9980572462081909),
 ('thought', 0.9980465173721313),
 ('everything', 0.997978687286377),
 ('kind', 0.9979666471481323),
 ('felt', 0.9979220032691956),
 ('since', 0.9979216456413269),
 ('horrible', 0.9978972673416138),
 ('find', 0.9978780150413513),
 ('thats', 0.9978727698326111)]

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X = df['cleaned_review']
y = df['Sentiment']

In [56]:
X_train, X_test, y_train, y_text = train_test_split(X, y, random_state=1, test_size=0.2)

In [49]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [50]:
tokenizer.word_index

{'good': 1,
 'great': 2,
 'movie': 3,
 'phone': 4,
 'film': 5,
 'one': 6,
 'like': 7,
 'food': 8,
 'time': 9,
 'place': 10,
 'really': 11,
 'service': 12,
 'bad': 13,
 'well': 14,
 'best': 15,
 'would': 16,
 'even': 17,
 'also': 18,
 'dont': 19,
 'ever': 20,
 'back': 21,
 'ive': 22,
 'quality': 23,
 'love': 24,
 'get': 25,
 'made': 26,
 'go': 27,
 'work': 28,
 'im': 29,
 'better': 30,
 'use': 31,
 'never': 32,
 'product': 33,
 'could': 34,
 'nice': 35,
 'recommend': 36,
 'works': 37,
 'excellent': 38,
 'much': 39,
 'didnt': 40,
 'sound': 41,
 'headset': 42,
 'battery': 43,
 'think': 44,
 'make': 45,
 'pretty': 46,
 'first': 47,
 'way': 48,
 'acting': 49,
 'still': 50,
 'see': 51,
 'got': 52,
 'worst': 53,
 'little': 54,
 'going': 55,
 'minutes': 56,
 'characters': 57,
 'case': 58,
 'disappointed': 59,
 'waste': 60,
 'every': 61,
 'money': 62,
 'ear': 63,
 'say': 64,
 'enough': 65,
 'people': 66,
 'right': 67,
 'definitely': 68,
 'price': 69,
 'look': 70,
 'thing': 71,
 'amazing': 72,
 

In [57]:
X_train = tokenizer.texts_to_sequences(X_train)


X_train

[[624, 1787, 61, 326, 811, 812, 207, 42],
 [123, 3, 136, 1788],
 [11, 1, 33],
 [1789, 25, 2, 292, 54, 625, 231, 2],
 [12, 117, 369, 77],
 [813, 24, 432],
 [433, 7, 1127],
 [626, 1790, 62, 1791, 107],
 [627, 63, 1128, 1129, 15],
 [187],
 [1792, 43, 522, 1793, 232, 628],
 [1130, 77],
 [1131, 327, 233, 434],
 [42, 1794, 1795, 110, 169],
 [208, 1132, 124, 118, 54, 1796, 80, 1797, 1798],
 [2, 629, 1799, 1800, 814, 435, 293, 170, 31, 1801, 1802, 814, 1803, 815],
 [294, 816, 1, 209, 1804, 1133, 1134],
 [125],
 [171, 44, 1135, 523, 1805, 1136, 56],
 [17, 1137, 630, 111, 1806, 628, 524, 1138, 1807, 1139, 234, 1808],
 [40, 45, 817, 1809, 45, 1810, 631, 57, 235, 436],
 [1811, 1812, 818],
 [295, 209, 210],
 [81, 632, 3],
 [328, 211, 1813, 1814, 525],
 [633, 296, 49, 634, 1815, 1816],
 [44, 5, 7, 1817],
 [4, 1140, 297],
 [81, 28, 67, 1818],
 [73, 1141, 1142, 153, 1819],
 [1820, 1821, 635, 100, 1822, 1823, 1824],
 [298, 329, 153, 1825, 1143, 50, 1826],
 [636, 370],
 [78, 526],
 [68, 188, 21],
 [371,

In [58]:
X_test = tokenizer.texts_to_sequences(X_test)

In [59]:
X_train = pad_sequences(X_train, maxlen=vec_size, padding='post')
X_test = pad_sequences(X_test, maxlen=vec_size, padding='post')

In [60]:
X_train

array([[ 624, 1787,   61, ...,    0,    0,    0],
       [ 123,    3,  136, ...,    0,    0,    0],
       [  11,    1,   33, ...,    0,    0,    0],
       ...,
       [4511, 4512,   56, ...,    0,    0,    0],
       [   6,  621, 4514, ...,    0,    0,    0],
       [ 130,    5,   11, ...,    0,    0,    0]])

In [62]:
token_size = len(tokenizer.word_index) + 1

In [63]:
token_size

4515

In [66]:
embedding_matrix = np.zeros((token_size, vec_size))
for word, index in tokenizer.word_index.items():
    if word in word_vec.wv:
        embedding_matrix[index] = word_vec.wv[word]

In [67]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1787139 ,  0.30568829, -0.04254503, ..., -0.29198012,
         0.07435001,  0.04514116],
       [-0.19080791,  0.2985602 , -0.0466175 , ..., -0.27923334,
         0.05803493,  0.03060061],
       ...,
       [-0.03271025,  0.04319309, -0.00036801, ..., -0.03965411,
         0.006022  ,  0.001802  ],
       [-0.01484915,  0.03210239, -0.01508052, ..., -0.03230778,
         0.01149274, -0.00370731],
       [-0.0189266 ,  0.02740377, -0.00611615, ..., -0.02279465,
         0.01535584, -0.00508766]])