In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bz2



In [3]:
# Get text and labels from raw file

def get_data(filepath):
    reviews = []
    text = []
    label = []
    with bz2.BZ2File(filepath,"r") as f:
        for line in f:
            x = line.decode()
            x = x.partition("_label__")
            reviews.append(x[2:])
        pass
    f.close()
    for review in reviews:
        label.append(review[0].split()[0])
        text.append(' '.join(review[0].split()[1:]))
        
    return text,label

In [4]:
# Create dataframe

def create_dataframe(filepath):
    text,label = get_data(filepath)
    data = {"text":text,"label":label}
    df = pd.DataFrame(data)
    print("Number of records:",len(df))
    df.dropna(inplace=True) # remove null records
    df.reset_index(drop=True,inplace=True)
    df["label"] = df["label"].astype(int)
    print("Number of records after removing nulls:",len(df))
    
    return df

                                                text  label
0  Stuning even for the non-gamer: This sound tra...      1
1  The best soundtrack ever to anything.: I'm rea...      1
2  Amazing!: This soundtrack is my favorite music...      1
3  Excellent Soundtrack: I truly like this soundt...      1
4  Remember, Pull Your Jaw Off The Floor After He...      1


# Create train dataframe

In [64]:
# Load train data

df = create_dataframe("/kaggle/input/amazonreviews/train.ft.txt.bz2")

# Change labels to 0 and 1
df['label'] = df['label'].apply(lambda x: 0 if x==1 else 1)

df.head()

Number of records: 3600000
Number of records after removing nulls: 3600000


Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [65]:
df['label'].value_counts()

1    1800000
0    1800000
Name: label, dtype: int64

In [66]:
class_0_data = df[df['label'] == 0]
class_1_data = df[df['label'] == 1]

sampled_class_0 = class_0_data.sample(n=50000, random_state=42)
sampled_class_1 = class_1_data.sample(n=50000, random_state=42)

extracted_df = pd.concat([sampled_class_0, sampled_class_1], ignore_index=True)

extracted_df = extracted_df.sample(frac=1, random_state=42)

In [67]:
extracted_df['label'].value_counts()

1    50000
0    50000
Name: label, dtype: int64

In [19]:
import tensorflow as tf

In [5]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, SpatialDropout1D

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [70]:
# Dataset Preprocessing

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


def clean_text(text):
    text=text.lower()
#     text=text.apply(lambda x: re.sub(r'[0-9]+','',x))
#     text=text.apply(lambda x: re.sub(r'@mention',' ',x))
#     text=text.apply(lambda x: re.sub(r'https?:\/\/\S+', ' ',x))
#     text=text.apply(lambda x: re.sub(r"www.\[a-z]?\.?(com)+|[a-z]+\.(com)", ' ',x))
#     text=text.apply(lambda x: re.sub(r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\];='#]",'',x))

    text=re.sub(r'[0-9]+','',text)
    text=re.sub(r'@mention',' ',text)
    text=re.sub(r'https?:\/\/\S+', ' ',text)
    text=re.sub(r"www.\[a-z]?\.?(com)+|[a-z]+\.(com)", ' ',text)
    text=re.sub(r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\]\?;=+'#]",'',text)
    
    # tokenize the sentence
    words = nltk.word_tokenize(text)
    
    ps = PorterStemmer()
    
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
    
    text = ' '.join(words)
    
    
    return text

In [71]:
extracted_df.head()

Unnamed: 0,text,label
75721,Leaves you breathless: This is one of the best...,1
80184,Worked perfectly right out of the box: The Key...,1
19864,Sizes too small: I returned the pair I bought ...,0
76699,lovely: i have always been a fan of Lynne Grah...,1
92991,wonderful breakfast: Fast and simple to use if...,1


In [72]:
extracted_df['text'] = extracted_df['text'].apply(clean_text)

# for i, row in extracted_df.iterrows():
#     extracted_df.at[i, 'text'] = clean_text(row['text'])

In [75]:
extracted_df.head()                          

Unnamed: 0,text,label
75721,leav breathless one best singersongwrit cd ive...,1
80184,work perfectli right box keyspan usb serial ad...,1
19864,size small return pair bought size run way sma...,0
76699,love alway fan lynn graham stori fail meset me...,1
92991,wonder breakfast fast simpl use arent use brea...,1


In [78]:
# text tokenization 

tokenizer = Tokenizer()
tokenizer.fit_on_texts(extracted_df['text'])
sequences = tokenizer.texts_to_sequences(extracted_df['text'])

In [80]:
# Vocabulary size (number of unique words in the corpus) basically total number of unique words present in the data
vocab_size = len(tokenizer.word_index) + 1

In [81]:
# Preprocessing for generating padding sequences

max_sent_length = max(len(words) for words in sequences)
sequences_padded = pad_sequences(sequences, padding='post', maxlen=max_sent_length)

In [84]:
Y = np.array(extracted_df['label'])

In [101]:
# splitting the data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(sequences_padded, Y, test_size=0.10, random_state=42)

In [95]:
import tensorflow_hub as hub
word2vec_model = hub.load("https://tfhub.dev/google/word2vec/2")



embedding_matrix = np.zeros((vocab_size, embedding_feature_vectors))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

In [102]:
# model creation


embedding_feature_vectors = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_feature_vectors, input_length=max_sent_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=128, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
model.add(Dense(1, activation='softmax'))

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 161, 100)          14307700  
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 161, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_7 (Dense)             (None, 128)               12928     
                                                                 
 dense_8 (Dense)             (None, 1)                 129       
                                                                 
Total params: 14,401,157
Trainable params: 14,401,157
Non-trainable params: 0
__________________________________________

In [103]:
model.fit(X_train, Y_train, epochs=5, batch_size=64)

Epoch 1/5
  23/1407 [..............................] - ETA: 10:09 - loss: 0.6931 - accuracy: 0.4898

KeyboardInterrupt: 

# Earlier approach is giving me around 50% accuracy which is almost like I can predict it. 

## Therefore, now I am switching to a different approach with a much much smaller dataset size. ( Earlier I was using 1 lakh rows and I am going to use 36000 rows) 

In [6]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    labels = labels[:int(len(labels)*0.01)]
    texts = texts[:int(len(texts)*0.01)]
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('../input/amazonreviews/train.ft.txt.bz2')

In [7]:
#create dataframe

new_df=pd.DataFrame(zip(train_texts,train_labels),columns=['text','label'])
print(new_df.head())

                                                text  label
0  Stuning even for the non-gamer: This sound tra...      1
1  The best soundtrack ever to anything.: I'm rea...      1
2  Amazing!: This soundtrack is my favorite music...      1
3  Excellent Soundtrack: I truly like this soundt...      1
4  Remember, Pull Your Jaw Off The Floor After He...      1


In [8]:
# Dataset Preprocessing

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


for i in range(0, len(new_df)):
    text=re.sub(r'[0-9]+','',new_df['text'][i])
    text=re.sub(r'@mention',' ',new_df['text'][i])
    text=re.sub(r'https?:\/\/\S+', ' ',new_df['text'][i])
    text=re.sub(r"www.\[a-z]?\.?(com)+|[a-z]+\.(com)", ' ',new_df['text'][i])
    text=re.sub(r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\];='#]",'',new_df['text'][i])
    
    text = text.lower()
    
    # tokenize the sentence
    words = nltk.word_tokenize(text)
    
#     ps = PorterStemmer()
    
    words = [word for word in words if word not in stopwords.words('english')]
    
    text = ' '.join(words)
    
    new_df['text'][i] = text


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['text'][i] = text


In [54]:
new_df['text']=new_clean_text(new_df['text'])

In [55]:
print(new_df.head())

                                                text  label
0  stuning even for the nongamer this sound track...      1
1  the best soundtrack ever to anything im readin...      1
2  amazing this soundtrack is my favorite music o...      1
3  excellent soundtrack i truly like this soundtr...      1
4  remember pull your jaw off the floor after hea...      1


In [9]:
vocab_size = 10000


tokenizer = Tokenizer(num_words=vocab_size,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(new_df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 89779 unique tokens.


In [22]:
# Vocabulary size (number of unique words in the corpus) basically total number of unique words present in the data
# vocab_size = len(tokenizer.word_index) + 1

In [10]:
sequences = tokenizer.texts_to_sequences(new_df['text'].values)

# Preprocessing for generating padding sequences

# max_sent_length = max(len(words) for words in sequences)
max_sent_length = 250
new_sequences_padded = pad_sequences(sequences, maxlen=max_sent_length)

In [11]:
Y = pd.get_dummies(new_df['label']).values

In [67]:
new_sequences_padded

array([[   0,    0,    0, ..., 2263,    5,  321],
       [   0,    0,    0, ...,  159,  147, 3101],
       [   0,    0,    0, ...,  168,    7,  246],
       ...,
       [   0,    0,    0, ...,  240,    1,  562],
       [   0,    0,    0, ..., 1171,    6, 7238],
       [   0,    0,    0, ...,    9,  178, 1059]], dtype=int32)

In [12]:
# splitting the data
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(new_sequences_padded, Y, test_size=0.10, random_state=42)

In [13]:
# creating the model

embedding_feature_vectors = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_feature_vectors, input_length=new_sequences_padded.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(2, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          1000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 128)               12928     
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                                 
Total params: 1,093,586
Trainable params: 1,093,586
Non-trainable params: 0
______________________________________________

In [14]:
model.fit(X_train, Y_train, epochs=5, batch_size=64,validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7d199eee5930>

In [15]:
Y_pred = model.predict(X_test)



In [16]:
print(Y_pred)

[[9.92957354e-01 7.04281731e-03]
 [1.15971523e-03 9.98840272e-01]
 [1.00867976e-04 9.99899089e-01]
 ...
 [2.02910975e-02 9.79708970e-01]
 [9.98487830e-01 1.51202874e-03]
 [8.17097843e-01 1.82902053e-01]]


In [22]:
predicted_classes = np.argmax(Y_pred, axis=1)
type(predicted_classes), type(Y_test)

(numpy.ndarray, numpy.ndarray)

In [24]:
Y_test_binary = np.argmax(Y_test, axis=1)

In [25]:
Y_test_binary

array([0, 1, 1, ..., 1, 1, 0])

In [26]:
from sklearn.metrics import classification_report

print(classification_report(Y_test_binary,predicted_classes))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      1738
           1       0.85      0.89      0.87      1862

    accuracy                           0.86      3600
   macro avg       0.86      0.86      0.86      3600
weighted avg       0.86      0.86      0.86      3600



In [30]:
import pickle

with open('review_semantic_analysis_model.pkl', 'wb') as file:
    pickle.dump(model, file)