In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense , Dropout
from tensorflow.keras.models import Model 
from keras.callbacks import Callback , EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,  mean_absolute_percentage_error


class EpochPrintCallback(Callback):
    """Training callback that prints a short notice each time an epoch ends."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the number of the epoch that just finished.

        Args:
            epoch: Epoch index supplied by the training loop; one is added
                before it is printed.
            logs: Accepted for interface compatibility; not used here.
        """
        message = f"Epoch {epoch+1} completed"
        print(message)

In [21]:
# Load the first 500,000 rows of the cleaned training CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
train_csv_path = "C:\\Users\\thesp\\Desktop\\Amazon ML\\Without Z outlier\\cleaned_train.csv"
cleaned_train_df = pd.read_csv(train_csv_path, nrows=500000)

# Preview the first rows as the cell's output.
cleaned_train_df.head()



Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TEXT
0,1925202,1650,2125.98,artzfolio tulip flowers blackout curtain door ...
1,2673191,2755,393.7,marks spencer girls pyjama sets t86_2561c_navy...
2,2765088,7537,748.031495,priknik horn red electric air horn compressor ...
3,1594019,2996,787.401574,alishah womens cotton ankle length leggings co...
4,283658,6112,598.424,the united empire loyalists a chronicle great ...


In [22]:
# Column the model is trained to predict.
target = 'PRODUCT_LENGTH'

# Columns carried through the split as candidate model inputs.
features = ['PRODUCT_ID','PRODUCT_TYPE_ID', 'TEXT']

# Two-stage split with a fixed random_state of 42 for both stages:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 20% of the remaining rows as the validation set.
feature_frame = cleaned_train_df[features]
target_series = cleaned_train_df[target]

X_train_val, X_test, y_train_val, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)


In [23]:
# Create a tokenizer and fit it on the TRAINING descriptions only.
#
# The TEXT columns are cast to str by rebinding each frame with .assign()
# rather than assigning a column in place: the frames come out of
# train_test_split, where in-place column assignment can raise pandas'
# SettingWithCopyWarning, and rebinding keeps this cell safe to re-run.
X_train = X_train.assign(TEXT=X_train['TEXT'].astype('str'))
X_val = X_val.assign(TEXT=X_val['TEXT'].astype('str'))

tokenizer = Tokenizer()
# The vocabulary is learned from X_train alone. Fitting on the validation text
# as well would leak held-out data into the vocabulary (and into the size of
# any embedding derived from tokenizer.word_index), which biases the
# validation loss used for model selection. Validation and test text are only
# ever *transformed* with this tokenizer; with no oov_token configured, words
# it has not seen are skipped by texts_to_sequences.
tokenizer.fit_on_texts(X_train['TEXT'].values)

In [24]:
# Convert the input descriptions to sequences
description_sequences = tokenizer.texts_to_sequences(X_train['TEXT'].values)
description_sequences2 = tokenizer.texts_to_sequences(X_val['TEXT'].values)

In [25]:
# Pad the input sequences to make them all the same length
max_length = 1000
padded_description_sequences = pad_sequences(description_sequences, maxlen=max_length, padding='post')
padded_description_sequences2 = pad_sequences(description_sequences2, maxlen=max_length, padding='post')

In [26]:
# Store arrays in variables
lengths = y_train.values
product_ids = X_train['PRODUCT_ID'].values
product_type_ids = X_train['PRODUCT_TYPE_ID'].values

lengths2 = y_val.values
product_ids2 = X_val['PRODUCT_ID'].values
product_type_ids2 = X_val['PRODUCT_TYPE_ID'].values

In [30]:
# Define the model: two inputs (padded token ids and the product type id)
# feeding a small dense regression head with a single linear output.
description_input = Input(shape=(max_length,))
product_type_id_input = Input(shape=(1,))

# `input_length` is intentionally not passed: it is deprecated in current
# Keras, and the sequence length is already fixed by description_input's
# shape, so Flatten below still receives a statically known shape.
# input_dim is vocabulary size + 1 (one more than the number of words in
# tokenizer.word_index).
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32)(description_input)
flatten_layer = Flatten()(embedding_layer)

# NOTE(review): the product type id enters the network as a raw numeric value;
# if it is a categorical code, an embedding or encoding may suit it better — confirm.
concat_layer = Concatenate()([flatten_layer,   product_type_id_input])
dropout_layer = Dropout(0.2)(concat_layer)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
dense_layer_2 = Dense(32, activation='relu')(dense_layer_1)
output_layer = Dense(1, activation='linear')(dense_layer_2)
model = Model(inputs=[description_input,   product_type_id_input], outputs=output_layer)

# Compile the model with the loss function and optimizer
model.compile(loss='mse', optimizer='adam')

# Stop once val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)

# Train the model on the input data.
# The product type ids are reshaped to (n_samples, 1) so they match the
# declared Input(shape=(1,)) explicitly, the same way the inference cell
# prepares them. The History object is kept in `history` so the per-epoch
# loss/val_loss values can be inspected after training instead of being lost.
history = model.fit(
    [padded_description_sequences, product_type_ids.reshape(-1, 1)],
    lengths,
    validation_data=(
        [padded_description_sequences2, product_type_ids2.reshape(-1, 1)],
        lengths2,
    ),
    epochs=100,
    batch_size=64,
    verbose=0,  # progress is reported by EpochPrintCallback instead
    callbacks=[EpochPrintCallback(), early_stopping],
)


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed
Epoch 10 completed
Epoch 11 completed
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping


<keras.callbacks.History at 0x1b7d3e54c10>

In [29]:

# Disabled save/load experiments, kept for reference:
#model.save('C:\\Users\\thesp\\Desktop\\Amazon ML\\Without Z outlier\\model-500k')
#model= tf.keras.models.load_model('model-10k')

# Report the number of rows in the training split.
print(len(X_train))


320000


In [32]:
# Use the model to predict the length of the test data.
#
# TEXT is cast to str by rebinding X_test with .assign() instead of assigning
# the column in place: X_test comes out of train_test_split, where in-place
# column assignment can raise pandas' SettingWithCopyWarning, and rebinding
# keeps this cell safe to re-run.
X_test = X_test.assign(TEXT=X_test['TEXT'].astype('str'))

# Apply the same preprocessing as for training: tokenizer encoding, then
# end-padding to max_length.
new_description_sequences = tokenizer.texts_to_sequences(X_test['TEXT'].values)
padded_new_description_sequences = pad_sequences(new_description_sequences, maxlen=max_length, padding='post')

# Shape the ids as (n_samples, 1) to match the model's second input.
new_product_type_ids = X_test['PRODUCT_TYPE_ID'].values.reshape(-1, 1)
predictions = model.predict([padded_new_description_sequences, new_product_type_ids]).flatten()

# Evaluate the model on the test data.
# score = 100 * (1 - MAPE), floored at 0; a MAPE of 1.0 or more therefore
# prints as 0.00%.
score = max(0, 100 * (1 - mean_absolute_percentage_error(y_test.values, predictions)))
print(f"Accuracy score: {score:.2f}%")

Accuracy score: 0.00%
