In [None]:
#import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error

# Load the training and test sets (paths are relative to the working directory).
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Remove duplicates: keep a single row per PRODUCT_ID (pandas keeps the first
# occurrence by default).
train_data.drop_duplicates(subset=['PRODUCT_ID'], inplace=True)

# Handle missing values in the free-text columns only. Filling the whole frame
# with '' would also replace missing numeric values (PRODUCT_TYPE_ID,
# PRODUCT_LENGTH) by empty strings, turning those columns into object dtype
# and breaking the numeric comparison and feature matrix further down.
text_columns = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS']
train_data[text_columns] = train_data[text_columns].fillna('')
test_data[text_columns] = test_data[text_columns].fillna('')

# Keep only rows with a strictly positive target. Rows whose PRODUCT_LENGTH is
# missing are dropped as well, because NaN > 0 evaluates to False.
# .copy() detaches the result from the original frame so that later column
# assignments do not raise SettingWithCopyWarning.
train_data = train_data[train_data['PRODUCT_LENGTH'] > 0].copy()

# Text preprocessing setup: fetch the NLTK stop-word corpus and keep the
# English entries in a set for fast membership tests.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text, stopword_set=None):
    """Normalize a piece of free text for TF-IDF vectorization.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, and stop words are dropped. The remaining tokens
    are joined with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.
        stopword_set: Optional collection of lower-case words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove non-alphabetic characters
    letters_only = re.sub(r'[^A-Za-z]', ' ', str(text))
    # Convert to lowercase and split on whitespace
    tokens = letters_only.lower().split()
    # Remove stopwords
    return ' '.join(token for token in tokens if token not in stopword_set)

# Clean the three free-text columns of both the training and the test frame
# with preprocess_text, overwriting each column in place.
for frame in (train_data, test_data):
    for column in ('TITLE', 'DESCRIPTION', 'BULLET_POINTS'):
        frame[column] = frame[column].apply(preprocess_text)


# Feature engineering
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

# One TF-IDF vectorizer per text column. Re-fitting a single vectorizer on
# each column in turn would leave it holding only the vocabulary of the last
# column, so the test-time transform of the other columns would produce
# features that do not match what the model was trained on.
feature_text_columns = ('TITLE', 'DESCRIPTION', 'BULLET_POINTS')
vectorizers = {
    column: TfidfVectorizer(max_features=500, dtype=np.float32)
    for column in feature_text_columns
}


def build_feature_matrix(frame, fit=False):
    """Build the sparse model input for ``frame``.

    The matrix is the horizontal concatenation of the TF-IDF features of each
    column in ``feature_text_columns`` followed by PRODUCT_TYPE_ID as a single
    numeric column.

    Args:
        frame: DataFrame holding the text columns and PRODUCT_TYPE_ID.
        fit: When True the vectorizers are fitted on ``frame`` (training
            data); when False the already fitted vectorizers are applied.

    Returns:
        A SciPy CSR matrix with one row per row of ``frame``.
    """
    blocks = []
    for column in feature_text_columns:
        column_vectorizer = vectorizers[column]
        if fit:
            blocks.append(column_vectorizer.fit_transform(frame[column]))
        else:
            blocks.append(column_vectorizer.transform(frame[column]))
    blocks.append(np.asarray(frame['PRODUCT_TYPE_ID']).reshape(-1, 1))
    # CSR supports the row indexing needed for the train/validation split.
    return sparse.hstack(blocks, format='csr')


features = build_feature_matrix(train_data, fit=True)
targets = train_data['PRODUCT_LENGTH'].values

# Scale the features (MaxAbsScaler keeps the matrix sparse).
scaler = MaxAbsScaler()
features = scaler.fit_transform(features)

# Hold out part of the training data so that Keras can report val_loss, which
# is what the EarlyStopping callback below monitors. An explicit split is used
# instead of fit(validation_split=...) because that option may not accept
# SciPy sparse input.
# NOTE(review): vectorizers and scaler are fitted on all training rows,
# including the held-out ones; split earlier if that leakage matters.
train_features, val_features, train_targets, val_targets = train_test_split(
    features, targets, test_size=0.1, random_state=42
)

# Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=features.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(loss='mean_absolute_error', optimizer=Adam(learning_rate=0.001))

# Define early stopping on the validation loss; the best weights seen are
# restored when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

# Train the model
history = model.fit(
    train_features,
    train_targets,
    validation_data=(val_features, val_targets),
    epochs=40,
    batch_size=64,
    callbacks=[early_stopping],
)

# Evaluate the model on the rows it was trained on
train_predictions = model.predict(train_features)
train_mae = mean_absolute_error(train_targets, train_predictions)
print('Training MAE:', train_mae)

# Make predictions on the test set, reusing the fitted vectorizers and scaler.
# The matrix stays sparse, like the training input, to avoid densifying it.
features = scaler.transform(build_feature_matrix(test_data))
predictions = model.predict(features)

# Prepare submission file: the PRODUCT_ID column of the test set next to the
# model predictions, written to submission.csv without the index.
submission = test_data[['PRODUCT_ID']].assign(PRODUCT_LENGTH=predictions)
submission.to_csv('submission.csv', index=False)

from tqdm import tqdm
from sklearn.metrics import mean_absolute_percentage_error
# import pandas as pd

# Load the reference values and the predictions written earlier.
# NOTE(review): the reference file is sample_submission.csv - confirm that it
# holds real target values and that its rows are in the same order as
# submission.csv, since the two columns are compared positionally.
actual, predicted = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/amazon-product-length-prediction-dataset/dataset/sample_submission.csv',
        '/kaggle/working/submission.csv',
    )
)

# Mean absolute percentage error between reference and predicted lengths
mape = mean_absolute_percentage_error(
    y_true=actual['PRODUCT_LENGTH'],
    y_pred=predicted['PRODUCT_LENGTH'],
)

# The score is 100 * (1 - MAPE), floored at zero
score = max(0, 100 * (1 - mape))
print('Score:', score)