In [None]:
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K

In [None]:
%%time

# Read the train and test splits (in that order) from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(train_df.shape, test_df.shape)

In [None]:
# Handle outliers in the PRODUCT_LENGTH column using the IQR method.
# The function below replaces out-of-range values with the column median
# rather than dropping rows.
# NOTE(review): `outliers` is initialised here but is never read or appended
# to in the visible cells - confirm whether it is still needed.
outliers = []
def detect_outliers_iqr(data):
    """Replace IQR outliers in ``data['PRODUCT_LENGTH']`` with the column median.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. The median and both bounds are computed once, from the
    values as they are on entry.

    The column is addressed by name. The previous implementation read and wrote
    the frame's *last* column positionally, which only worked while
    PRODUCT_LENGTH happened to be last.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``PRODUCT_LENGTH`` column. Modified in place.

    Returns
    -------
    int
        Number of values that were replaced.
    """
    lengths = data['PRODUCT_LENGTH']
    median = lengths.median()

    q1 = np.percentile(lengths, 25)
    q3 = np.percentile(lengths, 75)
    iqr = q3 - q1
    lwr_bound = q1 - (1.5 * iqr)
    upr_bound = q3 + (1.5 * iqr)

    # Vectorised comparison instead of a Python loop over every row.
    is_outlier = (lengths < lwr_bound) | (lengths > upr_bound)
    data.loc[is_outlier, 'PRODUCT_LENGTH'] = median
    return int(is_outlier.sum())
# Apply the IQR clean-up to the training frame (train_df is modified in place)
# and report how many PRODUCT_LENGTH values were replaced.
count = detect_outliers_iqr(train_df)
print("Outliers from IQR method: ", count)


In [None]:
# Handle missing data.
def fill_missing_values(df):
    """Fill missing TITLE, BULLET_POINTS and DESCRIPTION values with "Missing".

    Each filled column is assigned back to ``df``. The previous form,
    ``df.TITLE.fillna(..., inplace=True)``, is a chained in-place call on a
    temporary Series; under pandas Copy-on-Write it leaves ``df`` untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three text columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for column in ('TITLE', 'BULLET_POINTS', 'DESCRIPTION'):
        df[column] = df[column].fillna("Missing")
    return df

# Fill the text columns of both splits, train first and then test.
train_df, test_df = map(fill_missing_values, (train_df, test_df))

In [None]:
from string import punctuation
punctuation

In [None]:
# Pair every character of string.punctuation with an empty replacement string.
punctuation_symbols = [(symbol, '') for symbol in punctuation]

punctuation_symbols

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource; nltk.corpus.stopwords is used in a
# later cell to build the English stop-word list.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stop words; remove_stop_words() tests each token for membership in
# this collection.
stop = stopwords.words('english')
# Bare expression: shows the stop words as the cell output.
stop

In [None]:
import string

def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with every ``string.punctuation`` character deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(deletion_table)

def remove_stop_words(x):
    """Lowercase ``x`` and drop every token found in the global ``stop`` list.

    Tokens are split on any run of whitespace. The previous version split on
    the single space character only, so a stop word next to a tab or newline
    (e.g. ``"the\\ncat"``) was glued to its neighbour and never removed. As a
    side effect, runs of whitespace in the result collapse to one space.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; `stop` itself is left as-is
    # for any other code that uses it.
    stop_words = set(stop)
    return ' '.join(token for token in x.lower().split() if token not in stop_words)

def to_lower(x):
    """Return ``x`` converted to lowercase via its own ``lower()`` method."""
    lowered = x.lower()
    return lowered

# Emoji character class, compiled once instead of on every call.
# Every range below lies inside what the previous pattern removed, so nothing
# new is stripped. The old catch-all range U+24C2-U+1F251 is NOT kept: it
# spans most of the Basic Multilingual Plane and therefore deleted CJK, kana,
# Hangul and many other ordinary characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U000024C2"             # circled latin capital M
    "\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, squares)
    "\U0000FE0F"             # emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Ordinary text in any script, including Chinese, Japanese and Korean, is
    left intact.

    Parameters
    ----------
    string : str
        Text to clean. The parameter name shadows the ``string`` module inside
        this function only.

    Returns
    -------
    str
        ``string`` with every run of matched characters deleted.
    """
    return _EMOJI_PATTERN.sub('', string)

In [None]:
# Model the target on a log scale: target = log(1 + PRODUCT_LENGTH).
train_df['target'] = train_df['PRODUCT_LENGTH'].pipe(np.log1p)

# Column vector of shape (n_rows, 1) holding the log-scaled target.
Y_train = train_df['target'].to_numpy().reshape(-1, 1)

# Number of training rows.
n_trains = len(train_df)
print("Training on", n_trains, "examples")

In [None]:
# Stack the train rows followed by the test rows into one frame.
# ignore_index=True gives the result a fresh 0..N-1 index; without it the two
# inputs' own row labels are both kept, so every label shared by train and
# test appears twice and label-based lookups become ambiguous.
# Row order is unchanged, so positional slicing is unaffected.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
%%time
from functools import lru_cache
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# PorterStemmer.stem() stems ONE word. The previous code passed it a whole
# description, so the document was treated as a single giant word: it was
# lowercased and only its tail was altered. Stemming is now done per token.
# The cache avoids re-stemming words that recur across rows.
stem_word = lru_cache(maxsize=1 << 20)(porter.stem)


def stem_tokens(text):
    """Stem each whitespace-separated token of ``text`` and re-join with spaces."""
    return ' '.join(stem_word(token) for token in text.split())


def clean_text(text, stem=False):
    """Run the shared cleaning pipeline on one text value.

    Steps, in order: remove emoji, remove punctuation, remove stop words,
    lowercase, then (only when ``stem`` is true) stem each remaining token.

    Parameters
    ----------
    text : str
        Raw text of a single cell.
    stem : bool, default False
        Whether to apply the Porter stemmer to every token.

    Returns
    -------
    str
        The cleaned text.
    """
    text = remove_emoji(text)
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = to_lower(text)
    return stem_tokens(text) if stem else text


def clean_and_stem_text(text):
    """Clean ``text`` and stem its tokens; see ``clean_text``."""
    return clean_text(text, stem=True)


# DESCRIPTION and BULLET_POINTS are stemmed; TITLE is only cleaned, matching
# the columns the stemmer was applied to before.
full_df['DESCRIPTION'] = full_df['DESCRIPTION'].apply(clean_and_stem_text)
full_df['TITLE'] = full_df['TITLE'].apply(clean_text)
full_df['BULLET_POINTS'] = full_df['BULLET_POINTS'].apply(clean_and_stem_text)

In [None]:
# Remove the PRODUCT_ID column from the combined frame (mutates full_df in place).
full_df.drop(columns=['PRODUCT_ID'], inplace=True)

In [None]:
%%time

# Cast PRODUCT_TYPE_ID to text and store it back in the same column.
type_id_text = full_df['PRODUCT_TYPE_ID'].astype(str)
full_df['PRODUCT_TYPE_ID'] = type_id_text

In [None]:
%%time
# Turn the columns of the combined frame into vectorised features.
print("Vectorizing data...")
default_preprocessor = CountVectorizer().build_preprocessor()


def build_preprocessor(field):
    """Return a callable that extracts ``field`` from a row and preprocesses it.

    The position of ``field`` among ``full_df``'s columns is looked up once,
    when this function is called. The returned callable indexes its argument
    at that position and passes the value to ``default_preprocessor``.

    Raises ``ValueError`` if ``field`` is not one of ``full_df``'s columns.
    """
    column_names = full_df.columns.tolist()
    field_idx = column_names.index(field)

    def preprocess_row(row):
        return default_preprocessor(row[field_idx])

    return preprocess_row

# One vectoriser per column, combined with FeatureUnion. Each vectoriser
# receives whole rows of full_df.values and uses its own preprocessor to pick
# out the column it is responsible for.
vectorizer = FeatureUnion([
    ('TITLE', CountVectorizer(
        ngram_range=(1, 3),
        max_features=None,
        preprocessor=build_preprocessor('TITLE'))),
    ('PRODUCT_TYPE_ID', CountVectorizer(
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence, which newer Python versions warn about. The pattern value
        # itself is unchanged.
        token_pattern=r'\d+',
        preprocessor=build_preprocessor('PRODUCT_TYPE_ID'))),
    ('DESCRIPTION', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=None,
        preprocessor=build_preprocessor('DESCRIPTION'))),
    ('BULLET_POINTS', TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=None,
        preprocessor=build_preprocessor('BULLET_POINTS'))),
])

# Fit on every row of the combined frame and transform it in one pass.
X = vectorizer.fit_transform(full_df.values) #sparse matrix containing all the vectorized columns

In [None]:
# The first n_trains rows of X go to the training matrix, the remainder to the test matrix.
X_train, X_test = X[:n_trains], X[n_trains:]

In [None]:
# Sanity check: shapes of the full matrix, both splits and the target.
print(*(matrix.shape for matrix in (X, X_train, X_test, Y_train)))

In [None]:
%%time
#Train a Ridge Regression model on the obtained X_train and Y_train
print("Fitting Ridge model on training examples...")
# The `normalize` argument is no longer passed: it was removed from Ridge in
# scikit-learn 1.2 and raises TypeError there. Its old default was False, so
# leaving it out gives the same model on every version.
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=15,
    max_iter=60, tol=0.05,
)
ridge_model.fit(X_train, Y_train)

In [None]:
%%time

# Predict the log1p-scaled target for the test rows, then invert it with expm1.
ridge_preds = np.expm1(ridge_model.predict(X_test))

In [None]:
# ridge_preds holds one predicted PRODUCT_LENGTH per test row. It may be a
# single-column 2-D array (the model was fitted on a column-vector target), so
# flatten it to 1-D before storing. This replaces `.apply(pd.Series)`, which
# built a separate Series for every row just to achieve the same flattening.
test_df['PRODUCT_LENGTH'] = np.ravel(ridge_preds)
# Convert to integers; astype(int) truncates toward zero, as before.
test_df['PRODUCT_LENGTH'] = test_df['PRODUCT_LENGTH'].astype(int)
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
test_df.drop(['TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID'],
             axis=1, inplace=True, errors='ignore')
# NOTE(review): the row index is written as an extra unnamed first column;
# pass index=False if the expected file format does not include it - confirm.
test_df.to_csv('final_predicted_lengths.csv')
test_df