In [1]:
## Packages
# Data handling, train/test splitting, plotting and feature scaling.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# For text processing
import string
from bs4 import BeautifulSoup
# NOTE(review): `_stop_words` is an underscore-prefixed (private) sklearn module;
# presumably the same set is exposed publicly as
# `sklearn.feature_extraction.text.ENGLISH_STOP_WORDS` - confirm and prefer that.
from sklearn.feature_extraction import _stop_words
Stop_Words= _stop_words.ENGLISH_STOP_WORDS
import nltk
# Fetch the nltk resources requested below (the captured output shows they
# are skipped when already up-to-date).
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by text_processing().
lemmatizer = WordNetLemmatizer()

# For sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer instance used by get_sentiment().
sid = SentimentIntensityAnalyzer()

import warnings
# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas chained-assignment and deprecation warnings raised further down.
warnings.filterwarnings('ignore') # ignore warnings

# Seed NumPy's global random number generator.
np.random.seed(42) # set seed 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carolinesofieljorring/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carolinesofieljorring/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/carolinesofieljorring/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Load the per-category product metadata from disk.
category = 'Candy & Chocolate'  # category whose files are read and written
metadata_df = pd.read_csv('data/' + category + '/df_' + category + '.csv')

# Sanity check: show which categories are present in the loaded file.
print('Check category: ', metadata_df['category'].unique())

# Discard the columns that are not used for modeling.
metadata_df = metadata_df.drop(
    columns=[
        'item',
        'std_rating',
        'category',
        'brand',
        'feature',
        'main_cat',
        'similar_item',
        'details',
        'timestamp',
    ]
)

Check category:  ['Candy & Chocolate']


In [6]:
# Display the `num_ratings` column of the loaded metadata as the cell output.
metadata_df.num_ratings

0         1.0
1        48.0
2         2.0
3         1.0
4         1.0
         ... 
40663     1.0
40664     2.0
40665     1.0
40666     1.0
40667     1.0
Name: num_ratings, Length: 40668, dtype: float64

In [3]:
## Functions for preprocessing of features
def get_number_also_buy(row):
    """Return ``len(row)`` for a single cell value.

    NOTE(review): when the column comes straight from ``pd.read_csv`` the
    cell is presumably a string, in which case this counts characters
    rather than list items - confirm against the stored data format.
    """
    return len(row)

def get_brand(row, brands):
    """Keep ``row`` when it is one of ``brands``; otherwise return ``'Other'``."""
    return row if row in brands else 'Other'

def get_rank(row):
    """Pick the rank entry out of a cell value.

    A non-list value is returned unchanged. For a list, the first element
    is returned, or the empty string when the list has no elements.
    """
    if not isinstance(row, list):
        return row
    if len(row) == 0:
        return ''
    return row[0]

def get_description(row):
    """Map an empty list to NaN and pass every other value through unchanged."""
    is_empty_list = isinstance(row, list) and len(row) == 0
    if is_empty_list:
        return np.nan
    return row

def get_length(row):
    """Return the length of a cell value, or NaN when no length is available.

    Parameters
    ----------
    row : list, str, float or None
        A single cell value.

    Returns
    -------
    int or float
        ``len(row)`` for a non-empty list and for any other sized value
        (e.g. a string, where ``''`` gives 0); ``np.nan`` for an empty list
        and for a missing value (``None`` or a float NaN).
    """
    # Missing cells are floats (NaN) or None and have no len(); calling len()
    # on them raises TypeError. Report them as NaN, the same marker already
    # used for an empty list, so the caller can apply this before dropping
    # rows with missing values.
    if row is None or (isinstance(row, float) and np.isnan(row)):
        return np.nan
    if isinstance(row, list) and len(row) == 0:
        return np.nan
    return len(row)

def text_processing(text):
    """Normalise a text string for downstream text features.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. lowercase the text;
      3. split on whitespace and drop tokens found in ``Stop_Words``;
      4. lemmatize each remaining token with the shared ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # Delete punctuation with one translate() call instead of testing each
    # character against string.punctuation in a Python-level loop.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lowercase the whole string at once rather than character by character.
    text = text.lower()
    # Filter stop words and lemmatize in a single pass over the tokens; this
    # avoids joining the filtered tokens only to split them again.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in Stop_Words
    ]
    return " ".join(tokens)

# sentiment analysis of the description using a pretrained sentiment model
def get_sentiment(row):
    """Return the ``'compound'`` entry of ``sid.polarity_scores(row)``."""
    scores = sid.polarity_scores(row)
    return scores['compound']

In [4]:
## Preprocess data
def _engineer_features(df):
    """Apply the split-independent feature engineering to one data split.

    Nothing here is fitted on data, so it is safe to run separately on the
    train and the test split. The input frame is not modified; a new frame
    is returned.
    """
    # Work on a private copy: train_test_split hands back slices of the
    # source frame, and assigning columns to a slice is chained assignment.
    df = df.copy()

    # get number of also_buy / also_view
    for column in ('also_buy', 'also_view'):
        df[column] = df[column].fillna('').apply(get_number_also_buy)

    # sales rank information: strip thousands separators and keep the first
    # run of digits; cells without digits become 0.
    rank_text = (
        df['rank']
        .apply(get_rank)
        .str.replace(',', '', regex=False)
        # raw string: '\d' in a normal string literal is an invalid escape
        .str.extract(r'(\d+|$)', expand=False)
    )
    df['rank'] = pd.to_numeric(rank_text, errors='coerce').fillna(0).astype('int64')
    # remove samples where rank = 0 (not assigned)
    df = df[df['rank'] > 0].copy()

    # get title length and description length (taken before the description
    # is cleaned below)
    df['title_length'] = df['title'].apply(get_length)
    df['description_length'] = df['description'].apply(get_length)

    # clean description: drop rows without one, then strip newlines and HTML
    # markup before the text normalisation
    df['description'] = df['description'].apply(get_description)
    df = df.dropna(axis=0, subset=['description']).copy()
    df['description'] = (
        df['description']
        .apply(str)
        .str.replace('\n', '', regex=False)
        .apply(lambda text: BeautifulSoup(text, 'html.parser').get_text())
        .apply(text_processing)
    )

    # get sentiment score of description (compound score)
    df['description_sentiment'] = df['description'].apply(get_sentiment)
    return df


def preprocess_data(metadata_df, train_size=0.75, random_state=None):
    """Split the metadata into train/test and build the model features.

    The split happens first so that everything fitted on data (the mean
    price used for imputation and the min-max scaler) is learned from the
    training split only and then applied to the test split.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Product metadata; the columns 'also_buy', 'also_view', 'rank',
        'title', 'description', 'price', 'avg_rating' and 'num_ratings'
        are read.
    train_size : float, default 0.75
        Forwarded to ``train_test_split``.
    random_state : optional, default None
        Forwarded to ``train_test_split``; ``None`` keeps the original
        behaviour of not passing a seed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The preprocessed training and test frames.
    """
    # split data, so we DON'T use test for preprocessing
    df_train, df_test = train_test_split(
        metadata_df, train_size=train_size, random_state=random_state
    )

    df_train = _engineer_features(df_train)
    df_test = _engineer_features(df_test)

    # set price=nan to average price (mean of the training split only)
    n_missing = int(df_train['price'].isna().sum())
    print('number of products with missing price: ', n_missing)
    mean_value = df_train['price'].mean()  # mean() skips NaN
    df_train['price'] = df_train['price'].fillna(mean_value)
    df_test['price'] = df_test['price'].fillna(mean_value)
    print('number of rows with price set to average price: ', (df_train.price == mean_value).sum())

    # scale: fit on train, apply the same transformation to test
    features_to_scale = ['avg_rating','num_ratings','rank','also_buy','also_view','price','title_length','description_length']
    scaler = MinMaxScaler()
    df_train[features_to_scale] = scaler.fit_transform(df_train[features_to_scale])
    df_test[features_to_scale] = scaler.transform(df_test[features_to_scale])

    return df_train, df_test

# Run the preprocessing on the loaded metadata; yields the train and test frames.
df_train, df_test = preprocess_data(metadata_df)


number of products with missing price:  15575
number of rows with price set to average price:  15575


In [5]:
# Write both preprocessed splits into the category's data folder, without the index column.
df_train.to_csv('data/' + category + '/df_train.csv',index=False)
df_test.to_csv('data/' + category + '/df_test.csv',index=False)