- Preprocesses the data and modelled the given using the multi-label classification (A text can be two/more than two classes at same time)
- Linear models worked well on this dataset, Logistic with OnevsRest Classifier gave the best results with multi-label accuarcy around 60%.
- Tried Bert for multi-label classification, but it seems to struck at local minima and accuarcy is struck at multi-label accuracy 26%.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
path_dict = {}
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path_dict[filename] = os.path.join(dirname, filename)

# Any results you write to the current directory are saved as output.
path_dict

{'Sample_Submission.csv': '/kaggle/input/Sample_Submission.csv',
 'train.csv': '/kaggle/input/train.csv',
 'test.csv': '/kaggle/input/test.csv'}

In [2]:
train_df = pd.read_csv(path_dict['train.csv'])
test_df = pd.read_csv(path_dict['test.csv'])
pred_df = test_df.copy()
train_df.shape, test_df.shape

((5959, 3), (2553, 2))

In [4]:
train_df['Review Title'] = train_df['Review Title'] + ' '
train_df['text'] = 3*train_df['Review Title'] + train_df['Review Text']

test_df['Review Title'] = test_df['Review Title'] + ' '
test_df['text'] = 3*test_df['Review Title'] + test_df['Review Text']

In [5]:
train_multi_label = train_df.groupby('text')['topic'].apply(lambda x: '|'.join(x)).reset_index()
print('Total combinations of multi-labels in the train using the 21 classes:', 
      len(train_multi_label['topic'].value_counts().to_dict()))

Total combinations of multi-labels in the train using the 21 classes: 431


In [7]:
test_multi_label = test_df.groupby('text').count().reset_index()
test_multi_label['review_count'] = test_multi_label['Review Text']
test_multi_label = test_multi_label[['text', 'review_count']]

In [8]:
train_multi_label.shape, test_multi_label.shape

((4217, 2), (1776, 2))

In [143]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

def preprocess_text(text):
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))

    sent = decontracted(text)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9:]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    
    words = sent.split(' ')
    words = [word.lower() for word in words]
    sent = ' '.join(e for e in words if e not in stopwords)
    return sent.lower().strip()

train_multi_label['text'] = train_multi_label['text'].progress_apply(lambda x: preprocess_text(x))
test_multi_label['text'] = test_multi_label['text'].progress_apply(lambda x: preprocess_text(x))
test_df['text'] = test_df['text'].progress_apply(lambda x: preprocess_text(x))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4217), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1776), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2553), HTML(value='')))




In [35]:
X_train_val = train_multi_label[['text']].copy()
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
lb = MultiLabelBinarizer()
y_train_val = lb.fit_transform(train_multi_label['topic'].apply(lambda x:x.split('|')))
num_classes = len(lb.classes_)
num_classes, X_train_val.shape, y_train_val.shape

(21, (4217, 1), (4217, 21))

In [57]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import f1_score
from time import time
import scipy

def build_model_pipeline(model):
    
    ## categorical features and ## numerical features
    # nontext_pipe = get_nontext_pipeline()
    
    # text_pipes
    text_vect1 = TfidfVectorizer(ngram_range = (1,2), min_df = 1, max_df=0.95, 
                                 norm ="l1")
    text1 = make_column_transformer((text_vect1, 'text'))
    
    text_vect2 = TfidfVectorizer(ngram_range = (1,5), min_df = 1, max_df=0.95, 
                                 analyzer="char", norm ="l1")
    text2 = make_column_transformer((text_vect2, 'text'))
    
    
    # feature union
    feature_pipe = FeatureUnion(transformer_list = [('text_feat1', text1), ('text_feat2', text2)])
    
    classification_pipeline = Pipeline(steps = [('feat_union', feature_pipe), ('model', model)])
    print('done! model pipeline ready...')
    
    return classification_pipeline



def random_search(model_pipeline, param, X, y, cv = 3, search_iter = 10,):
    
    grid_search = RandomizedSearchCV(model_pipeline, param, scoring = 'accuracy', n_jobs=-1,
                                     n_iter = search_iter, 
                                     cv = cv, verbose=1)
    t0 = time()
    grid_search = grid_search.fit(X, y)
    print("done in %0.3fs" % (time() - t0))
    print()
    print("Best acc: %0.4f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(param.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    return grid_search

In [58]:
sgd_log = OneVsRestClassifier(SGDClassifier(loss='log', n_jobs = -1, penalty = 'l1'))
parameters = {'model__estimator__alpha': [0.00001, 0.00005, 0.00007, 0.00009, 0.0001, 0.0007, 0.0005, 0.001, 0.005,
                                   0.01, 0.05, 0.1, 0.5,1,10,100,1000,10000]}
sgd_pipeline = build_model_pipeline(model = sgd_log)

best_sgd_log_model = random_search(sgd_pipeline, parameters, X_train_val, y_train_val,
                                   cv = 3, search_iter = 16)

done! model pipeline ready...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.3min finished


done in 211.628s

Best acc: 0.4842
Best parameters set:
	model__estimator__alpha: 1e-05


In [60]:
from datetime import datetime
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression(penalty='l1', 
                                               class_weight = 'balanced'))
parameters = {'model__estimator__C': [0.00001, 0.00005, 0.00007, 0.00009, 0.0001, 0.0007, 0.0005, 0.001, 0.005,
                                   0.01, 0.05, 0.1, 0.5,1,10,100,1000,10000]}
logistic_pipeline = build_model_pipeline(model = model)
best_logistic_model = random_search(logistic_pipeline, parameters, X_train_val, y_train_val,
                                   cv = 3, search_iter = 16)

done! model pipeline ready...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  4.4min finished


done in 330.433s

Best acc: 0.5784
Best parameters set:
	model__estimator__C: 1000


In [64]:
X_test = test_multi_label[['text']].copy()
y_test_preds = best_logistic_model.predict(X_test)
y_test_probas = best_logistic_model.predict_proba(X_test)

In [86]:
y_test_labels = lb.inverse_transform(y_test_probas > 0.1)
test_predictions = ['|'.join(pr) for pr in y_test_labels]

In [97]:
top_6_predictions = []
for proba in y_test_probas:
    top_6_predictions.append(lb.classes_[np.argsort(proba)[-6:]])  # from back

In [102]:
test_row_preds = []
for count, prediction in zip(test_multi_label['review_count'].values, top_6_predictions):
    row_prediction = '|'.join(prediction[-count:])
    test_row_preds.append(row_prediction)

In [138]:
def get_preds(test_multi_label):
    df1 = test_multi_label.merge(test_multi_label.topics.str.split('|',expand=True),
                    left_index=True, right_index=True, how='outer')
    df1.drop('topics',axis=1,inplace=True)
    df2 = df1.melt(['text'], value_vars = [0, 1, 2, 3, 4, 5])
    res_df = df2[df2['value'].isin(lb.classes_)]
    return res_df

In [146]:
test_multi_label['topics'] = test_row_preds
res_df = get_preds(test_multi_label)
res_df.head()

Unnamed: 0,text,variable,value
0,x brand prime delivery made huge mess x brand ...,0,Shipment and delivery
1,x brand problem x brand problem x brand proble...,0,Wrong Product received
2,10 price increase 35 45 within last purchase 1...,0,Not Effective
3,really liked awhile order set pill organizer n...,0,Bad Taste/Flavor
4,0 0 0 taste gross,0,Bad Taste/Flavor


In [148]:
res_df.shape, res_df.columns

((2553, 3), Index(['text', 'variable', 'value'], dtype='object'))

In [162]:
res_pred_df = test_df.merge(res_df[['text', 'value']], how='left', on = 'text')

In [167]:
res_pred_df = res_pred_df.drop_duplicates().reset_index(drop = 1)
res_pred_df.shape, test_df.shape

((2553, 4), (2553, 3))

In [168]:
res_pred_df.head()

Unnamed: 0,Review Text,Review Title,text,value
0,I use chia seed in my protein shakes. These ta...,Bad tast,bad tast bad tast bad tast use chia seed prote...,Bad Taste/Flavor
1,I use chia seed in my protein shakes. These ta...,Bad tast,bad tast bad tast bad tast use chia seed prote...,Quality/Contaminated
2,Don’t waste your money.,No change. No results.,change results change results change results w...,Not Effective
3,I use the book 'Fortify Your Life' by Tieraona...,"Good Vegan Choice, Poor Non Vegan Choice",good vegan choice poor non vegan choice good v...,Allergic
4,I use the book 'Fortify Your Life' by Tieraona...,"Good Vegan Choice, Poor Non Vegan Choice",good vegan choice poor non vegan choice good v...,Ingredients


In [169]:
pred_df['topic'] = res_pred_df['value']

In [171]:
print(pred_df.shape, pred_df.columns)
pred_df['topic'].value_counts()

(2553, 3) Index(['Review Text', 'Review Title', 'topic'], dtype='object')


Bad Taste/Flavor           538
Quality/Contaminated       300
Not Effective              268
Allergic                   242
Packaging                  212
Texture                    182
Shipment and delivery      159
Customer Service            99
Color and texture           97
Ingredients                 89
Too big to swallow          85
Expiry                      70
Smells Bad                  59
Too Sweet                   53
Wrong Product received      33
Pricing                     33
False Advertisement         16
Inferior to competitors      9
Didn't Like                  5
Hard to Chew                 3
Customer Issues              1
Name: topic, dtype: int64

In [172]:
filename = '4-amazon-reviews-ml.csv'
print(len(test_df), len(pred_df))
pred_df.to_csv(filename, index = None)
from IPython.display import FileLink
FileLink(filename)

2553 2553


In [100]:
# for proba in y_test_probas[:5]:
#     print(np.argsort(proba))

In [73]:
# y_test_labels = lb.inverse_transform(y_test_preds)
# test_predictions = ['|'.join(pr) for pr in y_test_labels]