## Preprocessing and understanding the data

In [124]:
import pandas as pd
import re, string

In [125]:
data = pd.read_csv('../Data/data.csv')
data.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


In [126]:
data.shape

(5000, 2)

In [127]:
# drop rows with missing values
data.dropna(inplace=True)

data.shape

(5000, 2)

In [128]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [129]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [130]:
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [131]:
def preprocessing(text):
   tokens = [word for word in nltk.word_tokenize(text) if (len(word) > 3 and len(word.strip('Xx/')) > 2 and len(re.sub('\d+', '', word.strip('Xx/'))) > 3) ] 
   tokens = map(str.lower, tokens)
   stems = [stemmer.stem(item) for item in tokens if (item not in stop_words)]
   return stems

In [132]:
data.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


In [133]:
import numpy as np
from sklearn.model_selection import train_test_split

In [134]:
# lets grab all unique values from topic and then use them as our target
target = data['topic'].unique()
target

array(['Sales/Promotions', 'Shipping', 'Product Availability',
       'Product Specifications', 'Omnichannel', 'Product Comparison',
       'Returns & Refunds'], dtype=object)

In [135]:
# split the data into train
train, test = train_test_split(data, test_size=0.2, random_state=42)

train_X = train.drop('topic', axis=1)

In [136]:
train.shape, test.shape

((4000, 2), (1000, 2))

In [137]:
# Lets vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

In [138]:
# create the transform
vectorizer = TfidfVectorizer(tokenizer=preprocessing, ngram_range=(1, 2), stop_words=None, min_df=0.001, max_df=0.75, max_features=2500)

train_vectors = vectorizer.fit_transform(train_X['question'])

In [139]:
test_vector = vectorizer.transform(test['question'])

In [140]:
train_df = pd.DataFrame(train_vectors.toarray(), columns=vectorizer.get_feature_names())
train_df = pd.concat([train_df, train['topic']], axis=1)



## select (1) one multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM) whose code is provided in class handouts

In [141]:
# Lets use svm to train the model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [142]:
# create the model, this is a multi-class classification problem
model = SVC(kernel='linear', C=1, gamma='auto', probability=True)

# fit the model
model.fit(train_vectors, train['topic'])

SVC(C=1, gamma='auto', kernel='linear', probability=True)

In [143]:
# predict the test data
pred = model.predict(test_vector)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.952

In [144]:
# train score vs. test score
model.score(train_vectors, train['topic']), model.score(test_vector, test['topic'])

(0.98725, 0.952)

In [145]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# preprocess the question
question = preprocessing(question)

# vectorize the question
question_vector = vectorizer.transform([' '.join(question)])

# predict the topic
model.predict(question_vector)

array(['Sales/Promotions'], dtype=object)

## (2) one ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)

In [146]:
# Lets use an ensemble classifier to improve the accuracy
from sklearn.ensemble import RandomForestClassifier

In [147]:
# create the model
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)

# fit the model
model.fit(train_vectors, train['topic'])

# predict the test data
pred = model.predict(test_vector)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.847

In [148]:
# train score vs. test score
model.score(train_vectors, train['topic']), model.score(test_vector, test['topic'])

(0.863, 0.847)

In [149]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# preprocess the question
question = preprocessing(question)

# vectorize the question
question_vector = vectorizer.transform([' '.join(question)])

# predict the topic
model.predict(question_vector)

array(['Product Specifications'], dtype=object)

## (3) one other model of your choice whose code is NOT provided in class handouts (this will require some independent research on your part).

In [150]:
# Lets use MLP to train the model
from sklearn.neural_network import MLPClassifier

In [151]:
# create the model, this is a multi-class classification problem
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, alpha=0.0001, solver='sgd', verbose=10,  random_state=21, tol=0.000000001)

# fit the model
model.fit(train_vectors, train['topic'])

# predict the test data
pred = model.predict(test_vector)

# calculate the accuracy
accuracy_score(test['topic'], pred)

Iteration 1, loss = 1.93341939
Iteration 2, loss = 1.93116736
Iteration 3, loss = 1.92905598
Iteration 4, loss = 1.92700277
Iteration 5, loss = 1.92535041
Iteration 6, loss = 1.92386400
Iteration 7, loss = 1.92258336
Iteration 8, loss = 1.92146448
Iteration 9, loss = 1.92047054
Iteration 10, loss = 1.91960774
Iteration 11, loss = 1.91878748
Iteration 12, loss = 1.91812209
Iteration 13, loss = 1.91738677
Iteration 14, loss = 1.91674659
Iteration 15, loss = 1.91619460
Iteration 16, loss = 1.91559599
Iteration 17, loss = 1.91506802
Iteration 18, loss = 1.91451537
Iteration 19, loss = 1.91399562
Iteration 20, loss = 1.91348242
Iteration 21, loss = 1.91298342
Iteration 22, loss = 1.91247628
Iteration 23, loss = 1.91196029
Iteration 24, loss = 1.91145128
Iteration 25, loss = 1.91095379
Iteration 26, loss = 1.91044309
Iteration 27, loss = 1.90992209
Iteration 28, loss = 1.90940895
Iteration 29, loss = 1.90889486
Iteration 30, loss = 1.90839753
Iteration 31, loss = 1.90785283
Iteration 32, los



0.955

In [152]:
# train score vs. test score
model.score(train_vectors, train['topic']), model.score(test_vector, test['topic'])

(0.99675, 0.955)

In [153]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# preprocess the question
question = preprocessing(question)

# vectorize the question
question_vector = vectorizer.transform([' '.join(question)])

# predict the topic
model.predict(question_vector)

array(['Sales/Promotions'], dtype='<U22')

## For each classifier, use four kinds of input feature vectors: (1) TF-IDF vector of tokenized words, (2) TF-IDF vector of n-grams (of range 4-5), (3) word vectors (Glove, Word2Vec, or FastText), and (4) document vectors (Doc2Vec). Train each model using training data, report classification metrics using test data, and summarize the results of all model in a nicely formatted table. Comment on which model will be your preferred choice for text classification for this data and why.

### Part 2.1) NGram Range 4-5

In [168]:
# Lets recreate our input feature with TF-IDF with n-grams (4, 5)
vectorizer_2 = TfidfVectorizer(tokenizer=preprocessing, ngram_range=(4, 5), stop_words=None, min_df=0.001, max_df=0.75, max_features=2500)

train_vectors_2 = vectorizer_2.fit_transform(train_X['question'])

In [169]:
# Repeat the same steps as above
train_df_2 = pd.DataFrame(train_vectors_2.toarray(), columns=vectorizer.get_feature_names())

train_df_2 = pd.concat([train_df_2, train['topic']], axis=1)

test_vector_2 = vectorizer_2.transform(test['question'])



In [170]:
# Create SVC model
model = SVC(kernel='linear', C=1, gamma='auto', probability=True)

# fit the model
model.fit(train_vectors_2, train['topic'])

# predict the test data
pred = model.predict(test_vector_2)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.629

In [172]:
# train score vs. test score
model.score(train_vectors_2, train['topic']), model.score(test_vector_2, test['topic'])

(0.67175, 0.629)

In [173]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# preprocess the question
question = preprocessing(question)

# vectorize the question
question_vector = vectorizer_2.transform([' '.join(question)])

# predict the topic
model.predict(question_vector)

array(['Product Specifications'], dtype=object)

In [174]:
# Lets train a Random Forest model
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)

# fit the model
model.fit(train_vectors_2, train['topic'])

# predict the test data
pred = model.predict(test_vector_2)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.36

In [175]:
# train score vs. test score
model.score(train_vectors_2, train['topic']), model.score(test_vector_2, test['topic'])

(0.3635, 0.36)

In [176]:
# Lets do another model using MLP
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, alpha=0.0001, solver='sgd', verbose=10,  random_state=21, tol=0.000000001)

# fit the model
model.fit(train_vectors_2, train['topic'])

# predict the test data
pred = model.predict(test_vector_2)

# calculate the accuracy
accuracy_score(test['topic'], pred)

Iteration 1, loss = 1.93511248
Iteration 2, loss = 1.93279064
Iteration 3, loss = 1.93069424
Iteration 4, loss = 1.92868508
Iteration 5, loss = 1.92715366
Iteration 6, loss = 1.92580711
Iteration 7, loss = 1.92468004
Iteration 8, loss = 1.92372471
Iteration 9, loss = 1.92288273
Iteration 10, loss = 1.92218936
Iteration 11, loss = 1.92156053
Iteration 12, loss = 1.92110013
Iteration 13, loss = 1.92056968
Iteration 14, loss = 1.92015155
Iteration 15, loss = 1.91984303
Iteration 16, loss = 1.91950814
Iteration 17, loss = 1.91925256
Iteration 18, loss = 1.91897481
Iteration 19, loss = 1.91873183
Iteration 20, loss = 1.91851507
Iteration 21, loss = 1.91833041
Iteration 22, loss = 1.91814810
Iteration 23, loss = 1.91795320
Iteration 24, loss = 1.91777600
Iteration 25, loss = 1.91761776
Iteration 26, loss = 1.91745708
Iteration 27, loss = 1.91728752
Iteration 28, loss = 1.91713154
Iteration 29, loss = 1.91697267
Iteration 30, loss = 1.91683672
Iteration 31, loss = 1.91666078
Iteration 32, los



0.636

In [177]:
# train accuracy vs. test accuracy
model.score(train_vectors_2, train['topic']), model.score(test_vector_2, test['topic'])

(0.6715, 0.636)

### Part 2.2) Using Word2Vec Word Vectors

In [185]:
# Lets recreate our input feature using Word2Vec
from gensim.models import Word2Vec

# load pre-trained word2vec model
wv = KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin', binary=True)
wv.init_sims(replace=True)

  wv.init_sims(replace=True)


In [187]:
# create a function to get the average word2vec
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

# create a function to get the word2vec for each question
def get_word2vec_embeddings(vectors, data, generate_missing=False):
    embeddings = data['question'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                       generate_missing=generate_missing))
    return list(embeddings)



In [188]:
# get the word2vec embeddings for the training data
train_word2vec = get_word2vec_embeddings(wv, train_X)

# get the word2vec embeddings for the test data
test_word2vec = get_word2vec_embeddings(wv, test)

In [189]:
# Train SVC model
model = SVC(kernel='linear', C=1, gamma='auto', probability=True)

# fit the model
model.fit(train_word2vec, train['topic'])

# predict the test data
pred = model.predict(test_word2vec)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.473

In [190]:
# train score vs. test score
model.score(train_word2vec, train['topic']), model.score(test_word2vec, test['topic'])

(0.489, 0.473)

In [None]:
# Lets use the model to predict the topic of a question
questions = [
    "I'm really interested in your special deals. Do you have any offers?",
    "I'm looking for a new car. What are your best deals?",
    "What kind of payment methods do you accept?",
    "I was wondering if you could ship to my country?"
]

# preprocess the question
questions = [preprocessing(question) for question in questions]

# get the word2vec embeddings for the questions
questions_word2vec = get_word2vec_embeddings(wv, pd.DataFrame({'question': questions}))

# for each question, predict the topic and print the result
for question, topic in zip(questions, model.predict(questions_word2vec)):
    print('Question: {}'.format(' '.join(question)))
    print('Topic: {}'.format(topic) + '\n')

In [193]:
# Lets use the model to predict the topic of a question
questions = [
    "I'm really interested in your special deals. Do you have any offers?",
    "I'm looking for a new car. What are your best deals?",
    "What kind of payment methods do you accept?",
    "I was wondering if you could ship to my country?"
]

# preprocess the question
questions = [preprocessing(question) for question in questions]

# get the word2vec embeddings for the questions
questions_word2vec = get_word2vec_embeddings(wv, pd.DataFrame({'question': questions}))

# for each question, predict the topic and print the result
for question, topic in zip(questions, model.predict(questions_word2vec)):
    print('Question: {}'.format(' '.join(question)))
    print('Topic: {}'.format(topic) + '\n')

Question: realli interest special deal offer
Topic: Product Comparison

Question: look best deal
Topic: Shipping

Question: kind payment method accept
Topic: Shipping

Question: wonder could ship countri
Topic: Shipping



In [194]:
# MLP model
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, alpha=0.0001, solver='sgd', verbose=10,  random_state=21, tol=0.000000001)

# fit the model
model.fit(train_word2vec, train['topic'])

# predict the test data
pred = model.predict(test_word2vec)

# calculate the accuracy
accuracy_score(test['topic'], pred)

Iteration 1, loss = 1.95914257
Iteration 2, loss = 1.95411485
Iteration 3, loss = 1.94912414
Iteration 4, loss = 1.94512461
Iteration 5, loss = 1.94181083
Iteration 6, loss = 1.93891692
Iteration 7, loss = 1.93643471
Iteration 8, loss = 1.93441969
Iteration 9, loss = 1.93255443
Iteration 10, loss = 1.93098476
Iteration 11, loss = 1.92959066
Iteration 12, loss = 1.92839830
Iteration 13, loss = 1.92738874
Iteration 14, loss = 1.92647979
Iteration 15, loss = 1.92573862
Iteration 16, loss = 1.92503401
Iteration 17, loss = 1.92446304
Iteration 18, loss = 1.92398342
Iteration 19, loss = 1.92351445
Iteration 20, loss = 1.92317142
Iteration 21, loss = 1.92277888
Iteration 22, loss = 1.92254651
Iteration 23, loss = 1.92225749
Iteration 24, loss = 1.92205391
Iteration 25, loss = 1.92186854
Iteration 26, loss = 1.92167379
Iteration 27, loss = 1.92155402
Iteration 28, loss = 1.92143109
Iteration 29, loss = 1.92127000
Iteration 30, loss = 1.92118367
Iteration 31, loss = 1.92108594
Iteration 32, los



0.306

In [195]:
# train accuracy vs. test accuracy
model.score(train_word2vec, train['topic']), model.score(test_word2vec, test['topic'])

(0.33575, 0.306)

In [196]:
questions = [
    "I'm really interested in your special deals. Do you have any offers?",
    "I'm looking for a new car. What are your best deals?",
    "What kind of payment methods do you accept?",
    "I was wondering if you could ship to my country?"
]

# preprocess the question
questions = [preprocessing(question) for question in questions]

# get the word2vec embeddings for the questions
questions_word2vec = get_word2vec_embeddings(wv, pd.DataFrame({'question': questions}))

# for each question, predict the topic and print the result
for question, topic in zip(questions, model.predict(questions_word2vec)):
    print('Question: {}'.format(' '.join(question)))
    print('Topic: {}'.format(topic) + '\n')

Question: realli interest special deal offer
Topic: Product Comparison

Question: look best deal
Topic: Product Comparison

Question: kind payment method accept
Topic: Product Comparison

Question: wonder could ship countri
Topic: Product Comparison



### Part 2.3) Using Doc2Vec

In [None]:
# Lets recreate the input feature using Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [197]:
tags_index = {tag: i for i, tag in enumerate(train['topic'].unique())}

tags_index

{'Shipping': 0,
 'Sales/Promotions': 1,
 'Omnichannel': 2,
 'Returns & Refunds': 3,
 'Product Comparison': 4,
 'Product Specifications': 5,
 'Product Availability': 6}

In [199]:
# create a function to get the doc2vec embeddings
def get_doc2vec_embeddings(vectors, data):
    embeddings = [vectors.infer_vector(question) for question in data['question']]
    return embeddings

In [200]:
# Tag the questions
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['question'], tags=[tags_index[r['topic']]]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['question'], tags=[tags_index[r['topic']]]), axis=1)

In [202]:
# tqdm
from tqdm import tqdm

In [203]:
# train the doc2vec model
d2v_model = Doc2Vec(dm=1, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
d2v_model.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 4000/4000 [00:00<00:00, 3996478.32it/s]


In [205]:
# utils
from sklearn import utils

In [206]:
# train the model
for epoch in range(30):
    d2v_model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v_model.alpha -= 0.002
    d2v_model.min_alpha = d2v_model.alpha


100%|██████████| 4000/4000 [00:00<00:00, 4002198.47it/s]
100%|██████████| 4000/4000 [00:00<?, ?it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3997430.55it/s]
100%|██████████| 4000/4000 [00:00<?, ?it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4002198.47it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4003153.42it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4001243.98it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3997430.55it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4000289.94it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3996478.32it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4000289.94it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3998383.22it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3999336.35it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3998383.22it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3998383.22it/s]
100%|██████████| 4000/4000 [00:00<00:00, 4000289.94it/s]
100%|██████████| 4000/4000 [00:00<?, ?it/s]
100%|██████████| 4000/4000 [00:00<00:00, 3996478.32it/s]
100%|████████

In [214]:
# get the doc2vec embeddings for the training data
# train_doc2vec = get_doc2vec_embeddings(d2v_model, train_X)

# the above code gets the following error: TypeError: Parameter doc_words of infer_vector() must be a list of strings (not a single string).
# To fix this, we need to convert the question to a list of strings
train_X2 = train_X.copy()
train_X2['question'] = train_X2['question'].apply(lambda x: x.split())

# get the doc2vec embeddings for the training data
train_doc2vec = get_doc2vec_embeddings(d2v_model, train_X2)

In [216]:
test_X2 = test.copy()
test_X2['question'] = test_X2['question'].apply(lambda x: x.split())

# get the doc2vec embeddings for the test data
test_doc2vec = get_doc2vec_embeddings(d2v_model, test_X2)

In [219]:
# SVC model
model = SVC(kernel='linear', C=1, random_state=0)

# fit model lets using test_X2 and train_doc2vec
model.fit(train_doc2vec, train['topic'])

# predict the test data
pred = model.predict(test_doc2vec)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.161

In [220]:
# train accuracy vs. test accuracy
model.score(train_doc2vec, train['topic']), model.score(test_doc2vec, test['topic'])

(0.1695, 0.161)

In [221]:
# Lets use the model to predict the topic of a question
questions = [
    "I'm really interested in your special deals. Do you have any offers?",
    "I'm looking for a new car. What are your best deals?",
    "What kind of payment methods do you accept?",
    "I was wondering if you could ship to my country?"
]

# preprocess the question
questions = [preprocessing(question) for question in questions]

# get the doc2vec embeddings for the questions
questions_doc2vec = get_doc2vec_embeddings(d2v_model, pd.DataFrame({'question': questions}))

# for each question, predict the topic and print the result
for question, topic in zip(questions, model.predict(questions_doc2vec)):
    print('Question: {}'.format(' '.join(question)))
    print('Topic: {}'.format(topic) + '\n')

Question: realli interest special deal offer
Topic: Product Specifications

Question: look best deal
Topic: Product Specifications

Question: kind payment method accept
Topic: Product Specifications

Question: wonder could ship countri
Topic: Product Specifications



In [222]:
# Random Forest model
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

# fit the model
model.fit(train_doc2vec, train['topic'])

# predict the test data
pred = model.predict(test_doc2vec)

# calculate the accuracy
accuracy_score(test['topic'], pred)

0.27

In [223]:
# train accuracy vs. test accuracy
model.score(train_doc2vec, train['topic']), model.score(test_doc2vec, test['topic'])

(0.29125, 0.27)

In [225]:
# Lets use the model to predict the topic of a question
questions = [
    "I'm really interested in your special deals. Do you have any offers?",
    "I'm looking for a new car. What are your best deals?",
    "What kind of payment methods do you accept?",
    "I was wondering if you could ship to my country?"
]

# preprocess the question
questions = [preprocessing(question) for question in questions]

# get the doc2vec embeddings for the questions
questions_doc2vec = get_doc2vec_embeddings(d2v_model, pd.DataFrame({'question': questions}))

# for each question, predict the topic and print the result
for question, topic in zip(questions, model.predict(questions_doc2vec)):
    print('Question: {}'.format(' '.join(question)))
    print('Topic: {}'.format(topic) + '\n')

Question: realli interest special deal offer
Topic: Product Specifications

Question: look best deal
Topic: Product Specifications

Question: kind payment method accept
Topic: Product Specifications

Question: wonder could ship countri
Topic: Product Specifications

