## Preprocessing and understanding the data

In [43]:
import pandas as pd
import re, string

In [44]:
# Load the question/topic dataset and preview its first rows
data = pd.read_csv('../Data/data.csv')
data.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


In [45]:
# (rows, columns) of the loaded frame
data.shape

(5000, 2)

In [46]:
# Discard every row that contains at least one missing value, then re-check the size
data = data.dropna()

data.shape

(5000, 2)

In [47]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [48]:
# preprocessing() calls nltk.word_tokenize, which needs the Punkt tokenizer models in
# addition to the stop-word list. Without them a fresh environment raises LookupError.
# 'punkt_tab' is the resource name looked up by recent NLTK releases, 'punkt' by older ones.
all([nltk.download(resource) for resource in ('stopwords', 'punkt', 'punkt_tab')])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
# English Snowball stemmer and stop-word collection used by preprocessing()
stemmer = SnowballStemmer("english")
# stored as a set so the membership test in preprocessing() is a hash lookup
stop_words = set(stopwords.words('english'))

In [50]:
def preprocessing(text):
    """Tokenize ``text`` and return a list of lowercase Snowball stems.

    A token is kept only when, after stripping leading/trailing ``X``, ``x``
    and ``/`` characters and removing every run of digits, more than three
    characters remain. Kept tokens are lowercased, English stop words are
    dropped, and the rest are stemmed.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stems of the surviving tokens, in their original order.
    """
    stems = []
    for word in nltk.word_tokenize(text):
        core = word.strip('Xx/')
        # Requiring more than 3 non-digit characters in the stripped token already
        # implies len(word) > 3 and len(core) > 2, so one check is sufficient.
        # The pattern is a raw string: '\d' in a normal string is an invalid escape.
        if len(re.sub(r'\d+', '', core)) <= 3:
            continue
        lowered = word.lower()
        if lowered not in stop_words:
            stems.append(stemmer.stem(lowered))
    return stems

In [51]:
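# Left disabled: preprocessing() is passed to TfidfVectorizer as its tokenizer further
# down, so the question column is kept as raw text here.
# data['question'] = data['question'].apply(preprocessing)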

In [52]:
# the question column still holds raw text at this point
data.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


In [53]:
import numpy as np
from sklearn.model_selection import train_test_split

In [54]:
# collect the distinct values of the topic column, i.e. the classes to predict
target = data['topic'].unique()
target

array(['Sales/Promotions', 'Shipping', 'Product Availability',
       'Product Specifications', 'Omnichannel', 'Product Comparison',
       'Returns & Refunds'], dtype=object)

In [55]:
# hold out 20% of the rows as a test set; the seed keeps the split repeatable
train, test = train_test_split(data, random_state=42, test_size=0.2)

# training features: every column except the label
train_X = train.drop(columns='topic')

In [56]:
# sanity check: (rows, columns) of the two partitions
train.shape, test.shape

((4000, 2), (1000, 2))

In [57]:
# Lets vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
# TF-IDF over unigrams and bigrams; preprocessing() supplies the tokens
vectorizer = TfidfVectorizer(
    tokenizer=preprocessing,
    stop_words=None,
    ngram_range=(1, 2),
    min_df=0.001,
    max_df=0.75,
    max_features=2500,
)

# learn the vocabulary and IDF weights from the training questions only
train_vectors = vectorizer.fit_transform(train_X['question'])

In [59]:
# transform (not fit_transform): reuse the vocabulary fitted on the training questions
test_vector = vectorizer.transform(test['question'])

In [63]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense TF-IDF matrix, one column per vocabulary term. The frame reuses train's index:
# train keeps the shuffled row labels from train_test_split, and pd.concat(axis=1) aligns
# on index, so a default 0..n-1 index would pair features with the wrong (or no) topic.
train_df = pd.DataFrame(train_vectors.toarray(), columns=feature_names, index=train.index)
train_df = pd.concat([train_df, train['topic']], axis=1)



## select (1) one multi-class classifier (e.g., Naive Bayes, Logistic, Decision Tree, SVM) whose code is provided in class handouts

In [64]:
# Lets use svm to train the model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [65]:
# multi-class support vector classifier with a linear kernel
model = SVC(
    C=1,
    kernel='linear',
    gamma='auto',
    probability=True,
)

# train on the TF-IDF matrix, using the topic column as labels
model.fit(X=train_vectors, y=train['topic'])

SVC(C=1, gamma='auto', kernel='linear', probability=True)

In [66]:
# label the held-out questions
pred = model.predict(test_vector)

# share of test questions whose predicted topic equals the true topic
accuracy_score(y_true=test['topic'], y_pred=pred)

0.952

In [70]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# Vectorize the raw question. The vectorizer already runs preprocessing() as its
# tokenizer, so calling preprocessing() here as well would stem and length-filter the
# text twice, which the training questions never went through.
question_vector = vectorizer.transform([question])

# predict the topic
model.predict(question_vector)

array(['Sales/Promotions'], dtype=object)

## (2) one ensemble classifier whose code is also provided (e.g., Random Forest, XGBoost)

In [71]:
# Lets use an ensemble classifier to improve the accuracy
from sklearn.ensemble import RandomForestClassifier

In [72]:
# random forest: 100 trees, each limited to depth 10, with a fixed seed
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)

# train on the TF-IDF matrix, using the topic column as labels
model.fit(X=train_vectors, y=train['topic'])

# label the held-out questions
pred = model.predict(test_vector)

# share of test questions whose predicted topic equals the true topic
accuracy_score(y_true=test['topic'], y_pred=pred)

0.847

In [73]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# Vectorize the raw question. The vectorizer already runs preprocessing() as its
# tokenizer, so calling preprocessing() here as well would stem and length-filter the
# text twice, which the training questions never went through.
question_vector = vectorizer.transform([question])

# predict the topic
model.predict(question_vector)

array(['Product Specifications'], dtype=object)

## (3) one other model of your choice whose code is NOT provided in class handouts (this will require some independent research on your part).

In [74]:
# Lets use MLP to train the model
from sklearn.neural_network import MLPClassifier

In [75]:
# multi-layer perceptron with three hidden layers of 100 units, trained with SGD
model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    solver='sgd',
    alpha=0.0001,
    max_iter=500,
    tol=0.000000001,
    random_state=21,
    verbose=10,
)

# train on the TF-IDF matrix, using the topic column as labels
model.fit(X=train_vectors, y=train['topic'])

# label the held-out questions
pred = model.predict(test_vector)

# share of test questions whose predicted topic equals the true topic
accuracy_score(y_true=test['topic'], y_pred=pred)

Iteration 1, loss = 1.93341939
Iteration 2, loss = 1.93116736
Iteration 3, loss = 1.92905598
Iteration 4, loss = 1.92700277
Iteration 5, loss = 1.92535041
Iteration 6, loss = 1.92386400
Iteration 7, loss = 1.92258336
Iteration 8, loss = 1.92146448
Iteration 9, loss = 1.92047054
Iteration 10, loss = 1.91960774
Iteration 11, loss = 1.91878748
Iteration 12, loss = 1.91812209
Iteration 13, loss = 1.91738677
Iteration 14, loss = 1.91674659
Iteration 15, loss = 1.91619460
Iteration 16, loss = 1.91559599
Iteration 17, loss = 1.91506802
Iteration 18, loss = 1.91451537
Iteration 19, loss = 1.91399562
Iteration 20, loss = 1.91348242
Iteration 21, loss = 1.91298342
Iteration 22, loss = 1.91247628
Iteration 23, loss = 1.91196029
Iteration 24, loss = 1.91145128
Iteration 25, loss = 1.91095379
Iteration 26, loss = 1.91044309
Iteration 27, loss = 1.90992209
Iteration 28, loss = 1.90940895
Iteration 29, loss = 1.90889486
Iteration 30, loss = 1.90839753
Iteration 31, loss = 1.90785283
Iteration 32, los



0.955

In [76]:
# Lets use the model to predict the topic of a question
question = "I'm really interested in your special deals. Do you have any offers?"

# Vectorize the raw question. The vectorizer already runs preprocessing() as its
# tokenizer, so calling preprocessing() here as well would stem and length-filter the
# text twice, which the training questions never went through.
question_vector = vectorizer.transform([question])

# predict the topic
model.predict(question_vector)

array(['Sales/Promotions'], dtype='<U22')

## For each classifier, use four kinds of input feature vectors: (1) TF-IDF vector of tokenized words, (2) TF-IDF vector of n-grams (of range 4-5), (3) word vectors (GloVe, Word2Vec, or FastText), and (4) document vectors (Doc2Vec). Train each model using training data, report classification metrics using test data, and summarize the results of all models in a nicely formatted table. Comment on which model will be your preferred choice for text classification for this data and why.