<b>This project is about deploying various predicting models to classify customer-saler questions into different categories<b>
    

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

In [3]:
#Load the dataset
df = pd.read_csv("question_topic.csv")

In [35]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,question_text,question_topic
0,0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,1,I'm going to be out of the country for about a...,Shipping
2,2,I was wondering if you'd be able to overnight ...,Shipping
3,3,The Swingline electronic stapler (472555) look...,Shipping
4,4,I think this cosmetic bag would work great for...,Shipping
5,5,I'm going to be out of the state for about a w...,Shipping
6,6,I'm going to be out of the state for about a w...,Shipping
7,7,The Stanley Bostitch electronic stapler (63460...,Shipping
8,8,When is the estimated delivery date if I was t...,Shipping
9,9,I was wondering if you'd be able to overnight ...,Shipping


In [5]:
df.shape

(5000, 3)

In [6]:
%timeit set(df["question_topic"])

158 µs ± 907 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [7]:
%timeit df["question_topic"].unique()

151 µs ± 421 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
%timeit df["question_topic"].value_counts()

162 µs ± 825 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
from collections import Counter
%timeit Counter(df["question_topic"])

262 µs ± 4.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
#pre-processing
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [11]:
df.columns


Index(['Unnamed: 0', 'question_text', 'question_topic'], dtype='object')

In [12]:
#train test split
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["question_topic"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [13]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [14]:
#pipeline of feature engineering and model
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [15]:
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

In [16]:
gs_clf_svm = GridSearchCV(model, parameters, cv=5,n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.9683999999999999
{'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [17]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [18]:
#fit model with training data
model.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced')))])

In [19]:
#evaluation on test data
pred = model.predict(X_test)

In [20]:
model.classes_

array(['Omnichannel', 'Product Availability', 'Product Comparison',
       'Product Specifications', 'Returns & Refunds', 'Sales/Promotions',
       'Shipping'], dtype='<U22')

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[128,   0,   0,   0,   0,   0,   0],
       [  0, 252,   0,   5,   0,   5,   0],
       [  0,   0, 223,   2,   0,   0,   0],
       [  0,   1,   6, 254,   0,   1,   0],
       [  0,   0,   0,   0, 230,   1,   0],
       [  0,   0,   0,   0,   0, 146,   0],
       [  2,   0,   0,   0,   0,   0, 244]])

In [22]:
accuracy_score(y_test, pred)

0.9846666666666667

In [23]:
#save the model
import joblib
joblib.dump(model, 'model_question_topic.pkl', compress=1)

['model_question_topic.pkl']

# Deployment

In [24]:
import joblib
model = joblib.load('model_question_topic.pkl')

In [25]:
question = "I am searching for an iphone"

In [26]:
model.predict([question])[0]

'Product Availability'

In [33]:
question = "When does my camera going to arrive"

In [34]:
model.predict([question])[0]

'Shipping'