In [1]:
from __future__ import division
import pandas as pd 
import numpy as np
import gensim
import boto3
import re
import json
import os
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
import string
from sklearn import linear_model, datasets
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import precision_recall_fscore_support
import nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df=pd.read_csv('rdc-catalog-train.tsv',delimiter='\t',encoding='utf-8',header=None)
df.columns = ['text', 'label']

#Isolate target data
X = df["text"].as_matrix().tolist()
X = np.hstack(X)
y = df["label"].as_matrix().tolist()
y = np.hstack(y)

In [3]:
def train_full(classifier, X, y):
    print("X:")
    print(len(X))
    print("y:")
    print(len(y))
    classifier.fit(X, y)
    return classifier


def train_test(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    print("X_train:")
    print(len(X_train))
    print("X_test:")
    print(len(X_test))
    print("y_train:")
    print(len(y_train))
    print("y_test:")
    print(len(y_test))
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
    print "Accuracy: %s" % classifier.score(X_test, y_test)
    print "Precision: %s" % weighted_p
    print "Recall: %s" % weighted_r
    print "F1-Score: %s" % weighted_f1
    return classifier

In [4]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

def lemmatization_tokenizer(text):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(text)]

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def standardize_metrics(text):
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', 'metricV', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', 'metricA', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', 'metricAh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', 'metricGb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', 'metricOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', 'metricFlOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', 'metricCwt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', 'metricHz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', 'metricWh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', 'metricW', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', 'metricMfd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', 'metricFt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', 'metricCm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', 'metricMm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', 'metricKm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', 'metricM', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', 'metricCell', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', 'metricLb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', 'metricPc', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', 'metricGal', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', 'metricL', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', 'metricMl', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', 'metricKg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', 'metricG', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', 'metricMg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', 'metricSq', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', 'metricPt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', 'metricOhm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', 'metricFz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', 'metricCt', text)
    text = re.sub(r'\b[\d]+p\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x\b', 'metricX', text)
    return text

def clean_text_standard_metrics_v0plus(text):
    text = text.lower()
    text = standardize_metrics(text)
    text = re.sub(r'\b\d*\.\d+\b', 'nbDec', text)
    text = re.sub(r'\b\d+\/\d+\b', 'nbFra', text)
    text = re.sub(r'\b\d+\b', 'nbNat', text)
#    text = re.sub(r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', 'alphaNum', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def clean_text_standardisation(text):
    text = text.lower()
    text = re.sub(r'\bw\/o','without ',text)
    text = re.sub(r'\bw\/','with ',text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', 'packUnit', text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', 'packUnit', text)
    text = re.sub(r'\bpa?cks?(\s|-|\sof\s|)\d+\b', 'packUnit', text)
    text = re.sub(r'\bsets?(\s|-|\sof\s|)\d+\b', 'packUnit', text)
    text = standardize_metrics(text)
#    text = re.sub(r'\b\d*\.\d+\b', 'nbDec', text)
#    text = re.sub(r'\b\d+\/\d+\b', 'nbFra', text)
#    text = re.sub(r'\b\d+\b', 'nbNat', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [6]:
#TEST 26: lemmatization + clean_text_standardisation preprocessor + sublinear + single letter + min_df 2
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_full(clf, X, y)
print("finished train")

start train...
X:
800000
y:
800000
[LibLinear]finished train


In [14]:
df=pd.read_csv('./rdc-catalog-test.tsv',delimiter='\t',encoding='utf-8')
df.head()



Unnamed: 0,Title,CategoryIdPath
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>3813
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [15]:
clf.predict(df.values[2])

array([u'2199>4592>12', u'2199>4592>12'], dtype='<U39')

In [18]:
df = df.drop(['CategoryIdPath'],axis=1)
df.values.shape

ValueError: labels ['CategoryIdPath'] not contained in axis

In [19]:
df.head()

Unnamed: 0,Title
0,Sterling Silver Dangle Ball Earrings w/ Brilli...
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...
2,Disc Brake Rotor-Advanced Technology Rear Rayb...
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...


In [20]:
values=np.hstack(df.values)
predictions=clf.predict(values)
df.insert(1,'CategoryIdPath',[pred for pred in predictions])
df.head()

Unnamed: 0,Title,CategoryIdPath
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>2878
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [21]:
df.to_csv("submission_test_stdz2.tsv", sep='\t', encoding='utf-8',index=False,header=False)
sub_df=pd.read_csv("submission_test_stdz2.tsv",delimiter='\t',encoding='utf-8',header=None)
sub_df.head()

Unnamed: 0,0,1
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>2878
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [None]:
#TEST 26: lemmatization + clean_text_standardisation preprocessor + sublinear + single letter + min_df 2 + tri gram
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 3), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

In [None]:
#TEST 26: lemmatization + clean_text_standardisation preprocessor + sublinear + single letter + min_df 2 + quadri gram
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 4), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

In [None]:
#TEST 26: lemmatization + clean_text_standardisation preprocessor + sublinear + single letter + min_df 2 + quadri gram
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 5), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")