In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time

from tqdm import tqdm
from string import punctuation
import seaborn as sns

import nltk
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

## Loading

In [5]:
data = pd.read_csv('train.csv')
data = data.iloc[:,1:len(data.columns)]
print(data.shape)
data.head()

(6744, 3)


Unnamed: 0,title,description,class
0,Samsung 32-inch Curved LED Monitor (Ultra- Sli...,1800R curved monitor with 3000:1 contrast rati...,3
1,HotHands Body & Hand Super Warmers - Long Last...,"Bring the Heat!,HotHands Body & Hand Super War...",2
2,"WePet Cat Litter Mat, Kitty Litter Trapping Ma...",,1
3,All-new Kindle Paperwhite Water-Safe Fabric Co...,,3
4,Street Fighter 30th Anniversary Collection - P...,Celebrate the 30th Anniversary of the iconic S...,0


## Preprocessing : cleaning & lemmatizing

In [6]:
lemma = WordNetLemmatizer()
stopW = stopwords.words('english')
exclude = set(punctuation)
stopW.extend(exclude)

def preprocess(sent):
    tokens = word_tokenize(sent.lower())
    tokens = [word for word in tokens if word not in stopW] #remove stop words and ponctuation
    tokens = [lemma.lemmatize(lemma.lemmatize(lemma.lemmatize(w, 'v'), 'a'), 'n') for w in tokens] #lemmatization
    return ' '.join(tokens)

In [7]:
data['description'] = data.description.fillna(' ')
data['title'] = data.title.fillna(' ')

data['description'] = data.description.apply(lambda sent : preprocess(sent))
data['title'] = data.title.apply(lambda sent : preprocess(sent))

data["full_text"] = data["title"] + data["description"]

data.head()

Unnamed: 0,title,description,class,full_text
0,samsung 32-inch curve lead monitor ultra- slim...,1800r curve monitor 3000:1 contrast ratio prov...,3,samsung 32-inch curve lead monitor ultra- slim...
1,hothands body hand super warmer long last safe...,bring heat hothands body hand super warmer sin...,2,hothands body hand super warmer long last safe...
2,wepet cat litter mat kitty litter trap mat hon...,,1,wepet cat litter mat kitty litter trap mat hon...
3,all-new kindle paperwhite water-safe fabric co...,,3,all-new kindle paperwhite water-safe fabric co...
4,street fighter 30th anniversary collection pla...,celebrate 30th anniversary iconic street fight...,0,street fighter 30th anniversary collection pla...


## Tf-idf

In [8]:
t1 = time.time()
vect = TfidfVectorizer(stop_words='english', analyzer='word')
tfidf_mat = vect.fit_transform(data.full_text)

feature_names = vect.get_feature_names()
dense = tfidf_mat.todense()
denselist = dense.tolist()
data2 = pd.DataFrame(denselist, columns = feature_names)

print('time = ' + str(np.round(time.time()-t1, 2)) + ' s.')
print(data2.shape)
data2.head()

time = 7399.95 s.
(6744, 30846)


Unnamed: 0,00,000,0000,000000,00000123,00001,00001sculpt,00003turn,000119,0004,...,①ps4,①state,①thanks,②3,②if,③if,ﬁghting,ﬁrst,ﬁsh,ﬂuid
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Training & testing

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=50)

In [10]:
#Evaluating
cv_results = cross_validate(clf, data2, data['class'], cv=3, scoring='accuracy')
print(sorted(cv_results.keys()))
print('time = ' + str(np.round(cv_results['fit_time'] + cv_results['score_time'], 2)) + ' s.')
print('Accuracies : ' + str(np.round(cv_results['test_score'], 2)))

['fit_time', 'score_time', 'test_score']
time = [26.94 25.54 24.42] s.
Accuracies : [0.93 0.92 0.94]


## Kaggle prediction

In [17]:
#Loading test set
test_data = pd.read_csv('test.csv')
print(test_data.shape)
test_data.head()

(1686, 3)


Unnamed: 0,id,title,description
0,0,PDP 048-121-NA Afterglow Wired Controller for ...,Enhance your gaming experience with the Afterg...
1,1,Gold Medal Groomers Ear Powder (30 Grams),"Great for relieving irritated itchy ears, Groo..."
2,2,Sports Research Sweet Sweat Premium Waist Trim...,
3,3,Gildan Men's Fleece Open Bottom Pocketed Pant,Gildan is one of the world's largest verticall...
4,4,Sennheiser GAME ZERO Gaming Headset- Black,The new closed back G4ME ZERO headset is ideal...


In [18]:
#Preprocessing test set
test_data['description'] = test_data.description.fillna(' ')
test_data['title'] = test_data.title.fillna(' ')

test_data['description'] = test_data.description.apply(lambda sent : preprocess(sent))
test_data['title'] = test_data.title.apply(lambda sent : preprocess(sent))

test_data["full_text"] = test_data["title"] + test_data["description"]

test_data.head()

Unnamed: 0,id,title,description,full_text
0,0,pdp 048-121-na afterglow wire controller xbox ...,enhance game experience afterglow wire control...,pdp 048-121-na afterglow wire controller xbox ...
1,1,gold medal groomers ear powder 30 gram,great relieve irritate itchy ear groomers ear ...,gold medal groomers ear powder 30 gramgreat re...
2,2,sport research sweet sweat premium waist trim ...,,sport research sweet sweat premium waist trim ...
3,3,gildan men 's fleece open bottom pocket pant,gildan one world 's large vertically integrate...,gildan men 's fleece open bottom pocket pantgi...
4,4,sennheiser game zero game headset- black,new close back g4me zero headset ideal immersi...,sennheiser game zero game headset- blacknew cl...


In [19]:
#Using the tfidf transformation created with the training set
t1 = time.time()

test_tfidf_mat = vect.transform(test_data['full_text'])

test_dense = test_tfidf_mat.todense()
test_denselist = test_dense.tolist()
test_data2 = pd.DataFrame(test_denselist, columns = feature_names)

print('time = ' + str(np.round(time.time()-t1, 2)) + ' s.')
print(test_data2.shape)
test_data2.head()

time = 51.73 s.
(1686, 30846)


Unnamed: 0,00,000,0000,000000,00000123,00001,00001sculpt,00003turn,000119,0004,...,①ps4,①state,①thanks,②3,②if,③if,ﬁghting,ﬁrst,ﬁsh,ﬂuid
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#Training a model on the full tfidf matrix of the training dataset and predicting on the test one
clf = RandomForestClassifier(n_estimators=50)
clf.fit(data2, data['class'])
y_pred2 = clf.predict(test_data2)

In [21]:
#Saving the results in the required format
prediction2 = pd.DataFrame({'class' : y_pred2})
prediction2.index.name = "id"
prediction2.to_csv('submissions/submission_1.csv', index=True)
prediction2.head()

Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
0,0
1,2
2,2
3,2
4,0


We obtain a public score of 0.93950 in the Kaggle competition.  
Having 4 classes, a random classifier would score 0.25.