In [305]:
import pandas as pd
import numpy as np

In [306]:
df = pd.read_csv('testset_C.csv', delimiter=';')

print(df.head())


         id     productgroup  \
0  26229701  WASHINGMACHINES   
1  16576864       USB MEMORY   
2  26155618       USB MEMORY   
3  25646138         BICYCLES   
4  19764614         BICYCLES   

                                           main_text  \
0                                          WAQ284E25   
1  LEEF IBRIDGE MOBILE SPEICHERERWEITERUNG FUER I...   
2                     SANDISK 32GB ULTRA FIT USB 3.0   
3  HOLLANDRAD DAMEN 28 ZOLL TUSSAUD 3-GAENGE RH 5...   
4                   DAHON SPEED D7 SCHWARZ ? FALTRAD   

                                   add_text         manufacturer  
0                            WASCHMASCHINEN                BOSCH  
1  PC__1100COMPUTINGMEMORY__1110MEMORYCARDS                 LEEF  
2                                     W1370                  NaN  
3             FAHRRAEDER // SPORTFAHRRAEDER  SCHALOW & KROH GMBH  
4          SPORTS__30000WHEELED__30070BIKES                DAHON  


In [307]:
print(df.productgroup.unique())


['WASHINGMACHINES' 'USB MEMORY' 'BICYCLES' 'CONTACT LENSES']


In [308]:
print(df.shape)
print(np.sum(df.isna()))

(8000, 5)
id                 0
productgroup       0
main_text          2
add_text           0
manufacturer    1344
dtype: int64


In [309]:
df.productgroup.value_counts(normalize=True)

WASHINGMACHINES    0.25
CONTACT LENSES     0.25
BICYCLES           0.25
USB MEMORY         0.25
Name: productgroup, dtype: float64

In [310]:
print(f'the shape of df after dropping the nan values is {df.dropna().shape}')
df.dropna().productgroup.value_counts(normalize=True)

the shape of df after dropping the nan values is (6655, 5)


CONTACT LENSES     0.265515
USB MEMORY         0.258753
WASHINGMACHINES    0.239820
BICYCLES           0.235913
Name: productgroup, dtype: float64

So eventhough the value count of productgroup after dropna is still acceptable, but since total number of nan in manufacturor is around 17% of the whole dataset, I prefer to keep them and simply fill NAN manufacturors with an empty string

In [311]:
df = df.fillna(value='')

In [397]:
df['text'] = df['main_text'] + ' ' + df['add_text'] + ' ' + df['manufacturer']

X = df[['text']]
Y = df['productgroup']

print(f'Y.shape is: {Y.shape}')
print(f'X.shape is: {X.shape}')
print(X.head())


Y.shape is: (8000,)
X.shape is: (8000, 1)
                                                text
0                     WAQ284E25 WASCHMASCHINEN BOSCH
1  LEEF IBRIDGE MOBILE SPEICHERERWEITERUNG FUER I...
2              SANDISK 32GB ULTRA FIT USB 3.0 W1370 
3  HOLLANDRAD DAMEN 28 ZOLL TUSSAUD 3-GAENGE RH 5...
4  DAHON SPEED D7 SCHWARZ ? FALTRAD SPORTS__30000...


In [398]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



In [399]:
descriptive_features_pipeline = Pipeline(steps=
                                       [
                                           ('CountVectorizer', CountVectorizer()),
                                           ('Tfidf', TfidfTransformer())                                           
                                       ]
                                      )

preprocessing_pipeline = ColumnTransformer(transformers=
                                           [
                                               ('num',descriptive_features_pipeline, 'text')
                                           ]
                                          )

In [418]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline(steps=
                [
                    ('preprocessor', preprocessing_pipeline),
                    ('classifier', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None))
                    #('classifier', MultinomialNB())                    
                ]
               )


In [405]:
from sklearn import set_config
set_config(display='diagram')
# diplays HTML representation in a jupyter context
pipe

In [408]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)#, stratify=Y)



In [412]:

pipe.fit(x_train, y_train)
print("model score: %.3f \n" % pipe.score(x_test, y_test))


model score: 0.998 



In [417]:
from sklearn import metrics

predicted = pipe.predict(x_test)
np.mean(predicted == y_test)

print(metrics.classification_report(y_test, predicted, target_names=df.productgroup.unique()))

                 precision    recall  f1-score   support

WASHINGMACHINES       1.00      1.00      1.00       409
     USB MEMORY       0.99      1.00      1.00       382
       BICYCLES       1.00      0.99      1.00       397
 CONTACT LENSES       1.00      1.00      1.00       412

       accuracy                           1.00      1600
      macro avg       1.00      1.00      1.00      1600
   weighted avg       1.00      1.00      1.00      1600



In [416]:
metrics.confusion_matrix(y_test, predicted)

array([[408,   1,   0,   0],
       [  0, 382,   0,   0],
       [  0,   2, 395,   0],
       [  0,   0,   0, 412]])