In [1]:
import pandas as pd

# Preprocessing data

In [2]:
#Read Training Data
train = pd.read_csv('Dataset/train.csv')
test = pd.read_csv('Dataset/test.csv')

In [3]:
train.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Cha...,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /N...,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real ...,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Coloni...,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labo...,CLASS-1376


In [4]:
#Data Size
len(train)

5566

In [5]:
out_Inv_Id = test['Inv_Id']

In [6]:
len(test)

2446

In [7]:
train.nunique()

Inv_Id              5566
Vendor_Code         1253
GL_Code                9
Inv_Amt             4258
Item_Description    5558
Product_Category      36
dtype: int64

In [8]:
#Check for missing values 
train.isna().sum()

Inv_Id              0
Vendor_Code         0
GL_Code             0
Inv_Amt             0
Item_Description    0
Product_Category    0
dtype: int64

In [9]:
#Increase the dimension of data and Convert to numeric data
def processData(data):
    data['Vendor_Code'] = data['Vendor_Code'].str.split("-", n = 0, expand = True)[1]
    data['GL_Code'] = data['GL_Code'].str.split("-", n = 0, expand = True)[1]
    data['Item_Description'] = data['Item_Description'].str.replace("/"," ")
    data.drop(columns =["Inv_Id"], inplace = True) 
    return data

In [10]:
target = train['Product_Category'].str.split("-", n = 0, expand = True)[1]
train.drop(columns =["Product_Category"], inplace = True)

In [11]:
train = processData(train)
test = processData(test)

In [12]:
train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,1676,6100410,83.24,Artworking Typesetting Production Jun 2009 Cha...
1,1883,2182000,51.18,Auto Leasing Corporate Services Corning Inc N...
2,1999,6050100,79.02,Store Management Lease Rent Deltona Corp Real ...
3,1771,6101400,48.5,Store Construction General Requirements Coloni...
4,1331,2182000,63.35,Jul 2015 Aydin Corp Contingent Labor Temp Labo...


In [13]:
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,2513,6050310,56.13,Travel and Entertainment Miscellaneous Company...
1,1044,6101400,96.56,Final Site Clean Up Store Construction Advance...
2,1254,6101400,55.93,Arabian American Development Co Final Site Cle...
3,1331,2182000,32.62,Corporate Services Contingent Labor Temp Labor...
4,2513,6050310,25.81,Fortune National Corp Miscellaneous Company Ca...


In [14]:
train.nunique()

Vendor_Code         1253
GL_Code                9
Inv_Amt             4258
Item_Description    5558
dtype: int64

In [15]:
train.nunique()

Vendor_Code         1253
GL_Code                9
Inv_Amt             4258
Item_Description    5558
dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
# from sklearn.preprocessing import StandardScaler

In [17]:
Vendor_Code_Encoder = LabelEncoder()
GL_Code_Encoder = LabelEncoder()
Product_Category_Encoder = LabelEncoder()
hashV = HashingVectorizer(n_features=150)

In [18]:
def dataEncoderfit(data):
    GL_Code_Encoder.fit(data['GL_Code'])
    Vendor_Code_Encoder.fit(data['Vendor_Code'])
    hashV.fit(data['Item_Description'])

In [19]:
def dataEncoder(data):
    data['GL_Code'] = GL_Code_Encoder.transform(data['GL_Code'])
    data['Vendor_Code'] = Vendor_Code_Encoder.transform(data['Vendor_Code'])
    ret = pd.DataFrame.from_records(hashV.transform(data['Item_Description']).toarray())
    data.drop(columns =["Item_Description"], inplace = True) 
    return data.join(ret)

In [20]:
total_data_for_encoding = train.append(test)
dataEncoderfit(total_data_for_encoding)

In [21]:
train = dataEncoder(train)

In [22]:
train.nunique()

Vendor_Code    1253
GL_Code           9
Inv_Amt        4258
0                47
1                31
2                36
3                47
4                34
5                20
6                52
7                33
8                26
9                42
10               31
11               32
12               24
13               23
14               43
15               36
16               16
17               25
18               58
19               30
20               31
21               18
22               26
23               27
24               43
25               43
26               28
               ... 
120              67
121              40
122              63
123              22
124               9
125              43
126              42
127              39
128              28
129              36
130              19
131              38
132              43
133              31
134              54
135              62
136              28
137              20
138              21


In [23]:
len(train)

5566

In [24]:
train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,0,1,2,3,4,5,6,...,140,141,142,143,144,145,146,147,148,149
0,676,5,83.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,883,0,51.18,0.0,0.0,-0.408248,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.204124,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,999,2,79.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.27735,0.0,0.0,0.0,-0.27735,0.0
3,771,7,48.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.458831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,331,0,63.35,0.0,0.0,0.0,0.0,0.0,0.0,0.176777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0


In [25]:
target.name = "Product_Category"
target.to_frame()

Unnamed: 0,Product_Category
0,1963
1,1250
2,1274
3,1522
4,1376
5,1522
6,1758
7,1522
8,1963
9,1274


In [26]:
target = Product_Category_Encoder.fit_transform(target)

In [27]:
import numpy as np
# np.unique(target,return_counts=True,axis=0)

In [28]:
num_class = len(np.unique(target))

In [29]:
target

array([25,  2,  3, ..., 14, 12, 15])

# Test Train Split among Train data

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.001,random_state=1)

# Model Building

In [31]:
import lightgbm as lgb

In [32]:
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)

In [33]:
#Hyperparameters

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class':num_class,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 20
}

In [34]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=50,
                verbose_eval = True
               )

[1]	valid_0's multi_logloss: 1.55372
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 1.43961
[3]	valid_0's multi_logloss: 1.33997
[4]	valid_0's multi_logloss: 1.25136
[5]	valid_0's multi_logloss: 1.17081
[6]	valid_0's multi_logloss: 1.09802
[7]	valid_0's multi_logloss: 1.03126
[8]	valid_0's multi_logloss: 0.970092
[9]	valid_0's multi_logloss: 0.913296
[10]	valid_0's multi_logloss: 0.860763
[11]	valid_0's multi_logloss: 0.812275
[12]	valid_0's multi_logloss: 0.767065
[13]	valid_0's multi_logloss: 0.724674
[14]	valid_0's multi_logloss: 0.685177
[15]	valid_0's multi_logloss: 0.648274
[16]	valid_0's multi_logloss: 0.61361
[17]	valid_0's multi_logloss: 0.581202
[18]	valid_0's multi_logloss: 0.550575
[19]	valid_0's multi_logloss: 0.521805
[20]	valid_0's multi_logloss: 0.494614
[21]	valid_0's multi_logloss: 0.468965
[22]	valid_0's multi_logloss: 0.444826
[23]	valid_0's multi_logloss: 0.421965
[24]	valid_0's multi_logloss: 0.400349
[25]	valid_0's mult

[202]	valid_0's multi_logloss: 5.34059e-05
[203]	valid_0's multi_logloss: 5.08092e-05
[204]	valid_0's multi_logloss: 4.83523e-05
[205]	valid_0's multi_logloss: 4.6055e-05
[206]	valid_0's multi_logloss: 4.38116e-05
[207]	valid_0's multi_logloss: 4.16784e-05
[208]	valid_0's multi_logloss: 3.9651e-05
[209]	valid_0's multi_logloss: 3.77272e-05
[210]	valid_0's multi_logloss: 3.58977e-05
[211]	valid_0's multi_logloss: 3.41495e-05
[212]	valid_0's multi_logloss: 3.2497e-05
[213]	valid_0's multi_logloss: 3.09155e-05
[214]	valid_0's multi_logloss: 2.94251e-05
[215]	valid_0's multi_logloss: 2.79993e-05
[216]	valid_0's multi_logloss: 2.66511e-05
[217]	valid_0's multi_logloss: 2.54031e-05
[218]	valid_0's multi_logloss: 2.41814e-05
[219]	valid_0's multi_logloss: 2.3051e-05
[220]	valid_0's multi_logloss: 2.1942e-05
[221]	valid_0's multi_logloss: 2.09252e-05
[222]	valid_0's multi_logloss: 1.99595e-05
[223]	valid_0's multi_logloss: 1.90173e-05
[224]	valid_0's multi_logloss: 1.81165e-05
[225]	valid_0's 

In [35]:
prediction = gbm.predict(X_test)

In [36]:
y_pred = [np.argmax(pred) for pred in prediction]

In [37]:
y_pred == y_test

array([ True,  True,  True,  True,  True,  True])

In [38]:
from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix

In [39]:
print("Accuracy : "+str(accuracy_score(y_test, y_pred)))

Accuracy : 1.0


In [40]:
# confusion_matrix(y_test, y_pred)

# Test Data

In [41]:
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,2513,6050310,56.13,Travel and Entertainment Miscellaneous Company...
1,1044,6101400,96.56,Final Site Clean Up Store Construction Advance...
2,1254,6101400,55.93,Arabian American Development Co Final Site Cle...
3,1331,2182000,32.62,Corporate Services Contingent Labor Temp Labor...
4,2513,6050310,25.81,Fortune National Corp Miscellaneous Company Ca...


In [42]:
test.isna().sum()

Vendor_Code         0
GL_Code             0
Inv_Amt             0
Item_Description    0
dtype: int64

In [43]:
len(test)

2446

In [44]:
test = dataEncoder(test)

In [45]:
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,0,1,2,3,4,5,6,...,140,141,142,143,144,145,146,147,148,149
0,1513,3,56.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,44,7,96.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.229416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,254,7,55.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.235702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,331,0,32.62,0.0,0.0,0.0,0.0,0.0,0.0,0.176777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1513,3,25.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.0,0.0,0.0


In [46]:
actual_test_prediction = gbm.predict(test)

In [47]:
#probability of classification
actual_test_prediction

array([[2.12555766e-09, 1.47574536e-08, 2.33749818e-08, ...,
        1.80717485e-09, 2.80203574e-09, 6.30855081e-09],
       [3.76372959e-09, 2.19466136e-08, 3.47158131e-08, ...,
        1.57976046e-09, 4.07393616e-09, 9.39351357e-09],
       [2.90240840e-09, 2.06470660e-08, 3.26605081e-08, ...,
        1.63950828e-09, 3.85487350e-09, 8.83421785e-09],
       ...,
       [1.41124105e-08, 4.18501838e-08, 9.99999159e-01, ...,
        3.01232345e-09, 7.81627432e-09, 1.84602806e-08],
       [8.18596946e-09, 5.81245015e-08, 9.19479992e-08, ...,
        4.18309158e-09, 1.09109006e-08, 2.48708236e-08],
       [1.69105050e-08, 9.62055792e-08, 1.53948456e-07, ...,
        6.87325492e-09, 1.85740140e-08, 4.10497991e-08]])

In [48]:
encoded_actual_test_prediction = [np.argmax(pred) for pred in actual_test_prediction]

In [49]:
inverse_transformed_prediction = Product_Category_Encoder.inverse_transform(encoded_actual_test_prediction)

In [50]:
data = {'Inv_Id': out_Inv_Id,'Product_Category':inverse_transformed_prediction}
output = pd.DataFrame(data) 
output['Product_Category'] = 'CLASS-'+output['Product_Category'].astype(str)

In [51]:
output.to_csv(r'Prediction_Output.csv', index=False)