In [1]:
import pandas as pd

# Preprocessing data

In [2]:
# Read the training and the test data from CSV files in the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,1,VENDOR-61,GL-6050100,6.973473,AETNA VARIABLE FUND - Apr-2002 - Store Managem...,CLASS-784
1,2,VENDOR-61,GL-6050100,25.053841,AETNA VARIABLE FUND - Nov-2000 - Store Managem...,CLASS-784
2,3,VENDOR-449,GL-6050100,53.573737,FAIRCHILD CORP - Nov-2001 - Store Management R...,CLASS-784
3,4,VENDOR-682,GL-6050100,67.388827,CALIFORNIA REAL ESTATE INVESTMENT TRUST - Aug-...,CLASS-784
4,5,VENDOR-682,GL-6050100,74.262047,CALIFORNIA REAL ESTATE INVESTMENT TRUST - Mar-...,CLASS-784


In [4]:
# Number of rows in the training data
len(train)

5719

In [5]:
# Number of rows in the test data
len(test)

2292

In [6]:
# Count of distinct values per raw column (cardinality of each feature and of the target)
train.nunique()

Inv_Id              5719
Vendor_Code         1313
GL_Code                9
Inv_Amt             5719
Item_Description    5118
Product_Category      38
dtype: int64

In [7]:
# Count missing values per column in the training data
train.isna().sum()

Inv_Id              0
Vendor_Code         0
GL_Code             0
Inv_Amt             0
Item_Description    0
Product_Category    0
dtype: int64

In [8]:
# Split the composite text columns into separate, simpler feature columns
def processData(data):
    """Return a processed copy of ``data`` with its composite columns split up.

    Steps applied to the copy:

    * ``Vendor_Code`` and ``GL_Code`` keep only the second "-"-separated
      segment (e.g. "VENDOR-61" -> "61"); the values remain strings.
    * ``Item_Description`` is split on " - " into at most three parts, which
      become ``Company``, a "<Month>-<Year>" token, and ``Description``.
      Only the month part of the middle token is kept (as ``Month``); the
      year is discarded.
    * ``Item_Description`` and ``Inv_Id`` are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``Inv_Id``, ``Vendor_Code``, ``GL_Code`` and
        ``Item_Description``. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.
    """
    # Work on a copy so the caller's frame stays intact and the function can be
    # called again on the same input without failing.
    processed = data.copy()

    processed['Vendor_Code'] = processed['Vendor_Code'].str.split("-", n = 0, expand = True)[1]
    processed['GL_Code'] = processed['GL_Code'].str.split("-", n = 0, expand = True)[1]

    # n = 2 caps the split at three parts, so any " - " inside the trailing
    # description text is preserved.
    description_parts = processed["Item_Description"].str.split(" - ", n = 2, expand = True)
    processed["Company"] = description_parts[0]
    processed["Description"] = description_parts[2]

    # The middle part looks like "Apr-2002"; only the month is used as a feature.
    month_year = description_parts[1].str.split("-", n = 1, expand = True)
    processed["Month"] = month_year[0]

    return processed.drop(columns = ["Item_Description", "Inv_Id"])

In [9]:
# Apply the same column splitting to both data sets.
# NOTE: `train` and `test` are rebound to the processed frames, so re-running
# this cell without reloading the CSVs will fail (the raw columns are gone).
train = processData(train)
test = processData(test)

In [10]:
# Remove the label column from the feature frame and keep only the part after
# the "-" (e.g. "CLASS-784" -> "784") as the target series.
target = train.pop("Product_Category").str.split("-", n = 0, expand = True)[1]

In [11]:
# Preview the training features after the composite columns were split
train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,61,6050100,6.973473,AETNA VARIABLE FUND,Store Management Real Estate Real Estate Servi...,Apr
1,61,6050100,25.053841,AETNA VARIABLE FUND,Store Management Real Estate Real Estate Servi...,Nov
2,449,6050100,53.573737,FAIRCHILD CORP,Store Management Real Estate Real Estate Servi...,Nov
3,682,6050100,67.388827,CALIFORNIA REAL ESTATE INVESTMENT TRUST,Store Management Real Estate Real Estate Servi...,Aug
4,682,6050100,74.262047,CALIFORNIA REAL ESTATE INVESTMENT TRUST,Store Management Real Estate Real Estate Servi...,Mar


In [12]:
# Preview the test features after the same processing
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,1197,6050100,10.916343,DESOTO INC,Store Management Real Estate Real Estate Servi...,Jul
1,792,6050100,38.658772,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Nov
2,792,6050100,46.780476,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Jan
3,792,6050100,7.058866,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Sep
4,792,6050100,32.931765,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Nov


In [13]:
# Distinct values per processed feature column.
# NOTE(review): Description has 38 distinct values, the same number as the
# Product_Category target had before it was removed — possibly a one-to-one
# mapping to the label (target leakage); confirm.
train.nunique()

Vendor_Code    1313
GL_Code           9
Inv_Amt        5719
Company        1341
Description      38
Month            12
dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import StandardScaler

In [15]:
# One label encoder per categorical column, plus one for the target.
# They are module-level so the fit and transform helpers can share them.
Vendor_Code_Encoder = LabelEncoder()
GL_Code_Encoder = LabelEncoder()
Company_Encoder = LabelEncoder()
Month_Encoder = LabelEncoder()
# NOTE: year_Encoder is only referenced from commented-out lines in this notebook.
year_Encoder = LabelEncoder()
Description_Encoder = LabelEncoder()
Product_Category_Encoder = LabelEncoder()
# Inv_Amt_Scalar = StandardScaler()

In [16]:
def dataEncoderfit(data):
    """Fit the module-level label encoders on the matching columns of ``data``.

    The GL_Code, Vendor_Code, Company, Month and Description encoders are
    fitted, in that order. Nothing is returned; the encoders themselves are
    updated. Year encoding and Inv_Amt scaling are not performed here.
    """
    encoder_for_column = (
        ('GL_Code', GL_Code_Encoder),
        ('Vendor_Code', Vendor_Code_Encoder),
        ('Company', Company_Encoder),
        ('Month', Month_Encoder),
        ('Description', Description_Encoder),
    )
    for column_name, encoder in encoder_for_column:
        encoder.fit(data[column_name])

In [17]:
def dataEncoder(data):
    """Return a copy of ``data`` with its categorical columns label-encoded.

    ``GL_Code``, ``Vendor_Code``, ``Company``, ``Month`` and ``Description``
    are replaced by the output of the corresponding module-level encoder's
    ``transform``. The encoders must already be fitted (see
    ``dataEncoderfit``). All other columns, and the column order, are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five categorical columns. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns.
    """
    # Encode a copy: overwriting the caller's columns would make a second call
    # on the same frame fail, because the raw labels would already be gone.
    encoded = data.copy()
    encoder_for_column = (
        ('GL_Code', GL_Code_Encoder),
        ('Vendor_Code', Vendor_Code_Encoder),
        ('Company', Company_Encoder),
        ('Month', Month_Encoder),
        ('Description', Description_Encoder),
    )
    for column_name, encoder in encoder_for_column:
        encoded[column_name] = encoder.transform(encoded[column_name])
    return encoded

In [18]:
# Fit the encoders on train and test together so that labels appearing only in
# the test set still receive a code. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0; the defaults (original index kept, no sorting)
# give the same stacked frame.
total_data_for_encoding = pd.concat([train, test])
dataEncoderfit(total_data_for_encoding)

In [19]:
# Replace the categorical training columns with their integer codes.
# NOTE: `train` is rebound to the encoded frame, so this cell should run once per load.
train = dataEncoder(train)

In [20]:
# Sanity check: encoding must not change the number of rows
len(train)

5719

In [21]:
# Preview the encoded training features (categorical columns are now integers)
train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,1128,2,6.973473,63,34,0
1,1128,2,25.053841,63,34,9
2,949,2,53.573737,1367,34,9
3,1208,2,67.388827,659,34,1
4,1208,2,74.262047,659,34,7


In [22]:
# Give the target series a meaningful name (it was taken from a split result)
# and display it as a one-column frame.
target.name = "Product_Category"
target.to_frame()

Unnamed: 0,Product_Category
0,784
1,784
2,784
3,784
4,784
5,784
6,784
7,784
8,784
9,784


In [23]:
# Encode the class labels as consecutive integers; the fitted encoder is used
# later to map predictions back to the original category codes.
target = Product_Category_Encoder.fit_transform(target)

In [24]:
import numpy as np
# Class distribution of the encoded target: the distinct labels and the number
# of rows for each. The counts are very uneven (from 2 up to 1521 rows).
np.unique(target,return_counts=True,axis=0)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37]),
 array([  34,   29,   26,  370,   34,   50,  107,  773,   79,   15,    4,
           7,  196,  464,  117, 1521,   22,   13,  115,   68,   13,   42,
          73,   42,  985,    3,  219,   27,    2,   19,   38,    2,   13,
          53,  107,    2,   27,    8], dtype=int64))

In [25]:
# Re-check the feature frame before splitting (same preview as right after encoding)
train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,1128,2,6.973473,63,34,0
1,1128,2,25.053841,63,34,9
2,949,2,53.573737,1367,34,9
3,1208,2,67.388827,659,34,1
4,1208,2,74.262047,659,34,7


In [26]:
# Distinct values per column after encoding; the counts match the pre-encoding ones
train.nunique()

Vendor_Code    1313
GL_Code           9
Inv_Amt        5719
Company        1341
Description      38
Month            12
dtype: int64

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.1,
    random_state=1,
)

In [28]:
# Preview the shuffled training split (the index shows the original row positions)
X_train.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
2921,1406,3,13.877057,821,37,3
457,707,4,45.113914,234,16,11
1064,1506,7,4.991837,907,31,5
705,743,0,64.035118,264,8,1
3282,456,3,61.749586,1313,37,10


# Model Building

In [29]:
import lightgbm as lgb

In [30]:
# Wrap both splits as LightGBM datasets. The validation set is tied to the
# training set through `reference`; raw data is kept on both.
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    X_test,
    label=y_test,
    reference=lgb_train,
    free_raw_data=False,
)

In [31]:
# Hyperparameters

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Derived from the fitted target encoder instead of a hard-coded 38, so it
    # always matches the number of distinct labels in the training data.
    'num_class': len(Product_Category_Encoder.classes_),
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): unusually high verbosity value — confirm this is intended.
    'verbose': 20
}

In [32]:
# Train for up to 1000 rounds, stopping once the validation loss has not
# improved for 50 rounds. Early stopping and per-round logging are passed as
# callbacks: the `early_stopping_rounds` and `verbose_eval` keyword arguments
# were removed from lgb.train in LightGBM 4.0.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=1),
    ],
)

[1]	valid_0's multi_logloss: 2.00785
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_logloss: 1.8206
[3]	valid_0's multi_logloss: 1.66209
[4]	valid_0's multi_logloss: 1.53919
[5]	valid_0's multi_logloss: 1.43143
[6]	valid_0's multi_logloss: 1.3301
[7]	valid_0's multi_logloss: 1.2474
[8]	valid_0's multi_logloss: 1.17046
[9]	valid_0's multi_logloss: 1.10118
[10]	valid_0's multi_logloss: 1.0388
[11]	valid_0's multi_logloss: 0.979529
[12]	valid_0's multi_logloss: 0.925241
[13]	valid_0's multi_logloss: 0.875461
[14]	valid_0's multi_logloss: 0.828313
[15]	valid_0's multi_logloss: 0.784584
[16]	valid_0's multi_logloss: 0.743762
[17]	valid_0's multi_logloss: 0.705334
[18]	valid_0's multi_logloss: 0.670132
[19]	valid_0's multi_logloss: 0.637136
[20]	valid_0's multi_logloss: 0.605337
[21]	valid_0's multi_logloss: 0.575448
[22]	valid_0's multi_logloss: 0.547233
[23]	valid_0's multi_logloss: 0.5212
[24]	valid_0's multi_logloss: 0.49608
[25]	valid_0's multi_logloss

[203]	valid_0's multi_logloss: 0.00176224
[204]	valid_0's multi_logloss: 0.00176053
[205]	valid_0's multi_logloss: 0.00176848
[206]	valid_0's multi_logloss: 0.00180599
[207]	valid_0's multi_logloss: 0.00178197
[208]	valid_0's multi_logloss: 0.00177943
[209]	valid_0's multi_logloss: 0.00177411
[210]	valid_0's multi_logloss: 0.00181646
[211]	valid_0's multi_logloss: 0.00175909
[212]	valid_0's multi_logloss: 0.00170256
[213]	valid_0's multi_logloss: 0.00165093
[214]	valid_0's multi_logloss: 0.00163345
[215]	valid_0's multi_logloss: 0.00158262
[216]	valid_0's multi_logloss: 0.00160346
[217]	valid_0's multi_logloss: 0.00159971
[218]	valid_0's multi_logloss: 0.00160072
[219]	valid_0's multi_logloss: 0.00160536
[220]	valid_0's multi_logloss: 0.00159361
[221]	valid_0's multi_logloss: 0.001595
[222]	valid_0's multi_logloss: 0.00160041
[223]	valid_0's multi_logloss: 0.00160811
[224]	valid_0's multi_logloss: 0.0016097
[225]	valid_0's multi_logloss: 0.00158497
[226]	valid_0's multi_logloss: 0.0016

In [33]:
# Score the held-out validation split; each row of the result is reduced to a
# single class with argmax in the next cell.
prediction = gbm.predict(X_test)

In [34]:
# Pick the highest-scoring class per row in one vectorised call instead of a
# Python loop over the rows; y_pred is now a NumPy array of class indices.
y_pred = np.argmax(prediction, axis=1)

In [35]:
# Element-wise check of predicted vs. true labels on the validation split
y_pred == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [37]:
# Share of validation rows whose predicted class equals the true class.
# NOTE(review): a perfect score is suspicious — Description has as many
# distinct values (38) as the target, so it may determine the label; confirm.
print("Accuracy : {}".format(accuracy_score(y_test, y_pred)))

Accuracy : 1.0


In [38]:
# confusion_matrix(y_test, y_pred)

In [39]:
# gbm.save_model('model.txt')

# Test Data

In [40]:
# Preview the test features; they are still unencoded at this point
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,1197,6050100,10.916343,DESOTO INC,Store Management Real Estate Real Estate Servi...,Jul
1,792,6050100,38.658772,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Nov
2,792,6050100,46.780476,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Jan
3,792,6050100,7.058866,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Sep
4,792,6050100,32.931765,CENTURY REALTY TRUST,Store Management Real Estate Real Estate Servi...,Nov


In [41]:
# Count missing values per column in the processed test data
test.isna().sum()

Vendor_Code    0
GL_Code        0
Inv_Amt        0
Company        0
Description    0
Month          0
dtype: int64

In [42]:
# Encode the test features with the encoders that were fitted on train + test.
# NOTE: `test` is rebound to the encoded frame, so this cell should run once per load.
test = dataEncoder(test)

In [43]:
# Sanity check: encoding must not change the number of test rows
len(test)

2292

In [44]:
# Preview the encoded test features
test.head()

Unnamed: 0,Vendor_Code,GL_Code,Inv_Amt,Company,Description,Month
0,220,2,10.916343,1131,34,5
1,1330,2,38.658772,766,34,9
2,1330,2,46.780476,766,34,4
3,1330,2,7.058866,766,34,11
4,1330,2,32.931765,766,34,9


In [45]:
# Score the real test set with the trained model
actual_test_prediction = gbm.predict(test)

In [46]:
# Predicted per-class probabilities, one row per test invoice
actual_test_prediction

array([[1.25698374e-04, 8.65792294e-05, 9.11748177e-05, ...,
        4.00006114e-06, 8.71152563e-05, 2.77420237e-05],
       [1.58170600e-05, 1.20311291e-05, 1.23458922e-05, ...,
        6.97841323e-07, 1.36702547e-05, 3.80013578e-06],
       [1.10631031e-05, 9.22741837e-06, 9.46888818e-06, ...,
        1.08144211e-06, 1.00228256e-05, 2.87874771e-06],
       ...,
       [3.03789250e-07, 2.84860474e-07, 2.63277364e-07, ...,
        1.47043428e-08, 2.76520478e-07, 1.18958137e-07],
       [1.10958413e-05, 6.43723390e-06, 5.98294826e-06, ...,
        3.38250897e-07, 9.04758087e-06, 4.67266557e-06],
       [3.37891756e-06, 3.11488525e-06, 2.98135167e-06, ...,
        2.58260155e-07, 3.27902317e-06, 1.49990751e-06]])

In [47]:
# Pick the highest-scoring class per test row in one vectorised call instead of
# a Python loop; the result is a NumPy array of encoded class indices.
encoded_actual_test_prediction = np.argmax(actual_test_prediction, axis=1)

In [48]:
# Map the encoded class indices back to the original category codes
inverse_transformed_prediction = Product_Category_Encoder.inverse_transform(encoded_actual_test_prediction)

In [49]:
# Use the real invoice ids from the test file. Inv_Id was dropped from `test`
# during preprocessing, and numbering the rows 1..N is only correct if the
# file happens to use exactly those ids in that order. Row order is unchanged
# by the preprocessing, so the ids line up with the predictions.
test_inv_ids = pd.read_csv('test.csv', usecols=['Inv_Id'])['Inv_Id']
assert len(test_inv_ids) == len(inverse_transformed_prediction), "id/prediction length mismatch"

data = {'Inv_Id': test_inv_ids.to_numpy(), 'Product_Category': inverse_transformed_prediction}

output = pd.DataFrame(data)
# Restore the "CLASS-" prefix that was stripped from the labels before training
output['Product_Category'] = 'CLASS-'+output['Product_Category'].astype(str)

In [50]:
# Write the predictions to a CSV file in the working directory, without the frame index
output.to_csv(r'Prediction_Output.csv', index=False)