### Here is the predictive model to distinguish between the main product categories in the dataset provided by the Otto Group company.
### Provided dataset has 93 features and more than 2000,000 products.
### For reducing high dimensionality, Linear Discriminant Analysis, a method of Dimensionality Reduction is used.
### This model is built on the XG Boost Classifier algorithm, a popular and efficient algorithm which attempts to predict the target variable accurately.



#### Importing Libraries...

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Loading data...

In [3]:
data = pd.read_csv('/kaggle/input/otto-group-product-classification-challenge/train.csv')
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
data.columns

Index(['id', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6',
       'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_12',
       'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18',
       'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_24',
       'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30',
       'feat_31', 'feat_32', 'feat_33', 'feat_34', 'feat_35', 'feat_36',
       'feat_37', 'feat_38', 'feat_39', 'feat_40', 'feat_41', 'feat_42',
       'feat_43', 'feat_44', 'feat_45', 'feat_46', 'feat_47', 'feat_48',
       'feat_49', 'feat_50', 'feat_51', 'feat_52', 'feat_53', 'feat_54',
       'feat_55', 'feat_56', 'feat_57', 'feat_58', 'feat_59', 'feat_60',
       'feat_61', 'feat_62', 'feat_63', 'feat_64', 'feat_65', 'feat_66',
       'feat_67', 'feat_68', 'feat_69', 'feat_70', 'feat_71', 'feat_72',
       'feat_73', 'feat_74', 'feat_75', 'feat_76', 'feat_77', 'feat_78',
       'feat_79', 'feat_80', 'feat_81', 'feat_82', 'fe

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61878 entries, 0 to 61877
Data columns (total 95 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       61878 non-null  int64 
 1   feat_1   61878 non-null  int64 
 2   feat_2   61878 non-null  int64 
 3   feat_3   61878 non-null  int64 
 4   feat_4   61878 non-null  int64 
 5   feat_5   61878 non-null  int64 
 6   feat_6   61878 non-null  int64 
 7   feat_7   61878 non-null  int64 
 8   feat_8   61878 non-null  int64 
 9   feat_9   61878 non-null  int64 
 10  feat_10  61878 non-null  int64 
 11  feat_11  61878 non-null  int64 
 12  feat_12  61878 non-null  int64 
 13  feat_13  61878 non-null  int64 
 14  feat_14  61878 non-null  int64 
 15  feat_15  61878 non-null  int64 
 16  feat_16  61878 non-null  int64 
 17  feat_17  61878 non-null  int64 
 18  feat_18  61878 non-null  int64 
 19  feat_19  61878 non-null  int64 
 20  feat_20  61878 non-null  int64 
 21  feat_21  61878 non-null  int64 
 22

In [6]:
nulldata = data.isnull().sum()
print(nulldata[nulldata>0])

Series([], dtype: int64)


In [7]:
data.target.value_counts()

Class_2    16122
Class_6    14135
Class_8     8464
Class_3     8004
Class_9     4955
Class_7     2839
Class_5     2739
Class_4     2691
Class_1     1929
Name: target, dtype: int64

#### Separating dependent and independent variables

In [8]:
x = data.drop('target',axis=1)
y = data[['target']]

#### Splitting the training data and validation data

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_valid,y_train,y_valid = train_test_split(x,y,test_size=0.2)

#### Standardization 

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.transform(x_valid)

In [11]:
x_train.shape

(49502, 94)

In [12]:
data.shape

(61878, 95)

#### Dimensionality Reduction

In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
x_train = lda.fit_transform(x_train,y_train)
x_valid = lda.transform(x_valid)

  y = column_or_1d(y, warn=True)


In [14]:
x_train.shape

(49502, 8)

#### Sorting Index

In [15]:
a = pd.DataFrame(y_train.value_counts())
b = a.sort_index(axis=0)
b.index

MultiIndex([('Class_1',),
            ('Class_2',),
            ('Class_3',),
            ('Class_4',),
            ('Class_5',),
            ('Class_6',),
            ('Class_7',),
            ('Class_8',),
            ('Class_9',)],
           names=['target'])

#### One Hot Encoding of the dependent variable

In [16]:
from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder(handle_unknown='ignore',sparse=False)
y_train = pd.DataFrame(OHE.fit_transform(y_train),columns = b.index)
y_valid = pd.DataFrame(OHE.transform(y_valid),columns = b.index)

In [17]:
y_valid.head()

target,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### XG Boost Classifier

In [18]:
from xgboost import XGBClassifier

In [19]:
my_model = XGBClassifier()
my_model.fit(x_train,y_train)
y_pred = my_model.predict(x_valid)

#### Verifying Metrics

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.metrics import f1_score,precision_score,recall_score

In [21]:
print('Accuracy : ',accuracy_score(y_valid,y_pred))
print('\nclassification report : \n',classification_report(y_valid,y_pred))
print('\nf1 score : \n',f1_score(y_valid,y_pred,average=None))
print('\nprecision_score : \n',precision_score(y_valid,y_pred,average=None))
print('\nrecall_score : \n',recall_score(y_valid,y_pred,average=None))


Accuracy :  0.989414996767938

classification report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       388
           1       1.00      0.99      0.99      3152
           2       0.98      0.99      0.98      1647
           3       0.97      0.97      0.97       541
           4       1.00      0.99      1.00       539
           5       1.00      0.99      1.00      2775
           6       0.98      0.98      0.98       603
           7       0.99      1.00      0.99      1685
           8       1.00      0.99      1.00      1046

   micro avg       0.99      0.99      0.99     12376
   macro avg       0.99      0.99      0.99     12376
weighted avg       0.99      0.99      0.99     12376
 samples avg       0.99      0.99      0.99     12376


f1 score : 
 [0.99485861 0.99491902 0.98427102 0.96952909 0.99628253 0.99602888
 0.97840532 0.99318922 0.99520614]

precision_score : 
 [0.99230769 0.99586777 0.98071127 0.96863469 0.

  _warn_prf(average, modifier, msg_start, len(result))


#### Implementing the model to the test data 

In [22]:
x_test = pd.read_csv('/kaggle/input/otto-group-product-classification-challenge/test.csv')
x_test = sc.transform(x_test)
x_test = lda.transform(x_test)

In [23]:
x_test = pd.DataFrame(x_test)
l = list(x_test.index)
indexlist = pd.DataFrame(l,columns = ['id'])

In [24]:
ind = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']

#### Predicting the test data

In [25]:
predictions = pd.DataFrame(my_model.predict(x_test),columns = ind)

In [26]:
predictions.head()

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
sample = pd.read_csv('/kaggle/input/otto-group-product-classification-challenge/sampleSubmission.csv')
sample.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,1,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0,0
4,5,1,0,0,0,0,0,0,0,0


In [28]:
x_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-9.580837,-0.467278,-0.992099,0.133397,-1.22161,0.324263,1.67123,2.902215
1,-8.847654,1.354975,-0.165058,1.131874,2.524327,-0.472635,-1.096662,-0.771124
2,-8.556826,4.511615,1.041851,0.749542,1.865802,-0.234163,-0.401307,-1.031381
3,-9.510612,-0.807562,-1.101659,-0.975898,0.459598,0.723522,-1.31778,1.453158
4,-8.72017,-0.265293,-0.023465,-0.0395,3.749187,-1.060294,0.109233,0.482871


In [29]:

output = pd.concat((indexlist,predictions),axis=1)
output.to_csv('submission.csv',index=False,header=1)