# Amazon ML Hackathon - Team ZEAL 

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder as lb
from sklearn.svm import LinearSVC
import pandas as pd
from sklearn.metrics import accuracy_score

We use the `train.csv` file; its features (columns) and its size (rows and columns) are shown below.

In [2]:
# Load the raw training data from a path relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [3]:
# The previous bare `df.dropna()` call returned a new frame that was thrown
# away, so it never changed `df` and only cost a full pass over the data.
# Missing values are removed after prep_data() is applied, so the raw frame
# is shown here as loaded.
print(df.head())
print(df.shape)

   PRODUCT_ID                                              TITLE  \
0     1925202  ArtzFolio Tulip Flowers Blackout Curtain for D...   
1     2673191  Marks & Spencer Girls' Pyjama Sets T86_2561C_N...   
2     2765088  PRIKNIK Horn Red Electric Air Horn Compressor ...   
3     1594019  ALISHAH Women's Cotton Ankle Length Leggings C...   
4      283658  The United Empire Loyalists: A Chronicle of th...   

                                       BULLET_POINTS  \
0  [LUXURIOUS & APPEALING: Beautiful custom-made ...   
1  [Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...   
2  [Loud Dual Tone Trumpet Horn, Compatible With ...   
3  [Made By 95%cotton and 5% Lycra which gives yo...   
4                                                NaN   

                                         DESCRIPTION  PRODUCT_TYPE_ID  \
0                                                NaN             1650   
1                                                NaN             2755   
2  Specifications: Color: Red, Mate

From the output above, we can observe that train.csv contains roughly 22 lakh rows (about 2.25 million) and 6 columns.

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training frame.
df.describe()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH
count,2249698.0,2249698.0,2249698.0
mean,1499795.0,4000.456,4071.839
std,866194.4,3966.146,1351685.0
min,1.0,0.0,1.0
25%,749479.5,230.0,511.811
50%,1499558.0,2916.0,663.0
75%,2250664.0,6403.0,1062.992
max,2999999.0,13420.0,1885801000.0


In [5]:
# Pairwise Pearson correlation of the numeric columns. `numeric_only=True`
# is required because the frame also holds text columns (TITLE,
# BULLET_POINTS, DESCRIPTION); recent pandas raises instead of silently
# skipping them.
df.corr(numeric_only=True)

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,1.0,0.250576,0.000424
PRODUCT_TYPE_ID,0.250576,1.0,0.000961
PRODUCT_LENGTH,0.000424,0.000961,1.0


### The Pearson correlation between `PRODUCT_LENGTH` and the other numeric columns is very low (below 0.001).

In [6]:
from sklearn.preprocessing import LabelEncoder as le
def prep_data(td):
    """Build the modelling frame from the raw training data.

    Steps, in order:
      1. drop fully duplicated rows;
      2. replace PRODUCT_ID with ``row label + 1`` and use it as the index;
      3. join TITLE, DESCRIPTION and BULLET_POINTS (each cast to ``str``,
         separated by single spaces) into a new ``DATA`` column;
      4. cast PRODUCT_LENGTH to ``int`` (any fractional part is discarded);
      5. store that integer length as a pandas ``category`` column named
         ``CATEGORY``, which is used as the classification target.

    Parameters
    ----------
    td : pandas.DataFrame
        Frame with at least the columns TITLE, DESCRIPTION, BULLET_POINTS
        and PRODUCT_LENGTH. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by PRODUCT_ID with the added DATA and CATEGORY
        columns.
    """
    # Work on an explicit copy: assigning columns directly onto the result of
    # drop_duplicates() can trigger pandas' SettingWithCopyWarning.
    new_td = td.drop_duplicates().copy()
    # NOTE(review): this overwrites the PRODUCT_ID read from the file with a
    # value derived from the row label — confirm that is intended.
    new_td['PRODUCT_ID'] = new_td.index + 1
    new_td = new_td.set_index('PRODUCT_ID')
    new_td['DATA'] = (
        new_td['TITLE'].astype(str)
        + " " + new_td['DESCRIPTION'].astype(str)
        + " " + new_td['BULLET_POINTS'].astype(str)
    )
    new_td['PRODUCT_LENGTH'] = new_td['PRODUCT_LENGTH'].astype(int)
    new_td['CATEGORY'] = new_td['PRODUCT_LENGTH'].astype('category')
    return new_td

The function `prep_data` removes all duplicate rows from the data and combines the text in the Title, Description and Bullet Points columns into a new column called "DATA". It also converts the continuous `PRODUCT_LENGTH` values to integers and stores them as categorical data in a new "CATEGORY" column, using the pandas `category` dtype. Note that `LabelEncoder` is imported but is not actually used for this step.


## Data Preprocessing

In [7]:
# Build the modelling frame, then discard every row that still has a missing value.
new_df = prep_data(df).dropna()
print(new_df.head(), new_df.shape, sep="\n")

                                                        TITLE  \
PRODUCT_ID                                                      
3           PRIKNIK Horn Red Electric Air Horn Compressor ...   
4           ALISHAH Women's Cotton Ankle Length Leggings C...   
6           HINS Metal Bucket Shape Plant Pot for Indoor &...   
8           Delavala Self Adhesive Kitchen Backsplash Wall...   
10          Hexwell Essential oil for Home Fragrance Oil A...   

                                                BULLET_POINTS  \
PRODUCT_ID                                                      
3           [Loud Dual Tone Trumpet Horn, Compatible With ...   
4           [Made By 95%cotton and 5% Lycra which gives yo...   
6           [Simple and elegant, great for displaying indo...   
8           [HIGH QUALITY PVC MATERIAL: The kitchen alumin...   
10          [100% Pure And Natural Essential Oil Or Fragra...   

                                                  DESCRIPTION  \
PRODUCT_ID             

We can observe that after removing duplicates and rows with missing values, we possess only about 10 lakh rows of data (1,038,460).

In [8]:
# Model input is the combined text; the target is the categorical product length.
X, Y = new_df['DATA'], new_df['CATEGORY']
print(X.head(), X.shape, Y.head(), Y.shape)

PRODUCT_ID
3     PRIKNIK Horn Red Electric Air Horn Compressor ...
4     ALISHAH Women's Cotton Ankle Length Leggings C...
6     HINS Metal Bucket Shape Plant Pot for Indoor &...
8     Delavala Self Adhesive Kitchen Backsplash Wall...
10    Hexwell Essential oil for Home Fragrance Oil A...
Name: DATA, dtype: object (1038460,) PRODUCT_ID
3     748
4     787
6     950
8     984
10    393
Name: CATEGORY, dtype: category
Categories (7303, int64): [1, 2, 3, 4, ..., 224928000, 393701000, 480315220, 1885801400] (1038460,)


In [9]:
# Sample 5% of the rows for fitting and 1% for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.05,
    test_size=0.01,
    random_state=42,
)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
print(Y_train.head())

(51923,) (51923,) (10385,) (10385,)
PRODUCT_ID
213558      900
1148999    1000
1940595     490
1339265    1102
501029      826
Name: CATEGORY, dtype: category
Categories (7303, int64): [1, 2, 3, 4, ..., 224928000, 393701000, 480315220, 1885801400]


We use a training sample of about 50 thousand rows (5% of the data, 51,923 rows) and a test sample of about 10 thousand rows (1% of the data, 10,385 rows).

In [10]:
# Show the first few training texts to sanity-check the split.
print(X_train.head())

PRODUCT_ID
213558     Arlen Ness 17-110 Rear Adjustable Lowering Kit...
1148999    College Dorm Dust Ruffled Bed Skirt-Extra Long...
1940595    Sweejar Ceramic Baking Dish, Rectangular Lasag...
1339265    FAZZN Women's Casual Round Neck Full Sleeves K...
501029     Mens Comfortable Trending And Stylish Slipper ...
Name: DATA, dtype: object


## Feature Extraction 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# `lowercase` must be the boolean True: the string 'True' only worked because
# it is truthy, and scikit-learn releases that validate constructor
# parameters reject it.
ft_ext = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training text only, then
# apply the same mapping to the held-out text.
xtrf = ft_ext.fit_transform(X_train)
xtef = ft_ext.transform(X_test)
print(xtrf.shape, xtef.shape)

(51923, 123386) (10385, 123386)


In [12]:
# Print the sparse TF-IDF matrix as (row, feature index) -> weight entries.
print(xtrf)

  (0, 77644)	0.09609722097882067
  (0, 93523)	0.08159271785968844
  (0, 23856)	0.10013432041015582
  (0, 24266)	0.1044174370584644
  (0, 106468)	0.1147225623130106
  (0, 49880)	0.09791845013929815
  (0, 74134)	0.07340383580914842
  (0, 87573)	0.09747076186351943
  (0, 51285)	0.08618476045377414
  (0, 89318)	0.09009707926996774
  (0, 71653)	0.06843421097478097
  (0, 100988)	0.08601112069235213
  (0, 28607)	0.09507550368265455
  (0, 55523)	0.06309925368398014
  (0, 29010)	0.10050019400701264
  (0, 36373)	0.09336265822483598
  (0, 88251)	0.09742167556480613
  (0, 58495)	0.18308377523437724
  (0, 79719)	0.09584775755205716
  (0, 29391)	0.11393387190129345
  (0, 62478)	0.12184608756759065
  (0, 22449)	0.09113503032713353
  (0, 28651)	0.14976123400874333
  (0, 106871)	0.15595645427618549
  (0, 14402)	0.15284043072804626
  :	:
  (51922, 47824)	0.02569239383536994
  (51922, 50003)	0.028150991978616532
  (51922, 103129)	0.026169422981200834
  (51922, 50641)	0.018646880204936364
  (51922, 59688)

## Linear Support Vector Machine

In [14]:
# Fit a linear SVM that treats every distinct integer product length as a class.
model = LinearSVC()
model.fit(xtrf, Y_train)
# Accuracy on the rows the model was fitted on; this is an optimistic figure.
print(accuracy_score(Y_train, model.predict(xtrf)))
# Accuracy on the held-out split, which the model has not seen during fitting.
print(accuracy_score(Y_test, model.predict(xtef)))

0.9721318105656452


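We obtained an accuracy of 97.21% on the training data, i.e. on the same rows the model was fitted on, so this figure does not measure performance on unseen data.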

In [15]:
# Load the test set and keep only rows without any missing value.
# NOTE(review): rows dropped here never reach final_df, so they get no
# prediction in the output file — confirm this is acceptable.
test_df = pd.read_csv("test.csv").dropna()

In [16]:
# Preview the cleaned test frame and report its dimensions.
print(test_df.head(), test_df.shape, sep="\n")


    PRODUCT_ID                                              TITLE  \
1      1729783  DCGARING Microfiber Throw Blanket Warm Fuzzy P...   
2      1871949  I-Match Auto Parts Front License Plate Bracket...   
3      1107571  PinMart Gold Plated Excellence in Service 1 Ye...   
5      2782548  Evershine Shoppee 10m Waxed Nylon Thread Cotto...   
11     2736605  SHASAK Sanganer Hand Block Printed Short Kurta...   

                                        BULLET_POINTS  \
1   [QUALITY GUARANTEED: Luxury cozy plush polyest...   
2   [Front License Plate Bracket Made Of Plastic,D...   
3   [Available as a single item or bulk packed. Se...   
5   [Kindly Refer The Product Description Before B...   
11  [Confused between wearing a T-shirt and a shir...   

                                          DESCRIPTION  PRODUCT_TYPE_ID  
1   <b>DCGARING Throw Blanket</b><br><br> <b>Size ...             1622  
2   Replacement for The Following Vehicles:2020 LE...             7540  
3   Our Excellence in S

In [17]:
# List the test columns; the output shows there is no PRODUCT_LENGTH column here.
test_df.columns

Index(['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION',
       'PRODUCT_TYPE_ID'],
      dtype='object')

In [18]:
def prep_test(td):
    """Build the prediction frame from the raw test data.

    Drops fully duplicated rows, replaces PRODUCT_ID with ``row label + 1``
    and uses it as the index, then joins TITLE, DESCRIPTION and
    BULLET_POINTS (each cast to ``str``, separated by single spaces) into a
    new ``DATA`` column.

    Parameters
    ----------
    td : pandas.DataFrame
        Frame with at least the columns TITLE, DESCRIPTION and
        BULLET_POINTS. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by PRODUCT_ID with the added DATA column.
    """
    # Work on an explicit copy: assigning columns directly onto the result of
    # drop_duplicates() can trigger pandas' SettingWithCopyWarning.
    new_td = td.drop_duplicates().copy()
    # NOTE(review): this overwrites the PRODUCT_ID read from the file with a
    # value derived from the row label — confirm that is intended.
    new_td['PRODUCT_ID'] = new_td.index + 1
    new_td = new_td.set_index('PRODUCT_ID')
    new_td['DATA'] = (
        new_td['TITLE'].astype(str)
        + " " + new_td['DESCRIPTION'].astype(str)
        + " " + new_td['BULLET_POINTS'].astype(str)
    )
    return new_td

In [19]:
# Apply the test-set preparation to the cleaned test frame.
new_td = prep_test(test_df)

In [20]:
from sklearn.metrics import mean_squared_error as mse
# Mean squared error of the predictions on the training rows themselves.
print(mse(y_true=Y_train, y_pred=model.predict(xtrf)))

896013.5868690176


In [21]:
from sklearn.metrics import mean_absolute_percentage_error as mspe
# Despite the alias `mspe`, this is the mean absolute percentage error,
# again measured on the training rows.
print(mspe(y_true=Y_train, y_pred=model.predict(xtrf)))

0.031001362018000092


From the above, we can see that the mean squared error is very high. This suggests that the data contains outliers: some products have very large lengths (e.g. curtains, bedsheets), while others have very small lengths (e.g. wrist watches, necklaces).

In [22]:
# NOTE: this rebinds `new_df`, which previously held the prepared training
# frame, to the prepared test frame; cells that need the training frame must
# not be re-run after this point without re-creating it.
new_df = new_td.dropna()
print(new_df.head(), new_df.shape, sep="\n")

                                                        TITLE  \
PRODUCT_ID                                                      
2           DCGARING Microfiber Throw Blanket Warm Fuzzy P...   
3           I-Match Auto Parts Front License Plate Bracket...   
4           PinMart Gold Plated Excellence in Service 1 Ye...   
6           Evershine Shoppee 10m Waxed Nylon Thread Cotto...   
12          SHASAK Sanganer Hand Block Printed Short Kurta...   

                                                BULLET_POINTS  \
PRODUCT_ID                                                      
2           [QUALITY GUARANTEED: Luxury cozy plush polyest...   
3           [Front License Plate Bracket Made Of Plastic,D...   
4           [Available as a single item or bulk packed. Se...   
6           [Kindly Refer The Product Description Before B...   
12          [Confused between wearing a T-shirt and a shir...   

                                                  DESCRIPTION  \
PRODUCT_ID             

In [28]:
X_new_test = new_df['DATA']
# Vectorise and predict the first 50,000 test rows. The prediction is
# computed once and reused, instead of running model.predict() a second time
# just to print its type.
Xtnef = ft_ext.transform(X_new_test[0:50000])
first_batch = model.predict(Xtnef)
print(type(first_batch))
# `p` accumulates the predictions for all test rows, in row order.
p = list(first_batch)

<class 'numpy.ndarray'>


In [30]:
# Empty frame that later receives the PRODUCT_ID and PRODUCT_LENGTH columns.
final_df = pd.DataFrame()

In [31]:
# Predict every test row after the first 50,000 in consecutive chunks of
# 50,000 and append the results to `p` in row order. One loop replaces six
# copy-pasted cells with hard-coded offsets, so it adapts to the actual
# length of X_new_test and never calls predict() on an empty slice.
# Run this exactly once after `p` has been initialised with the first batch;
# re-running it appends the same predictions again.
BATCH_SIZE = 50000
for start in range(BATCH_SIZE, len(X_new_test), BATCH_SIZE):
    Xtnef = ft_ext.transform(X_new_test[start:start + BATCH_SIZE])
    p.extend(model.predict(Xtnef))

In [37]:
# The predictions in `p` follow the row order of X_new_test, whose index is
# the original test_df row label + 1 (set in prep_test). Map back to those
# labels so each PRODUCT_ID comes from exactly the row that was predicted,
# even if prep_test() dropped duplicate rows.
predicted_ids = test_df.loc[X_new_test.index - 1, 'PRODUCT_ID']
if len(predicted_ids) != len(p):
    raise ValueError(
        "Expected one prediction per test row, got "
        f"{len(p)} predictions for {len(predicted_ids)} rows; "
        "re-run the prediction cells from a clean `p`."
    )
final_df['PRODUCT_ID'] = predicted_ids
final_df['PRODUCT_LENGTH'] = p

In [38]:

# Write only the PRODUCT_ID and PRODUCT_LENGTH columns; without index=False
# pandas would also emit the leftover row labels as an unnamed first column.
final_df.to_csv(r"dataset", index=False)