## Amazon data 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz

--2022-05-24 07:36:55--  http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7010521 (6.7M) [application/octet-stream]
Saving to: ‘meta_Computers.json.gz’


2022-05-24 07:36:55 (14.2 MB/s) - ‘meta_Computers.json.gz’ saved [7010521/7010521]



In [3]:
### load the meta data

data = []
with gzip.open('meta_Computers.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print(len(data))

# first row of the list
print(data[0])

18772
{'description': ['Brand new. Still in factory-sealed plastic.'], 'title': 'Kelby Training DVD: The Adobe Photoshop CS5 7-Point System for Camera Raw By Scott Kelby', 'image': ['https://images-na.ssl-images-amazon.com/images/I/31IlLImCVJL._SS40_.jpg'], 'brand': 'Kelby Training', 'rank': ['>#654,681 in Electronics (See Top 100 in Electronics)'], 'main_cat': 'Computers', 'date': 'December 2, 2011', 'asin': '0321732960'}


In [4]:
# convert list into pandas dataframe
df = pd.DataFrame.from_dict(data)
def list_to_pd_dataframe(df):
#     df = pd.DataFrame.from_dict(data)
    df3 = df.fillna('')
    df5 = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows
    return df5

df5=list_to_pd_dataframe(df)

In [5]:
def cleandata(df5):
    df5.description=df5.description.map(lambda x: str(x))
    df5.description=df5.description.str.strip("[]'")
    df5.drop(columns=['feature', 'tech1', 'also_buy', 'price', 'also_view', 'tech2','details', 'similar_item',"main_cat"],inplace=True)
    df5['brand_cat']=df5.brand.map(lambda x : 0 if len(x) <1 else 1)
    df5['rank1']=df5['rank'].map(lambda x: str(x).strip("[]'"))
    df5["rank1"]=df5["rank1"].map(lambda x: str(x))
    df5["rank1"]=df5["rank1"].str.replace(pat='>#',repl='', regex=False)
    df5["rank1"]=df5["rank1"].map(lambda x: x.replace(",",""))
    df5["rank1"]=df5["rank1"].map(lambda x: x.split("in"))
    df5["rank1"] = df5["rank1"].map(lambda x: x[0])
    df5["title_count"] = df5["title"].map(lambda x: len(x))
    df5["desc_count"] = df5["description"].map(lambda x: len(x))
    df5["img_count"] = df5["image"].map(lambda x: len(x))
    df6=df5[df5["rank1"]!='']
    df6["rank1"]=df6["rank1"].astype("int64")
    return df6

df6=cleandata(df5)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6["rank1"]=df6["rank1"].astype("int64")


In [6]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank1"]
         
         
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
X_train.shape,X_test.shape

model = LinearRegression().fit(X_train, y_train)

## Clean Data

In [7]:
df6.describe()

Unnamed: 0,brand_cat,rank1,title_count,desc_count,img_count
count,17952.0,17952.0,17952.0,17952.0,17952.0
mean,0.974265,592975.2,112.436776,129.966633,2.968861
std,0.158349,491224.6,71.24236,350.335807,2.374704
min,0.0,54.0,0.0,0.0,0.0
25%,1.0,201725.0,69.0,0.0,1.0
50%,1.0,473359.5,97.0,0.0,3.0
75%,1.0,848467.5,144.0,9.0,5.0
max,1.0,9464714.0,2000.0,5146.0,46.0


In [8]:
df6["rank_cat"]=df6["rank1"].map(lambda x: 1 if x<473359 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6["rank_cat"]=df6["rank1"].map(lambda x: 1 if x<473359 else 0)


## Using SVM Model

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
print(df6.rank1.median())
df6["rank_cat"]=df6["rank1"].map(lambda x: 1 if x<473359 else 0)

473359.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank_cat"]

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
svc=SVC(kernel="linear",C=10,gamma='auto')
model=svc.fit(X_train,y_train)
model.score(X_test,y_test)

0.6154845896769402

Using GridSearchCV for SVM

In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank_cat"]

sc = StandardScaler()
X = sc.fit_transform(X)

clf = GridSearchCV(SVC(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}, cv=5, return_train_score=False)
clf.fit(X, y)
clf.cv_results_

{'mean_fit_time': array([11.45177612,  8.69223723, 14.66475024, 18.01703067, 19.01169171,
        46.24699411]),
 'mean_score_time': array([2.47901406, 0.89608102, 2.16860213, 0.8154285 , 2.11554718,
        0.80367575]),
 'mean_test_score': array([0.63324276, 0.59491854, 0.63402258, 0.5997077 , 0.63396694,
        0.60856575]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],
 'rank_test_score': array([3, 6, 1, 5, 2, 4], dtype=int32),
 'split0_test_score': array([0.63464216

In [None]:
df = pd.DataFrame(clf.cv_results_)
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.633243
1,1,linear,0.594919
2,10,rbf,0.634023
3,10,linear,0.599708
4,20,rbf,0.633967
5,20,linear,0.608566


In [None]:
clf.best_params_,clf.best_score_

({'C': 10, 'kernel': 'rbf'}, 0.6340225835402495)

## SVM Regressors

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank1"]

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

regressor = SVR(epsilon=0.1, C=1, kernel='linear',gamma='auto')

model=regressor.fit(X_train,y_train)
model.score(X_test,y_test)


-0.05607450542672132

GridSearchCV for SVM Regressor 

In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank1"]

sc = StandardScaler()
X = sc.fit_transform(X)

model = GridSearchCV(SVR(gamma='auto'), {
    'C': [1,10,20],
    'kernel': ['rbf','linear']
}, cv=5, return_train_score=False)
model.fit(X,y)
model.cv_results_

{'mean_fit_time': array([10.57959123,  7.41503916, 13.48051529,  6.72247534, 10.61148324,
         6.43336458]),
 'mean_score_time': array([2.60348535, 1.29092498, 2.64556274, 0.84663987, 2.61051817,
        0.84400716]),
 'mean_test_score': array([-0.22355136, -0.21834267, -0.21673908, -0.17909517, -0.20938667,
        -0.15840706]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],
 'rank_test_score': array([6, 5, 4, 2, 3, 1], dtype=int32),
 'split0_test_score': array([-0.8

In [None]:
df = pd.DataFrame(model.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,10.579591,0.125827,2.603485,0.041293,1,rbf,"{'C': 1, 'kernel': 'rbf'}",-0.81771,-0.174516,-0.001102,-0.010506,-0.113923,-0.223551,0.304079,6
1,7.415039,1.322443,1.290925,0.590136,1,linear,"{'C': 1, 'kernel': 'linear'}",-0.810969,-0.17121,0.002779,-0.006363,-0.10595,-0.218343,0.303279,5
2,13.480515,2.557852,2.645563,0.095696,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-0.80998,-0.168476,0.003838,-0.005797,-0.103281,-0.216739,0.30341,4
3,6.722475,0.483533,0.84664,0.008755,10,linear,"{'C': 10, 'kernel': 'linear'}",-0.761622,-0.135571,0.030453,0.025228,-0.053964,-0.179095,0.297506,2
4,10.611483,0.234025,2.610518,0.027111,20,rbf,"{'C': 20, 'kernel': 'rbf'}",-0.800227,-0.161249,0.008842,-0.000914,-0.093386,-0.209387,0.301977,3
5,6.433365,0.253086,0.844007,0.007571,20,linear,"{'C': 20, 'kernel': 'linear'}",-0.727679,-0.118442,0.044671,0.040662,-0.031247,-0.158407,0.29076,1


In [None]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,-0.223551
1,1,linear,-0.218343
2,10,rbf,-0.216739
3,10,linear,-0.179095
4,20,rbf,-0.209387
5,20,linear,-0.158407


In [None]:
model.best_params_

{'C': 20, 'kernel': 'linear'}

In [None]:
model.best_score_

-0.1584070647284664

## Conclusion


We should use classification for our model, not regression

## TFIDF vectorizer

In [None]:
df6.head(2)

Unnamed: 0,description,title,image,brand,rank,date,asin,brand_cat,rank1,title_count,desc_count,img_count,rank_cat
0,Brand new. Still in factory-sealed plastic.,Kelby Training DVD: The Adobe Photoshop CS5 7-...,[https://images-na.ssl-images-amazon.com/image...,Kelby Training,"[>#654,681 in Electronics (See Top 100 in Elec...","December 2, 2011",321732960,1,654681,88,43,1,0
1,"""If you're already a Photoshop user and want a...",Kelby Training DVD: Adobe Photoshop CS5 Power ...,[https://images-na.ssl-images-amazon.com/image...,Kelby Training,"[>#830,165 in Electronics (See Top 100 in Elec...","November 7, 2011",321735722,1,830165,72,257,1,0


In [None]:
# !pip install nltk

import nltk
from deep_seo.utils import punc,lower,lemmatize,nonumbers,stopword
from sklearn.feature_extraction.text import TfidfVectorizer

### TFIDF for brand, description & title column 

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [None]:
def full_text_processing(columns):
    a=""
    for i in columns:
        a=a+i 
    a=punc(a)
    a=lower(a)
    a=nonumbers(a)
    a=stopword(a)
    a=lemmatize(a)
    b=[a]
    vectorizer = TfidfVectorizer(max_features = 100)
    X = vectorizer.fit_transform(b) #brand
    return X

full_text_processing(df6.brand)


<1x100 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>

In [None]:
X=full_text_processing(df6.description)
X.toarray()
aa=pd.DataFrame(X.toarray(),columns = vectorizer.get_feature_names())
aa.columns



Index(['android', 'bag', 'battery', 'boxalucky', 'brand', 'cable', 'case',
       'casetop', 'city', 'co', 'coltdbestrunner', 'coltdyuyao',
       'communication', 'company', 'companylike', 'componentsatech',
       'computer', 'creative', 'decal', 'digital', 'directfactory', 'doctor',
       'dragon', 'drive', 'duang', 'earthtechno', 'electronic', 'electronics',
       'electronicsaurora', 'electronicsseifelden', 'etcessentials',
       'factoryhotcool', 'flash', 'frogice', 'fujitsu', 'future', 'gb',
       'global', 'hardwareintelligent', 'high', 'hitachi', 'holster',
       'hqlaptop', 'industrial', 'industry', 'international', 'inventory',
       'ipad', 'junsi', 'knightrikki', 'koolcase', 'laptop', 'lifetime',
       'limitedues', 'llccactus', 'llcfocus', 'ltddongguan', 'ltdsingo',
       'ltdtsmine', 'maidun', 'manufacturer', 'maxtordepending', 'medium',
       'memory', 'memorymemzi', 'metal', 'micro', 'mini', 'moregraphics',
       'network', 'nutra', 'partsbay', 'pc', 'piraspb

In [None]:
full_text_processing(df6.title)

<1x100 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB


# Set X and y
X = df6[["brand_cat","title_count","desc_count","img_count","title","description","brand"]]
y=df6["rank_cat"]

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3)

# initialise model and vectorizers
model = MultinomialNB()
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()
vectorizer3 = TfidfVectorizer()
scaler1 = StandardScaler()
scaler2 = StandardScaler()

# construct the column transfomer
column_transformer = ColumnTransformer(
    [('tfidf1', vectorizer1, 'title'), 
     ('tfidf2', vectorizer2, 'description'),
    ('tfidf3', vectorizer3, 'brand')],
    remainder='passthrough')

# fit the model
pipe = Pipeline([
                  ('tfidf', column_transformer),
                  ('classify', model)
                ])
pipe.fit(X_train,y_train)

In [23]:
pipe.score(X_test,y_test)

0.5770516152989231

## Pipeline

In [None]:
df6.columns
# numerical>['title_count', 'desc_count', 'img_count']
# categorical>['brand_cat']

Index(['description', 'title', 'image', 'brand', 'rank', 'date', 'asin',
       'brand_cat', 'rank1', 'title_count', 'desc_count', 'img_count'],
      dtype='object')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank"]

In [None]:
num_transformer = make_pipeline(StandardScaler())

preproc_basic = make_column_transformer((num_transformer, ['title_count', 'desc_count','img_count']),
                                       (full_text_processing, ['title', 'description','brand']),
                                       remainder='passthrough')

pipe = make_pipeline(preproc_basic, LogisticRegression())
pipe

In [None]:
transformed_features=preproc_basic.fit_transform(X)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '<function full_text_processing at 0x13465f550>' (type <class 'function'>) doesn't.

In [None]:
df6["rank1"]

0         654681
1         830165
2        1233887
3         454595
4          85661
          ...   
18767     372769
18768     308759
18769       4064
18770     449492
18771     278362
Name: rank1, Length: 17952, dtype: int64

In [None]:
pd.cut(df6["rank1"],10)

0         (-9410.66, 946520.0]
1         (-9410.66, 946520.0]
2        (946520.0, 1892986.0]
3         (-9410.66, 946520.0]
4         (-9410.66, 946520.0]
                 ...          
18767     (-9410.66, 946520.0]
18768     (-9410.66, 946520.0]
18769     (-9410.66, 946520.0]
18770     (-9410.66, 946520.0]
18771     (-9410.66, 946520.0]
Name: rank1, Length: 17952, dtype: category
Categories (10, interval[float64, right]): [(-9410.66, 946520.0] < (946520.0, 1892986.0] < (1892986.0, 2839452.0] < (2839452.0, 3785918.0] ... (5678850.0, 6625316.0] < (6625316.0, 7571782.0] < (7571782.0, 8518248.0] < (8518248.0, 9464714.0]]

## Deep Learning model

In [30]:
from tensorflow.keras.utils import to_categorical 

In [10]:
import numpy as np 

df6['rank_binss'] = pd.cut(df6['rank1'], bins = 40, labels=[i for i in range(1,41)],include_lowest=True).astype('str')
df6 = df6.sort_values(by='rank1')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
df6['rank_binss']=df6['rank_binss'].map(lambda x: x.replace("-9410.661","0"))
df6['rank_binss']=df6['rank_binss'].map(lambda x: x.replace("40","10"))
df6['rank_binss'].value_counts()

1     5074
2     3900
3     3185
4     2018
5     1077
6     1021
7      975
8      577
9      124
10       1
Name: rank_binss, dtype: int64

In [12]:
from tensorflow.keras import Sequential, layers

In [29]:
df6.rank_binss=df6.rank_binss.astype("int64")
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17952 entries, 13181 to 7670
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  17952 non-null  object
 1   title        17952 non-null  object
 2   image        17952 non-null  object
 3   brand        17952 non-null  object
 4   rank         17952 non-null  object
 5   date         17952 non-null  object
 6   asin         17952 non-null  object
 7   brand_cat    17952 non-null  int64 
 8   rank1        17952 non-null  int64 
 9   title_count  17952 non-null  int64 
 10  desc_count   17952 non-null  int64 
 11  img_count    17952 non-null  int64 
 12  rank_cat     17952 non-null  int64 
 13  rank_binss   17952 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 2.1+ MB


In [32]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank_binss"]
y_cat=to_categorical(y)
         
X_train,X_test,y_train,y_test = train_test_split(X,y_cat,test_size=0.3)
X_train.shape,X_test.shape,y_train.shape

((12566, 4), (5386, 4), (12566, 11))

In [39]:
model = Sequential()
model.add(layers.Dense(100, activation='relu', input_dim=4)) #input_dim=no of features
model.add(layers.Dense(80, activation='relu'))
model.add(layers.Dense(30, activation='linear'))
model.add(layers.Dense(11, activation='softmax'))

In [40]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

In [45]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=20)

In [46]:
model.fit(X_train, y_train,epochs=100, 
          validation_split=0.3,
          callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100


<keras.callbacks.History at 0x7f7c71ad9dd0>

In [47]:

model.evaluate(X_test, y_test)



[1.8034168481826782, 0.2907538115978241]