## Amazon data 

In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [2]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz

--2022-05-24 09:18:23--  http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7010521 (6.7M) [application/octet-stream]
Saving to: ‘meta_Computers.json.gz’


2022-05-24 09:18:25 (4.09 MB/s) - ‘meta_Computers.json.gz’ saved [7010521/7010521]



In [3]:
### load the meta data

data = []
with gzip.open('meta_Computers.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print(len(data))

# first row of the list
print(data[0])

18772
{'description': ['Brand new. Still in factory-sealed plastic.'], 'title': 'Kelby Training DVD: The Adobe Photoshop CS5 7-Point System for Camera Raw By Scott Kelby', 'image': ['https://images-na.ssl-images-amazon.com/images/I/31IlLImCVJL._SS40_.jpg'], 'brand': 'Kelby Training', 'rank': ['>#654,681 in Electronics (See Top 100 in Electronics)'], 'main_cat': 'Computers', 'date': 'December 2, 2011', 'asin': '0321732960'}


In [5]:
# convert list into pandas dataframe
df = pd.DataFrame.from_dict(data)
def list_to_pd_dataframe(df):
#     df = pd.DataFrame.from_dict(data)
    df3 = df.fillna('')
    df5 = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows
    return df5

df5=list_to_pd_dataframe(df)

In [6]:
def cleandata(df5):
    df5.description=df5.description.map(lambda x: str(x))
    df5.description=df5.description.str.strip("[]'")
    df5.drop(columns=['feature', 'tech1', 'also_buy', 'price', 'also_view', 'tech2','details', 'similar_item',"main_cat"],inplace=True)
    df5['brand_cat']=df5.brand.map(lambda x : 0 if len(x) <1 else 1)
    df5['rank1']=df5['rank'].map(lambda x: str(x).strip("[]'"))
    df5["rank1"]=df5["rank1"].map(lambda x: str(x))
    df5["rank1"]=df5["rank1"].str.replace(pat='>#',repl='', regex=False)
    df5["rank1"]=df5["rank1"].map(lambda x: x.replace(",",""))
    df5["rank1"]=df5["rank1"].map(lambda x: x.split("in"))
    df5["rank1"] = df5["rank1"].map(lambda x: x[0])
    df5["title_count"] = df5["title"].map(lambda x: len(x))
    df5["desc_count"] = df5["description"].map(lambda x: len(x))
    df5["img_count"] = df5["image"].map(lambda x: len(x))
    df6=df5[df5["rank1"]!='']
    df6["rank1"]=df6["rank1"].astype("int64")
    return df6

df6=cleandata(df5)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6["rank1"]=df6["rank1"].astype("int64")


In [7]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank1"]
         
         
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
X_train.shape,X_test.shape

model = LinearRegression().fit(X_train, y_train)

## Clean Data

In [10]:
df6.describe()

Unnamed: 0,brand_cat,rank1,title_count,desc_count,img_count
count,17952.0,17952.0,17952.0,17952.0,17952.0
mean,0.974265,592975.2,112.436776,129.966633,2.968861
std,0.158349,491224.6,71.24236,350.335807,2.374704
min,0.0,54.0,0.0,0.0,0.0
25%,1.0,201725.0,69.0,0.0,1.0
50%,1.0,473359.5,97.0,0.0,3.0
75%,1.0,848467.5,144.0,9.0,5.0
max,1.0,9464714.0,2000.0,5146.0,46.0


## Using SVM Model

In [22]:
from sklearn.svm import SVC

In [24]:
print(df6.rank1.median())
df6["rank_cat"]=df6["rank1"].map(lambda x: 1 if x<473359 else 0)

473359.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6["rank_cat"]=df6["rank1"].map(lambda x: 1 if x<473359 else 0)


In [None]:
X = df6[["brand_cat","title_count","desc_count","img_count"]]
y=df6["rank_cat"]

X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3)

svc=SVC(kernel="linear",C=10)
model=svc.fit(X_train,y_train)
model.score(X_test,y_test)

## SVM Regressors