## Analysis of data from alpha_crawler


In [26]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from textblob import Word


**Load Data**

In [87]:
# Read the crawler's CSV export into a DataFrame and preview the first three rows.
data = pd.read_csv(filepath_or_buffer='./tmp/scrapped_data/data1.csv')
data.head(3)

Unnamed: 0,title,price,image_urls
0,Red Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/vElrHfaJXtTRe2z8c3C0ePr7Mk...
1,Brown Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/17C83I4ziKVIC46gQQYUDt7GtC...
2,Navy Blue Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/rgpDMpDbA4XcfjeejILpRCLLnP...


In [88]:
# Print a summary of the loaded frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9869 entries, 0 to 9868
Data columns (total 3 columns):
title         9869 non-null object
price         9869 non-null float64
image_urls    9869 non-null object
dtypes: float64(1), object(2)
memory usage: 231.4+ KB


**Pre-process text**

**Transform into lower case and remove `-`** (apostrophes such as the one in `lady's` are kept)

In [89]:
def _normalise_title(title):
    """Return `title` as a lower-cased string with every hyphen removed."""
    return str(title).lower().replace("-", "")


# Derive the working `label` column from the raw `title` column, then preview five rows.
data['label'] = data['title'].map(_normalise_title)
data.head(5)

Unnamed: 0,title,price,image_urls,label
0,Red Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/vElrHfaJXtTRe2z8c3C0ePr7Mk...,red leather 2 in 1 lady's handbag
1,Brown Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/17C83I4ziKVIC46gQQYUDt7GtC...,brown leather 2 in 1 lady's handbag
2,Navy Blue Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/rgpDMpDbA4XcfjeejILpRCLLnP...,navy blue leather 2 in 1 lady's handbag
3,Black Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/IIJRWlER6Bid6XJgYPzMhhlrTs...,black leather 2 in 1 lady's handbag
4,Women Quartz Watch Simple Ultra-thin Dial Fema...,2800.0,https://ke.jumia.is/B7-ZTG8XaUT45bTom3nioOFfnw...,women quartz watch simple ultrathin dial femal...


**Remove stop words**

In [90]:
# Drop English stop words from every label.
# `stop` is kept as the original list; membership is tested against a set so
# each token lookup is constant-time instead of a scan over the whole list.
stop = stopwords.words('english')
stop_lookup = set(stop)

# Distinct names for the label text and its tokens avoid the original lambda's
# shadowing of `x` by the inner generator variable `x`.
data['label'] = data['label'].apply(
    lambda text: " ".join(token for token in str(text).split() if token not in stop_lookup)
)
data.head(3)

Unnamed: 0,title,price,image_urls,label
0,Red Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/vElrHfaJXtTRe2z8c3C0ePr7Mk...,red leather 2 1 lady's handbag
1,Brown Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/17C83I4ziKVIC46gQQYUDt7GtC...,brown leather 2 1 lady's handbag
2,Navy Blue Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/rgpDMpDbA4XcfjeejILpRCLLnP...,navy blue leather 2 1 lady's handbag


**Lemmatize**

In [91]:
def _lemmatize_label(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    return " ".join(Word(token).lemmatize() for token in str(text).split())


# Replace each label with its lemmatized form, then preview three rows.
data['label'] = data['label'].map(_lemmatize_label)
data.head(3)

Unnamed: 0,title,price,image_urls,label
0,Red Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/vElrHfaJXtTRe2z8c3C0ePr7Mk...,red leather 2 1 lady's handbag
1,Brown Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/17C83I4ziKVIC46gQQYUDt7GtC...,brown leather 2 1 lady's handbag
2,Navy Blue Leather 2 in 1 Lady's Handbag,3500.0,https://ke.jumia.is/rgpDMpDbA4XcfjeejILpRCLLnP...,navy blue leather 2 1 lady's handbag


**Most featured words**

In [92]:
# Join all labels into one string, split it on whitespace, count each token
# and keep the eleven most frequent ones.
common = (
    pd.Series(' '.join(data['label']).split())
    .value_counts()
    .head(11)
)
common

woman      5451
fashion    2212
size:      1949
sexy       1936
shoe       1864
dress      1787
skirt      1707
long       1684
sleeve     1613
casual     1532
summer     1275
dtype: int64

In [94]:
# Select the cleaned label together with the price and image URL columns.
df = data[['label', 'price', 'image_urls']]
# `df.t` is not a DataFrame attribute and raises AttributeError on a fresh run;
# preview the first ten rows instead.
df.head(10)

Unnamed: 0,label,price,image_urls
0,red leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/vElrHfaJXtTRe2z8c3C0ePr7Mk...
1,brown leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/17C83I4ziKVIC46gQQYUDt7GtC...
2,navy blue leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/rgpDMpDbA4XcfjeejILpRCLLnP...
3,black leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/IIJRWlER6Bid6XJgYPzMhhlrTs...
4,woman quartz watch simple ultrathin dial femal...,2800.0,https://ke.jumia.is/B7-ZTG8XaUT45bTom3nioOFfnw...
5,quartz watch simple ultrathin dial men's wrist...,2800.0,https://ke.jumia.is/6UPXEochdRPOFnMd3HsL-fGWpo...
6,grey leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/N6tGygTnf4r4RySVGA-wGtKWhc...
7,peach leather 2 1 lady's handbag,3500.0,https://ke.jumia.is/9rjsfpn5PQv3u-hQFLZs0N9uA3...
8,quartz watch simple ultrathin dial unisex wris...,2800.0,https://ke.jumia.is/vONc1EQ4h2c9FOcxakrntUiCBW...
9,regular analog watch unisex,2900.0,https://ke.jumia.is/osakmDd6pgaI7g1YCN_tY67Ngc...
