## Analysis of Fashion Text

In [14]:
import numpy as np
import pandas as pd

In [17]:
df = pd.read_csv('../data/w-fashion',index_col=0)

df[:3]

Unnamed: 0,img_link,name,price
0,https://ke.jumia.is/pLMwunB9ALtsfocE238j1DBn4_...,Air Mesh Lady Training Dance Shoes Light Weigh...,KSh 3300
1,https://ke.jumia.is/v-XE6DB1Vo2_uCqmyFAam7k-dN...,Print Off Shoulder Women Dress - Light Green,KSh 700
2,https://ke.jumia.is/4GtkG8rZz81JUvUmOZ6dgu0y0U...,Women Cashmeres Belted Wool Coat - Brown,KSh 1800 - KSh 2200


#####  Word Count

In [18]:
df['word_count'] = df['name'].apply(lambda x: len(str(x).split(" ")))
df[['name','word_count']][:3]

Unnamed: 0,name,word_count
0,Air Mesh Lady Training Dance Shoes Light Weigh...,13
1,Print Off Shoulder Women Dress - Light Green,8
2,Women Cashmeres Belted Wool Coat - Brown,7


## Basic pre-processing

**Stop Words**

In [124]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('english')

In [126]:
df['stopwords'] = df['name'].apply(lambda x: len([x for x in str(x).split() if x in stop]))

df[['name','stopwords']][:3]


Unnamed: 0,name,stopwords
0,air mesh lady training dance shoes light weigh...,1
1,print off shoulder women dress light green,1
2,women cashmeres belted wool coat brown,0


**Transform into lower case, remove `'` & `-`**

In [127]:
df['name'] = df['name'].apply(lambda x: str(x).lower().replace("[-,', ]",""))

df['name'][:3]

0    air mesh lady training dance shoes light weigh...
1          print off shoulder women dress  light green
2              women cashmeres belted wool coat  brown
Name: name, dtype: object

**Remove stop words**

In [128]:
df['name2'] = df['name'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
df[['name','stopwords','name2']][:3]

Unnamed: 0,name,stopwords,name2
0,air mesh lady training dance shoes light weigh...,1,air mesh lady training dance shoes light weigh...
1,print off shoulder women dress light green,1,print shoulder women dress light green
2,women cashmeres belted wool coat brown,0,women cashmeres belted wool coat brown


**Most Common words**

In [129]:
most_common = pd.Series(' '.join(df['name2']).split()).value_counts()[:11]
most_common


women      1858
shoes      1048
high        949
skirt       768
womens      742
casual      705
dress       621
fashion     550
new         504
heels       482
pants       472
dtype: int64

**Rare words**

In [130]:
rare_words = pd.Series(' '.join(df['name2']).split()).value_counts()[-10:]
rare_words

modeling        1
black(int:l)    1
lean            1
bodybon         1
0167            1
sloth           1
joint           1
beltapricot     1
buttocksrose    1
heavy           1
dtype: int64

**Stemming**

stripping suffixes - ing , s, ly

In [131]:
from nltk.stem import PorterStemmer

In [132]:
st = PorterStemmer()

In [133]:
df['name2'][:3]

0    air mesh lady training dance shoes light weigh...
1               print shoulder women dress light green
2               women cashmeres belted wool coat brown
Name: name2, dtype: object

In [134]:
df['text3'] = df['name'].apply(lambda x: " ".join([st.stem(word) for word in str(x).split()]))
df['text3'][:3]

0    air mesh ladi train danc shoe light weight spo...
1           print off shoulder women dress light green
2                   women cashmer belt wool coat brown
Name: text3, dtype: object

**Lemmatization**

 converting a word into its root word

In [135]:
from textblob import Word

In [136]:
df['text2'] = df['name2'].apply(lambda x: " ".join([Word(word).lemmatize() for word in str(x).split()])) 
df['text2'][:3]

0    air mesh lady training dance shoe light weight...
1               print shoulder woman dress light green
2                woman cashmere belted wool coat brown
Name: text2, dtype: object

In [137]:
df[['name2','text2','text3']][:3]

Unnamed: 0,name2,text2,text3
0,air mesh lady training dance shoes light weigh...,air mesh lady training dance shoe light weight...,air mesh ladi train danc shoe light weight spo...
1,print shoulder women dress light green,print shoulder woman dress light green,print off shoulder women dress light green
2,women cashmeres belted wool coat brown,woman cashmere belted wool coat brown,women cashmer belt wool coat brown


**Most commmon words after Lemmatizing**

In [138]:
common = pd.Series(' '.join(df['text2']).split()).value_counts()[:11]
common2

woman      2677
shoe       1084
high        949
skirt       855
heel        739
dress       712
casual      705
fashion     550
pant        515
new         504
sexy        453
dtype: int64

In [139]:
rare2 = pd.Series(' '.join(df['text2']).split()).value_counts()[-11:]
rare2

bk/37black    1
(thin         1
flipflop      1
zwm1281a      1
jegging       1
luxurious     1
shoesarmy     1
draped        1
rosy          1
0188          1
black&red     1
dtype: int64

**N-grams**

combination of multiple words used together

In [142]:
from textblob import TextBlob

In [154]:
all_words = pd.Series(' '.join(df['text2']).split())
type(all_words)

pandas.core.series.Series

In [161]:
wiki = TextBlob(df['text2'][0]).ngrams(3)
wiki

[WordList(['air', 'mesh', 'lady']),
 WordList(['mesh', 'lady', 'training']),
 WordList(['lady', 'training', 'dance']),
 WordList(['training', 'dance', 'shoe']),
 WordList(['dance', 'shoe', 'light']),
 WordList(['shoe', 'light', 'weight']),
 WordList(['light', 'weight', 'sport']),
 WordList(['weight', 'sport', 'shoe']),
 WordList(['sport', 'shoe', 'woman']),
 WordList(['shoe', 'woman', 'white'])]

In [176]:
wiki.words

AttributeError: 'list' object has no attribute 'words'

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

In [78]:
vect = CountVectorizer()

In [79]:
vect.fit(df['name2'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [84]:
print(len(vect.vocabulary_))



2276


In [94]:
vect.vocabulary_

{'air': 150,
 'mesh': 1208,
 'lady': 1086,
 'training': 2069,
 'dance': 592,
 'shoes': 1702,
 'light': 1131,
 'weight': 2194,
 'sport': 1890,
 'women': 2220,
 'white': 2198,
 'print': 1483,
 'shoulder': 1747,
 'dress': 634,
 'green': 876,
 'cashmeres': 435,
 'belted': 262,
 'wool': 2231,
 'coat': 496,
 'brown': 366,
 'polka': 1469,
 'dot': 627,
 'long': 1147,
 'sleeve': 1831,
 'patchwork': 1402,
 'black': 273,
 'elegant': 683,
 'sheath': 1682,
 'cape': 422,
 'bohemian': 303,
 'maxi': 1200,
 'blue': 283,
 'fashion': 727,
 'sleeveless': 1833,
 'striped': 1953,
 'belt': 259,
 'ol': 1315,
 'short': 1735,
 'blazer': 275,
 'female': 734,
 'business': 380,
 'pencil': 1416,
 'big': 265,
 'swing': 1994,
 'red': 1543,
 'pleated': 1452,
 'frill': 822,
 'ruched': 1596,
 'high': 954,
 'waist': 2169,
 'skirt': 1790,
 'womens': 2226,
 'retro': 1551,
 'bodycon': 297,
 'purple': 1520,
 'official': 1310,
 'suit': 1967,
 'teal': 2011,
 'floral': 792,
 'yellow': 2255,
 'denim': 605,
 'trouser': 2085,
 'to

In [96]:
bag_of_words = vect.transform(df['name2'])
bag_of_words

<3730x2276 sparse matrix of type '<class 'numpy.int64'>'
	with 39489 stored elements in Compressed Sparse Row format>

In [98]:
bag_of_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])