In [29]:
# Import packages
# import fuzzywuzzy?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import string
import nltk
import re
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')



src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# helper functions
from d02_processing.cleaning_signatures import sorted_signatures
from d02_processing.cleaning_signatures import cleaned_signatures
from d01_utils.mongo_cursor_creator import mongo_cursor

# Load the "autoreload" extension
%load_ext autoreload

# reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Pull the unique signatures out of the database via the project helper,
# then preview the first five entries.
sigs_from_cursor = mongo_cursor()
sigs_from_cursor[:5]

['',
 '2c/3a, fine texture, normal porosity, medium-to-thick\nCo wash: Vo5 Extra Body\nConditioners: Deva One Condition, GVP CB, Loreal Evercurl\nStylers: Devacurl ArcAngel, KCCC, AG Recoil, Iso Bouncy Creme, Ecostyler Krystal, CCCCL, KCKT\nLow Poo: Devacurl NoPoo, GTTT\nHair likes: plopping, diffusing, SMaster\'s, coconut and argan oil, honey, protein\n"What makes a woman unforgettable? Her mind...surrounded by lots of naturally curly hair"',
 'True 3B\ncoarse, overly porous, normal elasticity, thick\nOn the long transitioning road...\nDiscovered Curly: November 24, 2009:toothy3:\nHates: humectants, protein, rain, wind, humidity, plopping\nLoves: moisture, ecostyler, sallys conditioning balm\n\n:wav:\n\n\n\n\n\n\n\n\n',
 "Silence grows\nMy feelings flow\nI'm dreaming now\nOf all the things I know\n",
 "I've learned that no matter what happens, or how bad it seems today, life does go on, and it will be better tomorrow.\n\n3 b/c shoulder length hair. Black-brown color. It's always mista

In [5]:
# Run the raw signatures through the project's `sorted_signatures` helper
# and preview the first five rows of the resulting frame.
raw_characteristics_df = sorted_signatures(sigs_from_cursor)
raw_characteristics_df.head(5)

Unnamed: 0,curl_pattern,density,porosity,texture,products
0,,,,,
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
3,3b,thick,,,true coarse overly porous normal elasticity on...
4,,thin,,,silence grows my feelings flow i m dreaming no...


In [6]:
# Apply the project's `cleaned_signatures` helper to the sorted frame and
# preview the first five rows of the result.
cleaned_df = cleaned_signatures(raw_characteristics_df)
cleaned_df.head(5)

Unnamed: 0,curl_pattern,density,porosity,texture,products
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
3,3b,thick,,,true coarse overly porous normal elasticity on...
4,,thin,,,silence grows my feelings flow i m dreaming no...
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...


In [54]:
# Make a data frame containing only the rows whose curl_pattern is known.
# `.copy()` yields an independent frame, so the column assignment below writes
# to it directly instead of to a slice of `cleaned_df` (chained assignment).
curl_pattern_no_nan_df = cleaned_df[cleaned_df['curl_pattern'].notna()].copy()

# Encode each curl pattern as an integer code. pd.Categorical infers the
# categories from the values, and `.codes` gives each row's position in them.
curl_pattern_no_nan_df['curl_catagory'] = pd.Categorical(
    curl_pattern_no_nan_df['curl_pattern']
).codes

# describe() only summarises numeric columns, i.e. the new code column here.
print(curl_pattern_no_nan_df.describe())
curl_pattern_no_nan_df.head()

       curl_catagory
count    7405.000000
mean        4.342201
std         1.686252
min         0.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         9.000000


Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,3
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,4
3,3b,thick,,,true coarse overly porous normal elasticity on...,5
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...,7
10,3c,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...,6


In [19]:
# Add a category-dtype copy of curl_pattern to the full cleaned frame,
# then list the column dtypes to confirm the new column's type.
cleaned_df['curl_catagory'] = cleaned_df['curl_pattern'].astype('category')
cleaned_df.dtypes


curl_pattern       object
density            object
porosity           object
texture            object
products           object
curl_catagory    category
dtype: object

In [21]:
# Store the integer category code of each row's curl_pattern.
# The codes are computed from `curl_pattern` itself rather than through the
# `.cat` accessor of the existing column, so this cell can be re-run safely
# (after the first run the column is integer-typed and has no `.cat`).
# Rows with a missing curl_pattern receive the code -1.
cleaned_df["curl_catagory"] = pd.Categorical(cleaned_df["curl_pattern"]).codes
cleaned_df.head()

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,3
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,4
3,3b,thick,,,true coarse overly porous normal elasticity on...,5
4,,thin,,,silence grows my feelings flow i m dreaming no...,-1
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...,7


In [24]:
# Inspect the free-text `products` column of the cleaned frame.
cleaned_df['products']

1          to co wash vo5 extra body conditioners deva ...
2          to co wash vo5 extra body conditioners deva ...
3        true coarse overly porous normal elasticity on...
4        silence grows my feelings flow i m dreaming no...
9         toothy4 mix toothy4 bc 09 28 12 occasion9 gro...
10        toothy4 mix toothy4 bc 09 28 12 occasion9 gro...
11        afro i love4 water he my baby daddy is the de...
12        afro i love4 water he my baby daddy is the de...
13       hair stats i guess  shoulder length tends to f...
14       hair stats i guess  shoulder length tends to f...
15       bc d 6 5 2010 last relaxed 5 16 2009 mostly pr...
16                                             on the side
17        sigpic sigpic littlemisscurls shutterfly com ...
18        sigpic sigpic with little s thrown in to make...
19        sigpic sigpic with little s thrown in to make...
20       i k i m a definitely a corkscrew currently on ...
21        wavy baby hair low low poo lush karma komba h.

In [55]:
# TF-IDF vectoriser over the free-text `products` column:
#   - 1- to 4-word n-grams, English stop words removed
#   - terms appearing in fewer than 5 documents are dropped (min_df)
#   - sublinear term-frequency scaling and L2-normalised rows
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 4),
    stop_words='english',
)

# Dense document-term matrix (one row per signature) and the matching
# integer curl-pattern codes used as class labels.
features = tfidf.fit_transform(curl_pattern_no_nan_df['products']).toarray()
labels = curl_pattern_no_nan_df['curl_catagory']
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [56]:
# One row per distinct (curl_pattern, code) pair, ordered by code.
curl_catagory_df = (
    curl_pattern_no_nan_df[['curl_pattern', 'curl_catagory']]
    .drop_duplicates()
    .sort_values('curl_catagory')
)

# Lookup tables in both directions: pattern -> code, and code -> pattern.
curl_cat_key_dict = dict(curl_catagory_df.values)
curl_pattern_key_dict = {code: pattern for pattern, code in curl_cat_key_dict.items()}

print(curl_cat_key_dict)
print(curl_pattern_key_dict)

{'1c': 0, '2a': 1, '2b': 2, '2c': 3, '3a': 4, '3b': 5, '3c': 6, '4a': 7, '4b': 8, '4c': 9}
{0: '1c', 1: '2a', 2: '2b', 3: '2c', 4: '3a', 5: '3b', 6: '3c', 7: '4a', 8: '4b', 9: '4c'}


In [60]:
N = 2  # how many of the highest-scoring n-grams to print per n-gram length

# The vocabulary does not change between classes, so fetch it once.
# Newer scikit-learn releases provide get_feature_names_out(); older ones only
# have get_feature_names(), so fall back to that when the new name is absent.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.array(_get_feature_names())

# Number of space-separated tokens in each vocabulary entry (1 = unigram, ...).
ngram_sizes = np.array([len(name.split(' ')) for name in all_feature_names])
ngram_labels = [(1, 'unigrams'), (2, 'bigrams'), (3, 'trigrams'), (4, 'quadgrams')]

for curl_pattern, curl_cat in sorted(curl_cat_key_dict.items()):
    # One-vs-rest test: chi2 of every feature against "is this curl pattern".
    features_chi2 = chi2(features, labels == curl_cat)
    # Ascending order of the chi2 statistic, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    sorted_sizes = ngram_sizes[indices]

    print("# '{}':".format(curl_pattern))
    for size, label in ngram_labels:
        top_terms = feature_names[sorted_sizes == size][-N:]
        print("  . Most correlated {}:\n. {}".format(label, '\n. '.join(top_terms)))

# '1c':
  . Most correlated unigrams:
. hell
. sense
  . Most correlated bigrams:
. com youtube
. shampoo weeks
  . Most correlated trigrams:
. jessicurl cleansing cream
. psalm 139 14
  . Most correlated quadgrams:
. poo mop hydrating shampoo
. tresemme naturals moisturizing conditioner
# '2a':
  . Most correlated unigrams:
. wavy
. elephant
  . Most correlated bigrams:
. canopy underneath
. wavy hair
  . Most correlated trigrams:
. healthy sexy hair
. ro ss caitlin
  . Most correlated quadgrams:
. normal elasticity low poo
. la looks mega hold
# '2b':
  . Most correlated unigrams:
. elasticity
. wavy
  . Most correlated bigrams:
. shampoo body
. ness low
  . Most correlated trigrams:
. beauticurls li curl
. shampoo body shop
  . Most correlated quadgrams:
. wash vo5 kiwi lime
. yes cucumbers color protection
# '2c':
  . Most correlated unigrams:
. relaxer
. coarse
  . Most correlated bigrams:
. naturals ro
. normal elasticity
  . Most correlated trigrams:
. coarse normal elasticity
.

In [58]:
# Predictors: the remaining hair characteristics plus the free-text products.
X = cleaned_df[['density', 'porosity', 'products', 'texture']]

# Target: integer code of each row's curl_pattern, built here from the source
# column so the cell does not depend on which earlier cells have been run.
# NOTE(review): the recorded TypeError (take_nd() ... 'axis') presumably came
# from `curl_catagory` still being category-dtype when this cell ran - confirm.
# NOTE(review): rows with a missing curl_pattern get the code -1 and are kept
# as their own class; decide whether they should be dropped before modelling.
y = cleaned_df['curl_pattern'].astype('category').cat.codes.rename('curl_catagory')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

TypeError: take_nd() got an unexpected keyword argument 'axis'

In [44]:
# Vocabulary of the fitted TF-IDF vectoriser as an array; show how many
# terms it contains.
feature_names = np.asarray(tfidf.get_feature_names())
feature_names.size

10794