In [13]:
# Import packages
# TODO: import fuzzywuzzy?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import string
import nltk
import re
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import warnings
warnings.filterwarnings('ignore')



src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# helper functions
from d02_processing.cleaning_signatures import sorted_signatures
from d02_processing.cleaning_signatures import cleaned_signatures
from d01_utils.mongo_cursor_creator import mongo_cursor

# Load the "autoreload" extension
%load_ext autoreload

# reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Pull the unique forum signatures out of the database via the project helper.
# NOTE(review): the preview below assumes the helper returns a list-like of
# strings -- confirm in d01_utils.mongo_cursor_creator.
sigs_from_cursor = mongo_cursor()

# Preview the first five signatures.
sigs_from_cursor[:5]

['',
 '2c/3a, fine texture, normal porosity, medium-to-thick\nCo wash: Vo5 Extra Body\nConditioners: Deva One Condition, GVP CB, Loreal Evercurl\nStylers: Devacurl ArcAngel, KCCC, AG Recoil, Iso Bouncy Creme, Ecostyler Krystal, CCCCL, KCKT\nLow Poo: Devacurl NoPoo, GTTT\nHair likes: plopping, diffusing, SMaster\'s, coconut and argan oil, honey, protein\n"What makes a woman unforgettable? Her mind...surrounded by lots of naturally curly hair"',
 'True 3B\ncoarse, overly porous, normal elasticity, thick\nOn the long transitioning road...\nDiscovered Curly: November 24, 2009:toothy3:\nHates: humectants, protein, rain, wind, humidity, plopping\nLoves: moisture, ecostyler, sallys conditioning balm\n\n:wav:\n\n\n\n\n\n\n\n\n',
 "Silence grows\nMy feelings flow\nI'm dreaming now\nOf all the things I know\n",
 "I've learned that no matter what happens, or how bad it seems today, life does go on, and it will be better tomorrow.\n\n3 b/c shoulder length hair. Black-brown color. It's always mista

In [5]:
# Run the raw signatures through the project's `sorted_signatures` helper.
# Judging by the preview, the result is a DataFrame with curl_pattern, density,
# porosity, texture and products columns -- see d02_processing for details.
raw_characteristics_df = sorted_signatures(sigs_from_cursor)

# Show the first five rows.
raw_characteristics_df.head(5)

Unnamed: 0,curl_pattern,density,porosity,texture,products
0,,,,,
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
3,3b,thick,,,true coarse overly porous normal elasticity on...
4,,thin,,,silence grows my feelings flow i m dreaming no...


In [6]:
# Apply the project's `cleaned_signatures` helper to the sorted frame.
# NOTE(review): the exact cleaning rules live in d02_processing.cleaning_signatures.
cleaned_df = cleaned_signatures(raw_characteristics_df)

# Show the first five rows of the cleaned frame.
cleaned_df.head(5)

Unnamed: 0,curl_pattern,density,porosity,texture,products
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
3,3b,thick,,,true coarse overly porous normal elasticity on...
4,,thin,,,silence grows my feelings flow i m dreaming no...
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...


In [12]:
# Build a data frame that keeps only the rows with a known curl pattern
# (the other characteristic columns may still contain nulls).
curl_pattern_no_nan_df = cleaned_df[cleaned_df['curl_pattern'].notna()]

# Summary statistics first, then a preview of the filtered rows.
print(curl_pattern_no_nan_df.describe())
curl_pattern_no_nan_df.head(5)

       curl_pattern density porosity texture products
count          7405    2497     2125    2231     7405
unique           10       3        3       3     4854
top              3a   thick   normal    fine         
freq           2019    1086      831    1656       56


Unnamed: 0,curl_pattern,density,porosity,texture,products
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...
3,3b,thick,,,true coarse overly porous normal elasticity on...
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...
10,3c,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...


In [19]:
# Add a categorical copy of the curl pattern column; the column name keeps its
# existing spelling because later cells look it up by that name.
cleaned_df['curl_catagory'] = cleaned_df['curl_pattern'].astype('category')

# Confirm the new column carries the `category` dtype.
cleaned_df.dtypes


curl_pattern       object
density            object
porosity           object
texture            object
products           object
curl_catagory    category
dtype: object

In [21]:
# Replace the categorical column with its integer codes.
# The codes are derived from `curl_pattern` rather than from the current
# contents of `curl_catagory`, so the cell is idempotent: the previous form
# called `.cat.codes` on the column it overwrote and raised on a second run,
# because the column was no longer categorical.
# Rows with a missing curl pattern get the code -1.
cleaned_df["curl_catagory"] = (
    cleaned_df["curl_pattern"].astype("category").cat.codes
)
cleaned_df.head()

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
1,2c,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,3
2,3a,thick,normal,fine,to co wash vo5 extra body conditioners deva ...,4
3,3b,thick,,,true coarse overly porous normal elasticity on...,5
4,,thin,,,silence grows my feelings flow i m dreaming no...,-1
9,4a,,,,toothy4 mix toothy4 bc 09 28 12 occasion9 gro...,7


In [23]:
# TF-IDF over the free-text `products` column: 1- to 4-grams, English stop
# words removed, terms kept only if they occur in at least 5 documents,
# sublinear term-frequency scaling and L2-normalised rows.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 4),
    stop_words='english',
)

# Dense document-term matrix, one row per signature.
features = tfidf.fit_transform(cleaned_df['products']).toarray()

# Integer curl-pattern codes used as class labels.
labels = cleaned_df['curl_catagory']

features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Number of top-scoring terms to print per n-gram size.
N = 2

# Map each curl pattern label to its integer code. This mapping was used below
# without being defined in the notebook, so build it here from the cleaned
# frame. Rows without a curl pattern are dropped: they carry no label, and a
# NaN key would break the sort in the loop.
category_to_id = (
    cleaned_df.dropna(subset=['curl_pattern'])
    .drop_duplicates(subset='curl_pattern')
    .set_index('curl_pattern')['curl_catagory']
    .to_dict()
)

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older versions.
# Computed once here because it does not change between iterations.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for curl_pattern, category_id in sorted(category_to_id.items()):
    # One-vs-rest chi-squared test: this curl pattern against everything else.
    features_chi2 = chi2(features, labels == category_id)
    # Ascending order of the chi2 statistic, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(curl_pattern))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))