In [48]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pymongo
import pickle
import string
import nltk
import re
import os
import sys
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import warnings
warnings.filterwarnings('ignore')



src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# helper functions
from d02_processing.cleaning_signatures import sorted_signatures
from d02_processing.cleaning_signatures import cleaned_signatures
from d01_utils.mongo_cursor_creator import mongo_cursor

# Load the "autoreload" extension
%load_ext autoreload

# reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load relevant data

In [3]:
df = pd.read_csv('../../data/02_intermediate/no_nan_curl_pattern_df.csv')
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
0,2b,,,coarse,poo deva nopoo co wash suave naturals rinse ou...,2
1,3b,thick,,,since feb 09 currently using low poo tj s nour...,5
2,4a,,,,deeplyenrichedoils com show me some love check...,7
3,3a,,,,hair type and lots of it hair color ranges fro...,4
4,3b,,,,hair type and lots of it hair color ranges fro...,5


In [25]:
with open('../../data/02_intermediate/Ulta_products.pkl', 'rb') as f:
    ulta = pickle.load(f)
ulta[:5]

['Redken All Soft Conditioner',
 'Paul Mitchell Tea Tree Special Shampoo',
 'Redken All Soft Shampoo',
 'Matrix Biolage Colorlast Conditioner',
 'Matrix Biolage Colorlast Shampoo']

In [28]:
with open('../../data/02_intermediate/Giovanni_products.pkl', 'rb') as f:
    giovanni = pickle.load(f)
giovanni[:5]

['giovanni tea tree triple treat invigorating shampoo ',
 'giovanni tea tree triple treat invigorating conditioner ',
 'giovanni smooth as silk deep moisture shampoo ',
 'giovanni smooth as silk deeper moisture conditioner ',
 'giovanni 5050 balanced hydratingclarifying shampoo ']

In [29]:
with open('../../data/02_intermediate/jessicurl_products_full_name.pkl', 'rb') as f:
    jessicurl = pickle.load(f)
jessicurl[:5]

['jessicurl gentle lather shampoo',
 'jessicurl hair cleansing cream',
 'jessicurl aloeba daily conditioner',
 'jessicurl deep treatment',
 'jessicurl too shea extra moisturizing conditioner']

In [164]:
with open('../../data/02_intermediate/kinky_curly_products.pkl', 'rb') as f:
    kinky_curly = pickle.load(f)
kinky_curly[:5]

['kinky curly seriously smooth prep protect',
 'kinky curly seriously smooth swift set lotion',
 'kinky curly seriously smooth fast dry foam',
 'kinky curly knot today',
 'kinky curly curling custard']

In [165]:
with open('../../data/02_intermediate/other_products.pkl', 'rb') as f:
    other_products = pickle.load(f)
other_products[:5]

['LA Looks Sport Look Gel',
 'TRESemmé Botanique Nourish and Replenish Conditioner',
 'As I Am Coconut Co-wash',
 'Yes to Carrots Nourishing Shampoo',
 'LOreal EverCreme Cleansing Conditioner']

In [166]:
product_list_full = ulta + giovanni + jessicurl + kinky_curly + other_products
product_list_full

['Redken All Soft Conditioner',
 'Paul Mitchell Tea Tree Special Shampoo',
 'Redken All Soft Shampoo',
 'Matrix Biolage Colorlast Conditioner',
 'Matrix Biolage Colorlast Shampoo',
 'Paul Mitchell Tea Tree Lavender Mint Moisturizing Conditioner',
 'Paul Mitchell Tea Tree Lavender Mint Moisturizing Shampoo',
 'Matrix Biolage Ultra Hydrasource Conditioner',
 'Redken Color Extend Magnetics Shampoo',
 'Pureology Hydrate Conditioner',
 'Pureology Hydrate Shampoo',
 'Redken Color Extend Blondage Color Depositing Purple Shampoo',
 'Redken Color Extend Shampoo',
 'Redken Color Extend Magnetics Conditioner',
 'Redken Extreme Conditioner',
 'Joico Color Balance Purple Shampoo',
 'Redken Color Extend Conditioner',
 'Paul Mitchell Tea Tree Special Conditioner',
 'Redken Extreme Shampoo',
 'AG Hair Moisture Fast Food Leave-On Conditioner',
 'Redken Color Extend Blondage Color Depositing Purple Conditioner',
 'Matrix Biolage Hydrasource Shampoo',
 "It's A 10 Miracle Deep Conditioner Plus Keratin",
 

In [167]:
# Clean up the products for puntation and characters

clean_no_acronymn_fix_products = []

for product in product_list_full:
    product = product.lower()
    product = re.sub(r"\–.*", " ", product)
    product = re.sub(r"\'" , '', product)
    product = re.sub(r'[^a-zA-Z0-9\s]', '', product)
    product = re.sub(r"\,.*", " ", product)
    product = re.sub(r"(leave in)", " ", product)
    product = re.sub(r"(rinse out)", " ", product)
    product = re.sub(r"(conditioner)", " ", product)
    product = re.sub(r"(gel)", " ", product)
    product = re.sub(r"(condition)", " ", product)
    product = re.sub(r"(shampoo)", " ", product)
    #product = re.sub(r"(poo)", " ", product)
    product = re.sub(r"(product)", " ", product)
    product = re.sub(r"(styler)", " ", product)
    product = re.sub(r"(creme)", " ", product)
    product = re.sub(r"(cream)", " ", product)
    product = re.sub(r"(spray)", " ", product)
    product = re.sub(r"(custard)", " ", product)
    product = re.sub('  ', ' ', product) 
    
    
    clean_no_acronymn_fix_products.append(product)

In [168]:
product_list_full[0]

'Redken All Soft Conditioner'

In [169]:
clean_no_acronymn_fix_products

['redken all soft ',
 'paul mitchell tea tree special ',
 'redken all soft ',
 'matrix biolage colorlast ',
 'matrix biolage colorlast ',
 'paul mitchell tea tree lavender mint moisturizing ',
 'paul mitchell tea tree lavender mint moisturizing ',
 'matrix biolage ultra hydrasource ',
 'redken color extend magnetics ',
 'pureology hydrate ',
 'pureology hydrate ',
 'redken color extend blondage color depositing purple ',
 'redken color extend ',
 'redken color extend magnetics ',
 'redken extreme ',
 'joico color balance purple ',
 'redken color extend ',
 'paul mitchell tea tree special ',
 'redken extreme ',
 'ag hair moisture fast food leaveon ',
 'redken color extend blondage color depositing purple ',
 'matrix biolage hydrasource ',
 'its a 10 miracle deep  plus keratin',
 'ag hair jumbo pump',
 'redken frizz dismiss sufatefree ',
 'joico color balance purple ',
 'matrix biolage ultra hydrasource ',
 'redken all soft mega ',
 'redken frizz dismiss sulfatefree ',
 'matrix biolage a

---

### Get rid of any product catagory items that do not have relevant words

In [157]:
required_product_words = ['shampoo', 'conditioner', 'poo', 'lopoo', 'nopoo', 'plop', 'diffuse', 'condish', 'STC',
                         'conditioner', 'gel', 'product', 'styler', 'LO', 'Leave in', 'RO']

In [158]:
searchfor = required_product_words
relevant_products_column_df = df[df['products'].str.contains('hair' or 'shampoo' or 'conditioner' or 'poo' or 'lopoo' or 'nopoo' or 'plop' or 'diffuse' or 'condish' or 'STC' or
                         'conditioner' or 'gel' or 'product' or 'styler' or 'LO' or 'Leave in' or 'RO', na =False)] 
relevant_products_column_df.reset_index(drop=True, inplace=True)
relevant_products_column_df.head(10)

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
0,2b,,,coarse,poo deva nopoo co wash suave naturals rinse ou...,2
1,4a,,,,deeplyenrichedoils com show me some love check...,7
2,3a,,,,hair type and lots of it hair color ranges fro...,4
3,3b,,,,hair type and lots of it hair color ranges fro...,5
4,2c,thick,high,fine,mostly with some and a very strange curl wave...,3
5,3a,thick,high,fine,mostly with some and a very strange curl wave...,4
6,4a,,,,demcalicurls hair type big chop june 2006 rel...,7
7,2b,thin,,,former then modified now too lazy p type i k a...,2
8,3a,thin,,,former then modified now too lazy p type i k a...,4
9,2b,,high,fine,very hair low poo living proof restore shampo...,2


In [159]:
# Just for Funsies check how many relevant full signatures we have'
# 339!!! Not a lot
full_sets_only_df = relevant_products_column_df.dropna()
print(full_sets_only_df.shape)
full_sets_only_df.head()

(339, 6)


Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
4,2c,thick,high,fine,mostly with some and a very strange curl wave...,3
5,3a,thick,high,fine,mostly with some and a very strange curl wave...,4
11,2c,medium,high,fine,wavy fii on the fia system current products l...,3
14,2a,medium,low,fine,still working on a solid hair care routine,1
16,3b,thick,low,coarse,my hair strands virgin hair armpit length cu...,5


### Psuedocode plan 
#### Do steps 1-4 in this notebook


1. Assume only relevant words are now in products.

2. deal with acronymns in products now.
      get a list of name brands and extract common names from in front of products like Kinky Curly CC or KCCC both correct to Kinky Curly Curling Custard

3. create 1-6 ngrams from each (completly cleaned of acronymns and for relevance) products row and then cycle through those and compare to a list of products do fuzzy matching and if the match is above X percent assign that product to a new list which will become a new column. Asign nan to ones where nothing comes up at all'

4. Inspect new products only catagory to make sure everythign looks like products

5. Perform TFIDF and run the models again looking at all scoring metrics

6. If time correct improperly balanced classes with up or downsampling and run again

---

### Step 2
#### Deal with acronymns

In [44]:
acronymn_dict = {'jessicurl': 'jc',
                'kinky curly curling custard': ['KCCC', 'KC curling custard', 'kinky curly cc', 'kc custard', 'kinky curly custard'],
                'la Looks Sport Look Gel': ['la looks', 'la looks gel', 'la looks sport gel', 'la look sports gel'],
                'kinky curly knot today': ['kc knot today', 'kinky curly kt', 'kckt', 'kinky curly not today', 'kc not today', 'kcnt', 'kinky curly nt'],
                'as i am coconut cowash': ['as i am coconut co-wash', 'as i am coconut co wash', 'asiam coconut cowash', 'asiam coconut co wash', 'asiam cowash', 'asiam cowash'],
                'herbal essence totally twisted mousse': ['he tt', 'hett', 'he totally twisted', 'herbal essence tt', 'he mousse', 'he moose', 'herbal essence moose', 'herbal essence mousse'],
                'devacurl': 'deva'
                }

In [37]:
# Check on frequency of n-grams

def generate_ngrams(s, n):
    '''
    s- the string you want to parse into ngrams
    n - the number of n for each ngrams
    '''
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [40]:
# all quimtagrams in the product set

result = [generate_ngrams(x, 5) for x in relevant_products_column_df['products']]
result

[['poo deva nopoo co wash',
  'deva nopoo co wash suave',
  'nopoo co wash suave naturals',
  'co wash suave naturals rinse',
  'wash suave naturals rinse out',
  'suave naturals rinse out leave',
  'naturals rinse out leave in',
  'rinse out leave in deva',
  'out leave in deva one',
  'leave in deva one condition',
  'in deva one condition products',
  'deva one condition products deva',
  'one condition products deva arcangell',
  'condition products deva arcangell deva',
  'products deva arcangell deva mister',
  'deva arcangell deva mister right',
  'arcangell deva mister right c',
  'deva mister right c overly',
  'mister right c overly porous',
  'right c overly porous hair'],
 ['deeplyenrichedoils com show me some',
  'com show me some love',
  'show me some love check',
  'me some love check out',
  'some love check out my',
  'love check out my website',
  'check out my website your',
  'out my website your vision',
  'my website your vision will',
  'website your vision will

In [41]:
result[0][2]

'nopoo co wash suave naturals'

In [43]:
# look and see whats coming up the most

quadgram_list_of_lists = [generate_ngrams(x, 4) for x in relevant_products_column_df['products']]

quadragrams = []
for single_list in quadgram_list_of_lists:
    for x in single_list:
        quadragrams.append(x)
        
quadragram_df = pd.DataFrame({'quadragrams': quadragrams})
test = dict(quadragram_df.quadragrams.value_counts())
test

{'co wash suave naturals': 86,
 'kinky curly knot today': 57,
 'wash suave naturals coconut': 46,
 'giovanni direct leave in': 40,
 'kinky curly curling custard': 36,
 'herbal essences totally twisted': 35,
 'suave naturals tropical coconut': 32,
 'la looks sport gel': 31,
 'giovanni smooth as silk': 27,
 'shea moisture curl enhancing': 26,
 'but lots of it': 24,
 'my hair loves protein': 23,
 'trying to figure out': 22,
 'moisture curl enhancing smoothie': 22,
 'rinse out leave in': 21,
 'as i am coconut': 21,
 'leave in suave naturals': 21,
 'leave in giovanni direct': 21,
 'hair type i k': 20,
 'still trying to figure': 19,
 'curls in a bottle': 18,
 'trader joe s tea': 18,
 'of the curly crusaders': 18,
 'trader joe s nourish': 18,
 'hair solutions curl keeper': 18,
 'order of the curly': 18,
 'leave in kinky curly': 18,
 'joe s tea tree': 18,
 'suave naturals coconut conditioner': 18,
 'in kinky curly knot': 17,
 'curly hair solutions curl': 17,
 'wash as i am': 17,
 'co wash tres

In [45]:
def fix_acronymns(product_string, n):
    '''
    product_string -  a string of potential products and other texts
    n - number to set the ngram too
    '''
    for i in range(n_min,n_max):
    
        ngram_strings = generate_ngrams(product_string, i)
        len_product_ngrams = len(ngram_strings)

        for x in range(len_product_ngrams):
        
            for official_product in product_list:   

                #ratio = fuzz.partial_ratio(ngram_strings[x], official_product)
                ratio = fuzz.token_set_ratio(ngram_strings[x], official_product)
                if ratio >= 98:
                    official_product_list.append(official_product)
                    ratio_list.append(ratio)
                #else:
                    #print(f'{ngram_strings[x]} and {official_product} had a ratio of {ratio}')
    
    return (official_product_list +ratio_list)

In [47]:
fix_acronymns(relevant_products_column_df.products[0], 4)

['poo deva nopoo co',
 'deva nopoo co wash',
 'nopoo co wash suave',
 'co wash suave naturals',
 'wash suave naturals rinse',
 'suave naturals rinse out',
 'naturals rinse out leave',
 'rinse out leave in',
 'out leave in deva',
 'leave in deva one',
 'in deva one condition',
 'deva one condition products',
 'one condition products deva',
 'condition products deva arcangell',
 'products deva arcangell deva',
 'deva arcangell deva mister',
 'arcangell deva mister right',
 'deva mister right c',
 'mister right c overly',
 'right c overly porous',
 'c overly porous hair']

### Find the products and extract them

In [193]:
def find_products(product_string, product_list, n_max=6, n_min=1):
    '''
    product_string -  a string of potential products and other texts
    product_list - a list with each element being an official product
    n - number to set the max ngram too
    '''
    official_product_list = []
    ratio_list = []
    for i in range(n_min,n_max):
    
        ngram_strings = generate_ngrams(product_string, i)
        len_product_ngrams = len(ngram_strings)

        for x in range(len_product_ngrams):
        
            for official_product in product_list:   

                #ratio = fuzz.partial_ratio(ngram_strings[x], official_product)
                ratio = fuzz.token_set_ratio(ngram_strings[x], official_product)
                if ratio >= 93 and ratio <= 99:
                    official_product_list.append(official_product)
                    ratio_list.append(ratio)
                #else:
                    #print(f'{ngram_strings[x]} and {official_product} had a ratio of {ratio}')
    
    return (official_product_list +ratio_list)

In [194]:
find_products(relevant_products_column_df.products[9], clean_no_acronymn_fix_products, n_max=5, n_min=2)

['living proof perfect hair day phd 5in1 styling treatment',
 'living proof no frizz nourishing styling ',
 'living proof perfect hair day heat styling ',
 'living proof no frizz weightless styling ',
 'living proof travel size perfect hair day phd 5in1 styling treatment',
 93,
 93,
 93,
 93,
 93]

In [191]:
relevant_products_column_df.products[9]

' very hair low poo living proof restore shampoo rinse out living proof restore conditioner leave in living proof curl leave in or garnier 1 minute mask papaya treatments restore repair leave in \xa0 styling \xa0living proof in shower styler with ouidad advanced climate control gel stronger hold'

In [None]:
# Make sure to get rid of travel stuff