In [22]:
# Import packages
import fuzzywuzzy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pymongo
import pickle
import string
import nltk
import re
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import warnings
warnings.filterwarnings('ignore')



src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# helper functions
from d02_processing.cleaning_signatures import sorted_signatures
from d02_processing.cleaning_signatures import cleaned_signatures
from d01_utils.mongo_cursor_creator import mongo_cursor

# Load the "autoreload" extension
%load_ext autoreload

# reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load relevant data

In [3]:
df = pd.read_csv('../../data/02_intermediate/no_nan_curl_pattern_df.csv')
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
0,2b,,,coarse,poo deva nopoo co wash suave naturals rinse ou...,2
1,3b,thick,,,since feb 09 currently using low poo tj s nour...,5
2,4a,,,,deeplyenrichedoils com show me some love check...,7
3,3a,,,,hair type and lots of it hair color ranges fro...,4
4,3b,,,,hair type and lots of it hair color ranges fro...,5


In [25]:
with open('../../data/02_intermediate/Ulta_products.pkl', 'rb') as f:
    ulta = pickle.load(f)
ulta[:5]

['Redken All Soft Conditioner',
 'Paul Mitchell Tea Tree Special Shampoo',
 'Redken All Soft Shampoo',
 'Matrix Biolage Colorlast Conditioner',
 'Matrix Biolage Colorlast Shampoo']

In [28]:
with open('../../data/02_intermediate/Giovanni_products.pkl', 'rb') as f:
    giovanni = pickle.load(f)
giovanni[:5]

['giovanni tea tree triple treat invigorating shampoo ',
 'giovanni tea tree triple treat invigorating conditioner ',
 'giovanni smooth as silk deep moisture shampoo ',
 'giovanni smooth as silk deeper moisture conditioner ',
 'giovanni 5050 balanced hydratingclarifying shampoo ']

In [29]:
with open('../../data/02_intermediate/jessicurl_products_full_name.pkl', 'rb') as f:
    jessicurl = pickle.load(f)
jessicurl[:5]

['jessicurl gentle lather shampoo',
 'jessicurl hair cleansing cream',
 'jessicurl aloeba daily conditioner',
 'jessicurl deep treatment',
 'jessicurl too shea extra moisturizing conditioner']

In [30]:
with open('../../data/02_intermediate/kinky_curly_products.pkl', 'rb') as f:
    kinky_curly = pickle.load(f)
kinky_curly[:5]

['kinky curly seriously smooth prep protect',
 'kinky curly seriously smooth swift set lotion',
 'kinky curly seriously smooth fast dry foam',
 'kinky curly knot today',
 'kinky curly curling custard']

In [31]:
with open('../../data/02_intermediate/other_products.pkl', 'rb') as f:
    other_products = pickle.load(f)
other_products[:5]

['LA Looks Sport Look Gel',
 'TRESemmé Botanique Nourish and Replenish Conditioner',
 'As I Am Coconut Co-wash',
 'Yes to Carrots Nourishing Shampoo',
 'LOreal EverCreme Cleansing Conditioner']

In [32]:
product_list_full = ulta + giovanni + jessicurl + kinky_curly + other_products
product_list_full

['Redken All Soft Conditioner',
 'Paul Mitchell Tea Tree Special Shampoo',
 'Redken All Soft Shampoo',
 'Matrix Biolage Colorlast Conditioner',
 'Matrix Biolage Colorlast Shampoo',
 'Paul Mitchell Tea Tree Lavender Mint Moisturizing Conditioner',
 'Paul Mitchell Tea Tree Lavender Mint Moisturizing Shampoo',
 'Matrix Biolage Ultra Hydrasource Conditioner',
 'Redken Color Extend Magnetics Shampoo',
 'Pureology Hydrate Conditioner',
 'Pureology Hydrate Shampoo',
 'Redken Color Extend Blondage Color Depositing Purple Shampoo',
 'Redken Color Extend Shampoo',
 'Redken Color Extend Magnetics Conditioner',
 'Redken Extreme Conditioner',
 'Joico Color Balance Purple Shampoo',
 'Redken Color Extend Conditioner',
 'Paul Mitchell Tea Tree Special Conditioner',
 'Redken Extreme Shampoo',
 'AG Hair Moisture Fast Food Leave-On Conditioner',
 'Redken Color Extend Blondage Color Depositing Purple Conditioner',
 'Matrix Biolage Hydrasource Shampoo',
 "It's A 10 Miracle Deep Conditioner Plus Keratin",
 

In [33]:
# Clean up the products for puntation and characters

clean_no_acronymn_fix_products = []

for product in product_list_full:
    product = re.sub(r"\–.*", " ", product)
    product = re.sub("r\'" , '', product)
    product = re.sub(r'[^a-zA-Z0-9\s]', '', product)
    product = re.sub(r"\,.*", " ", product)
    product = re.sub('  ', ' ', product)
    product = product.lower()
    
    clean_no_acronymn_fix_products.append(product)

In [34]:
clean_no_acronymn_fix_products

['redken all soft conditioner',
 'paul mitchell tea tree special shampoo',
 'redken all soft shampoo',
 'matrix biolage colorlast conditioner',
 'matrix biolage colorlast shampoo',
 'paul mitchell tea tree lavender mint moisturizing conditioner',
 'paul mitchell tea tree lavender mint moisturizing shampoo',
 'matrix biolage ultra hydrasource conditioner',
 'redken color extend magnetics shampoo',
 'pureology hydrate conditioner',
 'pureology hydrate shampoo',
 'redken color extend blondage color depositing purple shampoo',
 'redken color extend shampoo',
 'redken color extend magnetics conditioner',
 'redken extreme conditioner',
 'joico color balance purple shampoo',
 'redken color extend conditioner',
 'paul mitchell tea tree special conditioner',
 'redken extreme shampoo',
 'ag hair moisture fast food leaveon conditioner',
 'redken color extend blondage color depositing purple conditioner',
 'matrix biolage hydrasource shampoo',
 'its a 10 miracle deep conditioner plus keratin',
 'a

---

### Get rid of any product catagory items that do not have relevant words

In [16]:
required_product_words = ['shampoo', 'conditioner', 'poo', 'lopoo', 'nopoo', 'plop', 'diffuse', 'condish', 'STC',
                         'conditioner', 'gel', 'product', 'styler', 'LO', 'Leave in', 'RO']

In [19]:
searchfor = required_product_words
relevant_products_column_df = df[df['products'].str.contains('hair' or 'shampoo' or 'conditioner' or 'poo' or 'lopoo' or 'nopoo' or 'plop' or 'diffuse' or 'condish' or 'STC' or
                         'conditioner' or 'gel' or 'product' or 'styler' or 'LO' or 'Leave in' or 'RO', na =False)] 
relevant_products_column_df.head(10)

Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
0,2b,,,coarse,poo deva nopoo co wash suave naturals rinse ou...,2
2,4a,,,,deeplyenrichedoils com show me some love check...,7
3,3a,,,,hair type and lots of it hair color ranges fro...,4
4,3b,,,,hair type and lots of it hair color ranges fro...,5
7,2c,thick,high,fine,mostly with some and a very strange curl wave...,3
8,3a,thick,high,fine,mostly with some and a very strange curl wave...,4
11,4a,,,,demcalicurls hair type big chop june 2006 rel...,7
14,2b,thin,,,former then modified now too lazy p type i k a...,2
15,3a,thin,,,former then modified now too lazy p type i k a...,4
21,2b,,high,fine,very hair low poo living proof restore shampo...,2


In [20]:
# Just for Funsies check how many relevant full signatures we have'
# 339!!! Not a lot
full_sets_only_df = relevant_products_column_df.dropna()
print(full_sets_only_df.shape)
full_sets_only_df.head()

(339, 6)


Unnamed: 0,curl_pattern,density,porosity,texture,products,curl_catagory
7,2c,thick,high,fine,mostly with some and a very strange curl wave...,3
8,3a,thick,high,fine,mostly with some and a very strange curl wave...,4
24,2c,medium,high,fine,wavy fii on the fia system current products l...,3
27,2a,medium,low,fine,still working on a solid hair care routine,1
34,3b,thick,low,coarse,my hair strands virgin hair armpit length cu...,5


### Psuedocode plan 
#### Do steps 1-4 in this notebook


1. Assume only relevant words are now in products.

2. deal with acronymns in products now.
      get a list of name brands and extract common names from in front of products like Kinky Curly CC or KCCC both correct to Kinky Curly Curling Custard

3. create 1-6 ngrams from each (completly cleaned of acronymns and for relevance) products row and then cycle through those and compare to a list of products do fuzzy matching and if the match is above X percent assign that product to a new list which will become a new column. Asign nan to ones where nothing comes up at all'

4. Inspect new products only catagory to make sure everythign looks like products

5. Perform TFIDF and run the models again looking at all scoring metrics

6. If time correct improperly balanced classes with up or downsampling and run again

---

### Step 2
#### Deal with acronymns

In [None]:
acronymn_dict = {'jessicurl': 'jc',
                'kinky curly curling custard': ['KCCC', 'KC curling custard', 'kinky curly cc', 'kc custard', 'kinky curly custard'],
                'LA Looks Sport Look Gel': ['la looks', 'la looks gel', 'la looks sport gel', 'la look sports gel'],
                'kinky curly knot today': ['kc knot today', 'kinky curly kt', 'kckt', 'kinky curly not today', 'kc not today', 'kcnt', 'kinky curly nt'],
                'As I Am Coconut Cowash': ['As I Am Coconut Co-wash', 'As I Am Coconut Co wash', 'AsIAm Coconut Cowash', 'AsIAm Coconut Co wash', 'asiam cowash', 'asiam cowash'],
                'Herbal Essence Totally Twisted Mousse': ['HE TT', 'HETT', 'HE totally twisted', 'Herbal essence tt', 'HE mousse', 'HE moose', 'herbal essence moose', 'herbal essence mousse']
                
                }