# DS9 Which Whiskey? 

In [122]:
import numpy as np
import pandas as pd
import re
import string

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from textblob import TextBlob
import spacy
from spacy.tokenizer import Tokenizer

In [2]:
whiskey = pd.read_csv('whisk-train.csv')
whisk_test=pd.read_csv('whisk-test.csv')

In [3]:
nlp = spacy.load("en_core_web_lg")

In [19]:
def preprocess(df, col):
    
       
    print('begin')
    df['word_count'] = df[col].apply(lambda x: len(x.split()))

    df['avg_word_len'] = df[col].apply(lambda x: pd
                                                .Series(x.split()) #list to series
                                                .str.len() # indvidual word length
                                                .sum() / len(x.split()))

    df['all_caps_freq'] = df[col].apply(lambda x: \
                                             len([x for x \
                                              in x.split() # list comprehension \ 
                                              if (x.isupper()) # including only all uppercase words
                                              & (len(x) > 2)]))

    df['exclamatories_count'] = df[col].apply(lambda x: # sentences ending with a !\
                                                len(re.findall(r'[\s\b\w]*[!]{1,10}', x)))


    df['interrogatives_count'] = df[col].apply(lambda x: # sentences ending with a ? \ 
                                                    len(re.findall(r'[\s\b\w]*[?]{1,10}', x)))

    # remove punctuation
    df[col] = df[col].str.replace('[^\w\s]','')

    #and force to lower
    df[col] = df[col].str.lower()

    # of stop words
    stop = nlp.Defaults.stop_words    
    df['stopwords'] = df[col].apply(lambda x: len([x for x in x.split() if x in stop]))
    
    # drop numerics
    df[col] = df[col].str.replace('[\d]','')
    
    

    return df

def pct_change(df, col):
    """
    stops when the pct change between descending word frequency removal is less than 1%
    """
    freq = get_word_freq(df, col)

    change = 1.1
    total = sum(freq.values)
    step = 1
    while change > .01:
        curr = freq[step]
        prior = total - sum(freq.values[:step])
        change = curr / prior

        step +=1

    curr = freq[step]
    prior = total - sum(freq.values[:step])
    change = curr / prior    

    common = list(freq[:20-1].index)
    rare = list((freq.loc[freq.values < 2]).index)

    df[col] = df[col].apply(lambda x:
                                    " ".join([x for x in x.split() if x not in common]))

    df[col] = df[col].apply(lambda x: 
                                 " ".join([x for x in x.split() if x not in rare]))

    return df




def get_word_freq(df, col):
        # word frequency
    return pd.Series(' '.join(df[col]).split()).value_counts()

def sent_analysis(df, col):
    
    df['sentiment'] = df[col].apply(lambda x: TextBlob(x).sentiment[0])
    df['polarity'] = df[col].apply(lambda x: TextBlob(x).sentiment[1])

    return df

    
def tokenize(df, col):
    
    tokenizer = Tokenizer(nlp.vocab)
    tokens = []

    for doc in tokenizer.pipe(df[col], batch_size=500):
        doc_tokens = []
        for token in doc:
            if (token.is_stop == False) & (token.is_punct == False):
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    
    # apply to df
    df['tokenized'] = tokens
    df['tokenized'] = df['tokenized'].apply(lambda x: ' '.join(x))

    return df

def get_lemmas(text):

    lemmas = []

    doc = nlp(text)

    # Something goes here :P
    for token in doc: # punctuation already removed
        if (token.is_stop==False) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)

    return lemmas

def lemmatize(df, col):

    df['lemmatized'] = df[col].apply(get_lemmas)

    df['lemmatized'] = df['lemmatized'].apply(lambda x: ' '.join(x))

    return df

def tdidf(df, col):
    
    vect_word = TfidfVectorizer(max_features=20000, lowercase=True,\
         analyzer='word', stop_words= 'english',ngram_range=(1,3),\
            dtype=np.float32)
    
    word_vect = vect_word.fit(df[col])
    
    return df, word_vect

def vec_PCA(df, vect, name):

    pca = PCA(n_components=100)
    
    vec_coords = pca.fit_transform(vect.todense())
    cols = []
    for i in range(0, 40):
        col = f'{name}_vec_PCA_{i}'
        cols.append(col)


    temp_df = pd.DataFrame(vec_coords, columns=cols)
    df = df.join(temp_df)
    
    return df

def k_means_vect(df, vect, name):

    kms = KMeans(
    n_clusters=12,
    max_iter=300,
    precompute_distances="auto",
    n_jobs=-1,
    verbose=True)

    labels = kms.fit(vect)

    temp_df = pd.DataFrame(labels, columns=[f'{name}_vec_kmeans'])

    df = df.join(temp_df)

    return df

def flag_uniques(df):
    
    df['cat1_flag'] = df['lemmatized'].apply(lambda x: len([x for x in x.split() if x in cat1_unique]))
    df['cat2_flag'] = df['lemmatized'].apply(lambda x: len([x for x in x.split() if x in cat2_unique]))
    df['cat3_flag'] = df['lemmatized'].apply(lambda x: len([x for x in x.split() if x in cat3_unique]))
    df['cat4_flag'] = df['lemmatized'].apply(lambda x: len([x for x in x.split() if x in cat4_unique]))
    
    return df

def flagged(df):    
        if df['cat1_flag'] > 0:
            df['flag'] = 1
        elif df['cat2_flag'] > 0:
            df['flag'] = 2
        elif df['cat3_flag'] > 0:
            df['flag'] = 3
        elif df['cat4_flag'] > 0:
            df['flag'] = 4
        else:
            df['flag'] = 0
            
        return df

In [48]:
def tfidf_vect(corpus):
    
    tfidf = TfidfVectorizer(max_features=40000, lowercase=True,\
     analyzer='word', stop_words= 'english',ngram_range=(1,3),\
        dtype=np.float32)

    t_vec = tfidf.fit_transform(corpus)
    
    return t_vec

def vec_PCA(df, vect, dims):

    pca = PCA(n_components=dims, random_state=42)    
    vec_coords = pca.fit_transform(vect.todense())
    cols = []
    
    for i in range(0, dims):
        col = f'vec_PCA_{i}'
        cols.append(col)


    temp_df = pd.DataFrame(vec_coords, columns=cols)
    df = df.join(temp_df)
    
    return df

def get_tsne(df, vect):
    tsne = sklearn.manifold.TSNE(n_components=2, perplexity=30.0,
                          early_exaggeration=12.0, learning_rate=200.0,
                          n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07,
                          metric='euclidean', init='random', verbose=0,
                          random_state=None, method='barnes_hut', angle=0.5)

    train_tsne = tsne.fit_transform(vect.todense())
    
    cols = ['tsne_0', 'tsne_1']
    
    temp_df = pd.DataFrame(train_tsne, columns=cols)    
    df = df.join(temp_df)
    
    return df
    

In [34]:
def wrangle_df(df, col):
    
    df = preprocess(df, col)
    
    df = sent_analysis(df, col)
    df = pct_change(df, col)
    df = lemmatize(df, col)
    
    return df    

In [51]:
train = pd.read_csv('whisk-train.csv')
test = pd.read_csv('whisk-test.csv')

In [52]:
train = wrangle_df(train, 'description')
test = wrangle_df(test, 'description')

begin
begin


In [53]:
train.head(1)

Unnamed: 0,id,description,category,word_count,avg_word_len,all_caps_freq,exclamatories_count,interrogatives_count,stopwords,sentiment,polarity,lemmatized
0,1,marriage year old bourbons mature yet very ele...,2,60,5.033333,0,1,0,23,0.265,0.555,marriage year old bourbon mature elegant whisk...


## Looking for differences among classes

In [54]:
cat_1 = train.loc[train.category == 1]
cat_2 = train.loc[train.category == 2]
cat_3 = train.loc[train.category == 3]
cat_4 = train.loc[train.category == 4]

len(cat_1), len(cat_2), len(cat_3), len(cat_4)

(1637, 449, 300, 200)

## Unique Words

In [55]:
cat1_words = get_word_freq(cat_1, 'lemmatized')
cat2_words = get_word_freq(cat_2, 'lemmatized')
cat3_words = get_word_freq(cat_3, 'lemmatized')
cat4_words = get_word_freq(cat_4, 'lemmatized')

In [56]:
word_freqs = pd.DataFrame((cat1_words, cat2_words, cat3_words, cat4_words)).T.reset_index()
word_freqs.columns = ['word','cat1','cat2','cat3','cat4']

In [57]:
cat1_unique = list(word_freqs.loc[(word_freqs['cat2'].isnull()) & (word_freqs['cat3'].isnull())\
                                  & (word_freqs['cat4'].isnull()) & (word_freqs['cat1'] > 2)]['word'].values)

cat2_unique = list(word_freqs.loc[(word_freqs['cat1'].isnull()) & (word_freqs['cat3'].isnull())\
                                  & (word_freqs['cat4'].isnull()) & (word_freqs['cat2'] > 2)]['word'].values)

cat3_unique = list(word_freqs.loc[(word_freqs['cat1'].isnull()) & (word_freqs['cat2'].isnull())\
                                  & (word_freqs['cat4'].isnull()) & (word_freqs['cat3'] > 2)]['word'].values)

cat4_unique = list(word_freqs.loc[(word_freqs['cat1'].isnull()) & (word_freqs['cat2'].isnull())\
                                  & (word_freqs['cat3'].isnull()) & (word_freqs['cat4'] > 2)]['word'].values)

In [58]:
len(cat1_unique), len(cat2_unique), len(cat3_unique), len(cat4_unique)

(630, 48, 45, 48)

In [59]:
train = flag_uniques(train)
test = flag_uniques(test)
train = train.apply(flagged, axis=1)
test = test.apply(flagged, axis=1)
train = train.drop(columns=['cat1_flag', 'cat2_flag', 'cat3_flag', 'cat4_flag'])
test = test.drop(columns=['cat1_flag', 'cat2_flag', 'cat3_flag', 'cat4_flag'])

In [60]:
train.head(1)

Unnamed: 0,id,description,category,word_count,avg_word_len,all_caps_freq,exclamatories_count,interrogatives_count,stopwords,sentiment,polarity,lemmatized,flag
0,1,marriage year old bourbons mature yet very ele...,2,60,5.033333,0,1,0,23,0.265,0.555,marriage year old bourbon mature elegant whisk...,0


In [61]:
tfidf = TfidfVectorizer(max_features=20000, lowercase=True,\
     analyzer='word', stop_words= 'english',ngram_range=(1,3),\
        dtype=np.float32)

t_vec = tfidf.fit(train['lemmatized'])

train_tfidf = t_vec.transform(train['description'])
test_tfidf = t_vec.transform(test['description'])

In [62]:
train = vec_PCA(train, train_tfidf, 100)
test = vec_PCA(test, test_tfidf, 100)


In [469]:
def get_mean_idf(text):
    
    l = []
    
    for word in text.split():
        try:
            w = df_idf.loc[df_idf.word == word]['idf_weights'].values[0]
            l.append(w)
        except Exception:
            w = 0
            l.append(w)
    return np.mean(l)
    

    

In [417]:
df_idf = pd.DataFrame(t_vec.idf_, index=t_vec.get_feature_names(),columns=["idf_weights"]).reset_index()
df_idf.columns = ['word', 'idf_weights']
df_idf.head()

Unnamed: 0,word,idf_weights
0,abc,7.759642
1,aberdeenshire,7.066494
2,aberdeenshire distillery,7.759642
3,aberfeldy,6.460359
4,aberfeldy run,7.759642


In [470]:
train['mean_idf'] = train['lemmatized'].apply(get_mean_idf)

In [473]:
test['mean_idf'] = test['lemmatized'].apply(get_mean_idf)

In [476]:
tsne = sklearn.manifold.TSNE(n_components=2, perplexity=30.0,
                      early_exaggeration=12.0, learning_rate=200.0,
                      n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07,
                      metric='euclidean', init='random', verbose=0,
                      random_state=None, method='barnes_hut', angle=0.5)

train_tsne = tsne.fit_transform(train_tfidf.toarray())
test_tsne = tsne.fit_transform(test_tfidf.toarray())

In [479]:
cols = ['tsne_0', 'tsne_1']
    
# temp_df1 = pd.DataFrame(train_tsne, columns=cols)
# temp_df2 = pd.DataFrame(test_tsne, columns=cols)

train = train.join(temp_df1)
test = test.join(temp_df2)

In [107]:
tokenize = vect.build_tokenizer()
stop_words = vect.get_stop_words()

In [123]:
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.decomposition import TruncatedSVD


parameters = {
    'lsi__svd__n_components': [200 ,225],
    'lsi__vect__max_df':[.9, .99],
    'lsi__vect__min_df':[3, 5],
    'lsi__vect__stop_words': ['english'],
    'lsi__vect__tokenizer': [None],
    'clf__C':[0.8, 1]
}


tsne = sklearn.manifold.TSNE(n_components=2, perplexity=30.0,
                      early_exaggeration=12.0, learning_rate=200.0,
                      n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07,
                      metric='euclidean', init='random', verbose=0,
                      random_state=None, method='barnes_hut', angle=0.5)




svd = TruncatedSVD(algorithm='randomized',
                   n_iter=10,)
vect = TfidfVectorizer(ngram_range=(1,3))
lsi = Pipeline([('vect', vect), ('svd', svd), ('tsne', tsne)])


clf = LinearSVC(loss='squared_hinge', max_iter=200)
pipe = Pipeline([('lsi', lsi), ('clf', clf)])

svc_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=10)
svc_search.fit(whiskey['description'],whiskey['category'])

TypeError: All intermediate steps should be transformers and implement fit and transform. 'Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
...nts=2, n_iter=1000, n_iter_without_progress=300,
   perplexity=30.0, random_state=None, verbose=0))])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't

In [88]:
X.shape

(2586, 112)

In [111]:
svc_search.best_params_

{'clf__C': 1,
 'lsi__svd__n_components': 225,
 'lsi__vect__max_df': 0.9,
 'lsi__vect__min_df': 3,
 'lsi__vect__stop_words': 'english',
 'lsi__vect__tokenizer': <function sklearn.feature_extraction.text.VectorizerMixin.build_tokenizer.<locals>.<lambda>(doc)>}

In [114]:
svc_search.predict(whisk_test['description'])

array([2, 3, 4, 1, 1, 1, 1, 1, 2, 1, 4, 4, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 4, 1, 1, 1, 3, 1, 4, 2, 1, 1, 1, 1, 1, 3, 4, 3, 2, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 4,
       2, 1, 1, 1, 1, 3, 1, 1, 4, 1, 3, 2, 1, 1, 4, 2, 2, 1, 1, 3, 2, 4,
       1, 3, 1, 1, 1, 1, 1, 4, 1, 1, 4, 3, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2,
       3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 2, 2, 4, 1, 1,
       1, 1, 3, 2, 1, 1, 1, 1, 1, 3, 2, 1, 1, 3, 4, 1, 1, 1, 3, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 2, 2, 1, 3, 3, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 4, 1, 3, 1, 4, 1, 1, 2, 2, 1, 1,
       2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1,
       1, 4, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 3,
       2, 2, 1, 3, 1, 3, 3, 3, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 4, 1, 1, 1, 3,
       2, 1], dtype=int64)

In [118]:
preds_lin_svc = svc_search.predict(whisk_test['description'])

In [555]:
preds_rfc = grid_search.predict(whisk_test['description'])

In [103]:
target = 'category'

features = train.columns.to_list()
features.remove(target)
features.remove('lemmatized')
features.remove('description')

X = train[features]
y = train[[target]]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    test_size=0.3)

##### column transformer

In [74]:
from sklearn.compose import ColumnTransformer

In [79]:
ct = ColumnTransformer(
    [('lemmatized_tfidf', TfidfVectorizer(max_features = 20000, stop_words = 'english', ngram_range=(1,3)), 'lemmatized'),
#     ('tokenized_tfidf', TfidfVectorizer(max_features = 20000, stop_words = 'english', ngram_range=(1,3)), 'tokenized'),
    ('description_tfidf', TfidfVectorizer(max_features = 20000, stop_words = 'english', ngram_range=(1,3)), 'description')
    ],
    remainder='passthrough')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

models = [LogisticRegression(solver='lbfgs', max_iter=1000),
          DecisionTreeClassifier(max_depth=3),
          DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(max_depth=3, n_estimators=50, n_jobs=-1, random_state=42),
          RandomForestClassifier(max_depth=None, n_estimators=50, n_jobs=-1, random_state=42),
          XGBClassifier(max_depth=3, n_estimators=50, n_jobs=-1, random_state=42)
         ]

for model in models:
    print(model, '\n')
    score = cross_val_score(model, X, y, scoring='accuracy', cv=5).mean()
    print('Cross_Validation Accuracy:', score, '\n', '\n')

In [None]:
tuned_parameters = {'C': [0.1, 0.5, 1, 5, 10, 50, 100],
                   'solver': ['lbfgs', 'liblinear']}
log_search = GridSearchCV(LogisticRegression(), tuned_parameters, cv=3, scoring="accuracy")
log_search.fit(X, y)

In [542]:
test2 = test.drop(columns='lemmatized')

In [545]:
log_preds = log_search.predict(test2)

In [None]:
master = pd.DataFrame({'id': whisk_test['id'], 'category':preds_lin_svc})
for i in range(0, 5):
    preds = grid_search.predict(whisk_test['description'])
    p = pd.DataFrame({f'pred_{i}': preds})
    
    master = master.join(p)
master

In [740]:
preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [753]:
print('accuracy %s' % accuracy_score(preds, whiskey['category']))
print(classification_report(whiskey['category'], preds))

accuracy 0.9617169373549884
              precision    recall  f1-score   support

           1       0.97      1.00      0.99      1637
           2       0.95      0.91      0.93       449
           3       0.89      0.89      0.89       300
           4       0.99      0.90      0.94       200

   micro avg       0.96      0.96      0.96      2586
   macro avg       0.95      0.92      0.94      2586
weighted avg       0.96      0.96      0.96      2586



In [718]:
lin_svc_1 = pd.DataFrame({'id': whisk_test['id'], 'category':preds_lin_svc})
# preds_rfc = pd.DataFrame({'id': whisk_test['id'], 'category':preds_rfc})
# pred_log = pd.DataFrame({'id': whisk_test['id'], 'category':log_preds})


In [None]:
linv

In [692]:
flag_ids = test[['cat1_flag', 'cat2_flag', 'cat3_flag', 'cat4_flag']]

In [642]:
def flagged(df):    
        if df['cat1_flag'] > 0:
            df['flag'] = 1
        elif df['cat2_flag'] > 0:
            df['flag'] = 2
        elif df['cat3_flag'] > 0:
            df['flag'] = 3
        elif df['cat4_flag'] > 0:
            df['flag'] = 4
        else:
            df['flag'] = 0
            
        return df

In [694]:
flag_ids = flag_ids.apply(flagged, axis=1)

In [None]:
pd.DataFrame(pred_l)

In [599]:
preds = preds.join(pd.DataFrame(pred_l))

In [None]:
preds = pd.DataFrame({'id': test['id'], 'rfc': preds_rfc, 'svc': preds_lin_svc, 'log': log_preds})
preds

In [697]:
preds = preds.join(flag_ids)

In [None]:
(preds.loc[(preds['flag'] != preds['svc']) & (preds['flag'] != 0) ])


In [683]:
def imputed(df):
    
        df['category'] = df['svc'] 
        
        if df['flag'] > 0:
            df['category'] = df['flag']
        else:
            df['category'] = df['svc']            
            
        return df

preds['category'] = 0

In [684]:
preds = preds.apply(imputed, axis=1)

In [116]:
preds

NameError: name 'preds' is not defined

In [119]:
lin_svc_1 = pd.DataFrame({'id': whisk_test['id'], 'category':preds_lin_svc})

In [120]:
lin_svc_1.to_csv('linsvc3.csv', index=False)