In [None]:
import os
from time import sleep

import nltk
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize



Let's try to do a few things:
- are different countries described differently?
    - how about different regions?
- rating by region
- common words by variety
- bucket points into bands of ten and find the most common words in each bucket
- most common words by price
- most common words for low-value wines (points/price below a low threshold)
- most common words for high-value wines
- map of top regions
- map of most expensive regions


EXTRA -
Can I generate a fake review, region and winery for NLP?

In [None]:
# Path to the raw review data. The default is the original local file; set the
# WINE_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    'WINE_DATA_PATH',
    '/Users/jackohagan/datascience/wine/winemag-data_first150k.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# Build a "region_1, province" label for every review in two steps:
# first the region followed by the separator, then the province appended.
region_prefix = df['region_1'] + ', '
df['area'] = region_prefix + df['province']

In [None]:
# Preview again to check the newly added 'area' column.
df.head()



In [None]:

# Collect each distinct area label once and hold them in their own frame,
# one row per label, under the column 'unq_adress'.
unq_address = df['area'].unique()
data = pd.DataFrame({'unq_adress': unq_address})


def findGeocode(city, retries=3, delay=10):
    """Geocode a place name with Nominatim, retrying when the lookup times out.

    The lookup itself (``geolocator.geocode``) is now inside the ``try``;
    previously only the constructor was, so the ``except`` clause never
    covered the call it was written for. Retries are bounded instead of
    recursing without limit.

    Parameters
    ----------
    city : str
        Free-text place description handed to ``geolocator.geocode``.
    retries : int, default 3
        Number of extra attempts made after a ``GeocoderTimedOut``.
    delay : float, default 10
        Seconds to sleep between attempts.

    Returns
    -------
    Whatever ``geolocator.geocode(city)`` returns, or ``None`` when ``city``
    is not a non-empty string or every attempt timed out.
    """
    # NOTE(review): Nominatim and GeocoderTimedOut are not imported in the
    # visible cells - presumably geopy.geocoders / geopy.exc; confirm and
    # add the import to the top cell.
    if not isinstance(city, str) or not city.strip():
        # Missing labels (e.g. NaN) cannot be geocoded; skip the lookup.
        return None

    geolocator = Nominatim(user_agent="myemail_address@gmail.com")
    for attempt in range(retries + 1):
        try:
            return geolocator.geocode(city)
        except GeocoderTimedOut:
            # Wait before the next attempt, but not after the last one.
            if attempt < retries:
                sleep(delay)
    return None

# Geocode every distinct area through findGeocode. The name `geolocator` only
# exists inside that function, so it cannot be referenced at this level.
data["loc"] = data['unq_adress'].apply(findGeocode)
data["point"] = data["loc"].apply(lambda loc: tuple(loc.point) if loc else None)

# 'point' is a column of `data` (one row per unique area), not of `df`, so look
# up each review's area and spread the point tuple over three columns. Areas
# with no geocoding result get NaN.
point_by_area = dict(zip(data['unq_adress'], data['point']))
review_points = [point_by_area.get(area) for area in df['area']]
for position, column in enumerate(['lat', 'lon', 'altitude']):
    df[column] = [
        pt[position] if isinstance(pt, tuple) else np.nan
        for pt in review_points
    ]

In [None]:
## filter stopwords
# English stopwords plus the word 'wine'. `stop` stays a list because later
# cells read it; `stopwords` is already imported in the first cell.
stop = stopwords.words('english')
stop.append('wine')

# Set copy so each membership test is a hash lookup rather than a list scan.
stop_lookup = set(stop)

# Prefix each lower-cased description with its country, then drop stopwords.
df['description_cleaned'] = df['country'].astype(str) + ', ' + df['description'].str.lower()
df['description_cleaned'] = df['description_cleaned'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_lookup)
)




In [None]:

# remove punctuation
# The pattern is a regular expression, so regex=True has to be explicit:
# pandas >= 2.0 treats the pattern as literal text by default, which would
# leave every punctuation mark in place. The raw string keeps \w and \s intact.
df["description_cleaned"] = df["description_cleaned"].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
## tokenize words (turn each cleaned description into a list of tokens)
# Apply the tokenizer to the one column it needs rather than row-wise over the
# whole frame; the result is the same without building a Series per row.
df['tokenized_sents'] = df['description_cleaned'].apply(nltk.word_tokenize)




In [None]:
##lemmatize the tokens - get the 'root' of each word

def lemmatize_text(text):
    """Return a list with the WordNet lemma of each token in ``text``.

    ``text`` is an iterable of tokens; the output keeps the same order.
    """
    wnl = WordNetLemmatizer()
    roots = []
    for token in text:
        roots.append(wnl.lemmatize(token))
    return roots

# Lemmatize every token list, element by element.
df['lemmatied_description'] = df['tokenized_sents'].map(lemmatize_text)

In [None]:
## Derived columns: a value score plus bucketed price, points and value.
# value = points obtained per unit of price
df['value'] = df['points'] / df['price']

# 20 equal-frequency price bands; labels are assigned from lowest to highest bin.
price_labels = ["low", "lowmid", "lowhigh", 'midlow', 'midmid', 'midhigh',
                'highlow', 'highmid', 'highhigh', '10', '11', '12', '13',
                '14', '15', '16', '17', '18', '19', 'high']
df['price_bucket'] = pd.qcut(df['price'], 20, labels=price_labels)

# 5 equal-width bands across the observed range of points.
df['points_bucket'] = pd.cut(df['points'], 5, labels=["low", "midlow", "mid", 'midhigh', 'high'])

# 20 equal-frequency value bands, labelled by their interval.
df['value_bucket'] = pd.qcut(df['value'], 20)

# index=False: by default the row index is written as an extra unnamed column,
# which comes back as 'Unnamed: 0' when the file is read again.
df.to_csv('tidieddf.csv', index=False)

In [None]:
# Reload the frame saved as 'tidieddf.csv' so later cells can start from here.
df= pd.read_csv('tidieddf.csv')

In [None]:
from mlxtend.preprocessing import TransactionEncoder

def associationfunction(df, column, min_support=0.001, min_threshold=0.05):
    """Mine association rules between the values of ``column`` and description words.

    Each row becomes one transaction: the row's ``column`` value followed by
    the lower-cased ``description``, with stopwords and punctuation removed,
    then tokenized and lemmatized. The transactions are one-hot encoded and
    passed to ``fpgrowth`` and ``association_rules``.

    Relies on the notebook-level names ``stop`` (stopword list) and
    ``lemmatize_text``. The input frame is no longer modified; previously the
    intermediate text columns were written onto the caller's ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a ``description`` column.
    column : str
        Column whose value is prepended to each description.
    min_support : float, default 0.001
        Passed to ``fpgrowth``.
    min_threshold : float, default 0.05
        Minimum confidence passed to ``association_rules``.

    Returns
    -------
    The frame returned by ``association_rules``.
    """
    # Hash lookups instead of scanning the stopword list for every word.
    stop_words = set(stop)

    # combine target and description into one string per row
    text = df[column].astype(str) + ', ' + df['description'].str.lower()

    # Exclude stopwords
    text = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words)
    )

    # remove punctuation; regex=True is required because pandas >= 2.0 treats
    # the pattern as literal text by default
    text = text.str.replace(r'[^\w\s]', '', regex=True)

    ## tokenize words (turn into list), then lemmatize each token
    tokens = text.apply(nltk.word_tokenize)
    transactions = tokens.apply(lemmatize_text).tolist()

    ## encode one-hot: one boolean column per distinct token
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    onehot = pd.DataFrame(te_ary, columns=te.columns_)

    ## associations
    frequent_itemsets = fpgrowth(onehot, min_support=min_support, use_colnames=True)
    frequent = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_threshold)

    return frequent

In [None]:
# Mine rules linking each review's country to the words of its description.
countryass = associationfunction(df=df, column='country')

In [None]:
# Record how many items sit on each side of every rule.
countryass["antecedent_len"] = countryass["antecedents"].map(len)
countryass["consequent_len"] = countryass["consequents"].map(len)

# Keep only one-item -> one-item rules.
countryass = countryass.loc[
    countryass["antecedent_len"].eq(1) & countryass["consequent_len"].eq(1)
]

In [None]:
# Distinct values of the 'country' column.
countries = df['country'].unique()


In [None]:
## Write the filtered country rules to CSV.
countryass.to_csv('country_similarties.csv')

In [None]:
## Same analysis for varieties; note that the name `countryass` is reused for the result.
countryass = associationfunction(df=df, column='variety')

In [None]:
# Record how many items sit on each side of every rule.
countryass["antecedent_len"] = countryass["antecedents"].map(len)
countryass["consequent_len"] = countryass["consequents"].map(len)

# Keep only one-item -> one-item rules.
countryass = countryass.loc[
    countryass["antecedent_len"].eq(1) & countryass["consequent_len"].eq(1)
]

## write file to csv
countryass.to_csv('variety_similarties.csv')

In [None]:
## for points
# Rules between the points bucket and description words. The variety rules are
# not reloaded first: that result was overwritten immediately and never used.
countryass = associationfunction(df=df, column='points_bucket')

In [None]:
# Record how many items sit on each side of every rule.
countryass["antecedent_len"] = countryass["antecedents"].map(len)
countryass["consequent_len"] = countryass["consequents"].map(len)

# Keep only one-item -> one-item rules.
countryass = countryass.loc[
    countryass["antecedent_len"].eq(1) & countryass["consequent_len"].eq(1)
]

## write file to csv
countryass.to_csv('points_similarties.csv')
#countryass[countryass['antecedents']  == {'low'}].sort_values('lift',ascending=False)

In [None]:
## for price
# Rules between the price bucket and description words. The points rules are
# not reloaded first: that result was overwritten immediately and never used.
countryass = associationfunction(df=df, column='price_bucket')

In [None]:
# Record how many items sit on each side of every rule.
countryass["antecedent_len"] = countryass["antecedents"].map(len)
countryass["consequent_len"] = countryass["consequents"].map(len)

# Keep only one-item -> one-item rules.
countryass = countryass.loc[
    countryass["antecedent_len"].eq(1) & countryass["consequent_len"].eq(1)
]

## write file to csv
countryass.to_csv('price_similarties.csv')
#countryass[countryass['antecedents']  == {'low'}].sort_values('lift',ascending=False)

In [None]:
countryass = pd.read_csv('price_similarties.csv')

# After the CSV round trip the itemsets are plain text such as
# "frozenset({'high'})", so comparing against the set {'high'} matches nothing.
# Compare on the string form instead.
high_rules = countryass['antecedents'].astype(str) == str(frozenset({'high'}))
countryass[high_rules].sort_values('lift', ascending=False)

In [None]:
# Show all rules ordered from highest to lowest lift.
countryass.sort_values('lift',ascending=False)