### Import packages

In [132]:
import pandas as pd

import string

from sklearn.feature_extraction.text import TfidfVectorizer

### Read in the Meeetup event descriptions

In [133]:
events = pd.read_csv("/home/hans/Documents/Programming/Python/Meetup Talks/Cohpy/Introduction_to_NLP/data/events.csv")
#events.drop_duplicates(subset=["group.name"],inplace=True)

events.shape

(5807, 48)

### What do the descriptions look like?

In [134]:
events.description[100]

'November 2nd, 6:00-9:00 p.m., lets meet and network at Chicagos famed Fogo De Chao steakhouse located in River North at 661 N. Lasale Blvd.  Enjoy some of the finer things in life with us and get down to the meet of your networking, while sampling some of Fogos delicious appetizers. \\n\\nYou will meet and network with others in your profession or in the profession you would like to be in (recruiters are welcome), while enjoying a great night with Chicagos top professionals. \\nThere will be areas designated for various professional and social interest groups to assure you meet people of like interests.\\n Early bird tickets are $12.00 plus Eventbrite fee at: https://www.eventbrite.com/e/meet-and-network-with-us-at-fogo-de-chao-tickets-37898951785?aff=utm_source%3Deb_email%26utm_medium%3Demail%26utm_campaign%3Dnew_event_email&utm_term=eventname_text, so get your tickets now ! \\nTickets include great networking all night long as well as:\\n \\n- Name tags bordered in blue \\n\\n- Recr

### Convert the text data to lowercase

In [135]:
def lower(x): return str(x).lower()

events.description = events.description.apply(lower)

### Remove punctuation

In [136]:
def strip(x): return str(x).translate(str.maketrans('', '', string.punctuation))

events.description = events.description.apply(strip)

### What does our cleaned-up sample look like?

In [137]:
events.description[100]

'november 2nd 600900 pm lets meet and network at chicagos famed fogo de chao steakhouse located in river north at 661 n lasale blvd  enjoy some of the finer things in life with us and get down to the meet of your networking while sampling some of fogos delicious appetizers nnyou will meet and network with others in your profession or in the profession you would like to be in recruiters are welcome while enjoying a great night with chicagos top professionals nthere will be areas designated for various professional and social interest groups to assure you meet people of like interestsn early bird tickets are 1200 plus eventbrite fee at httpswwweventbritecomemeetandnetworkwithusatfogodechaotickets37898951785affutmsource3debemail26utmmedium3demail26utmcampaign3dneweventemailutmtermeventnametext so get your tickets now  ntickets include great networking all night long as well asn n name tags bordered in blue nn recruiters will have a name tag bordered in rednn a sponsored drink ticketnn app

### Build the matrix of TF-IDF coefficients

In [138]:
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,1),min_df=5)

response = vectorizer.fit_transform(events.description.values)

response_df = pd.DataFrame(response.todense())
response_df.columns = vectorizer.get_feature_names()

response_df

Unnamed: 0,10,100,1000,1000am,1000pm,10010,101,1030,1030am,1045am,...,youjamesmasked,youll,young,youre,youth,youtube,youve,yummy,zhou,zoom
0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.043869,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.091548,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.121223,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.091002,0.0,0.0,0.0,0.0,0.0,0.0
3,0.043869,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.091548,0.0,0.0,0.0,0.0,0.0,0.0
4,0.102797,0.142881,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5,0.098630,0.137090,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.120615,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.090546,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.072095,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


### Print out the most important terms

In [139]:
score_adjustment = 0.75

term_dictionary = dict()
for document in range(len(events.description.values)):
    row = response_df.loc[document]
    max_value = score_adjustment * row.max()
    terms = list(row[row >= max_value].index)
    
    term_dictionary[document] = terms

for idx in (100,222,390):    
    print(f"Tags: {term_dictionary[idx]}\n")
    print(events.description.values[idx])
    print("\n\n\n")

Tags: ['appetizers', 'bordered', 'chicagos', 'meet', 'profession', 'recruiters']

november 2nd 600900 pm lets meet and network at chicagos famed fogo de chao steakhouse located in river north at 661 n lasale blvd  enjoy some of the finer things in life with us and get down to the meet of your networking while sampling some of fogos delicious appetizers nnyou will meet and network with others in your profession or in the profession you would like to be in recruiters are welcome while enjoying a great night with chicagos top professionals nthere will be areas designated for various professional and social interest groups to assure you meet people of like interestsn early bird tickets are 1200 plus eventbrite fee at httpswwweventbritecomemeetandnetworkwithusatfogodechaotickets37898951785affutmsource3debemail26utmmedium3demail26utmcampaign3dneweventemailutmtermeventnametext so get your tickets now  ntickets include great networking all night long as well asn n name tags bordered in blue nn

### Generate value counts of the most common tags

In [140]:
term_list = []
for document in range(len(events.description.values)):
    terms = term_dictionary[document]
    terms = [term for term in terms if not term[0].isdigit()]
    term_list += terms
    
pd.Series(term_list).value_counts()

rsvp                  402
city                  402
drawing               388
notfound              354
class                 313
short                 261
evenings              260
series                260
welcomes              242
toastmasters          223
summer                212
cancelled             212
breakfast             195
public                178
cost                  174
coworking             174
chicago               166
job                   166
classes               164
drop                  163
run                   163
platform              163
fooddrink             162
afraid                162
actors                162
buzz                  162
park                  162
supplies              162
signup                162
sarah                 162
                     ... 
encouraged             62
twominute              62
craft                  62
awardwinning           62
comment                62
rebels                 62
wear                   62
door        

### What happens when we consider N-Grams from the text?

In [141]:
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2),min_df=5)

response = vectorizer.fit_transform(events.description.values)

response_df = pd.DataFrame(response.todense())
response_df.columns = vectorizer.get_feature_names()

response_df

Unnamed: 0,10,10 15,10 20,10 club,10 early,10 evite,10 investment,10 meetup,10 minute,10 minutes,...,youve,youve completed,youve know,youve painted,yummy,yummy menu,zhou,zhou art,zoom,zoom meeting
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.040052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.040052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.098303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.094640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Print out the most important terms

In [142]:
score_adjustment = 0.75

term_dictionary = dict()
for document in range(len(events.description.values)):
    row = response_df.loc[document]
    max_value = score_adjustment * row.max()
    terms = list(row[row >= max_value].index)
    
    term_dictionary[document] = terms

for idx in (100,222,390):    
    print(f"Tags: {term_dictionary[idx]}\n")
    print(events.description.values[idx])
    print("\n\n\n")

Tags: ['appetizers', 'bordered', 'chicagos', 'meet', 'meet network', 'profession', 'recruiters']

november 2nd 600900 pm lets meet and network at chicagos famed fogo de chao steakhouse located in river north at 661 n lasale blvd  enjoy some of the finer things in life with us and get down to the meet of your networking while sampling some of fogos delicious appetizers nnyou will meet and network with others in your profession or in the profession you would like to be in recruiters are welcome while enjoying a great night with chicagos top professionals nthere will be areas designated for various professional and social interest groups to assure you meet people of like interestsn early bird tickets are 1200 plus eventbrite fee at httpswwweventbritecomemeetandnetworkwithusatfogodechaotickets37898951785affutmsource3debemail26utmmedium3demail26utmcampaign3dneweventemailutmtermeventnametext so get your tickets now  ntickets include great networking all night long as well asn n name tags bor

### Generate value counts of the most common tags

In [143]:
term_list = []
for document in range(len(events.description.values)):
    terms = term_dictionary[document]
    terms = [term for term in terms if not term[0].isdigit()]
    term_list += terms
    
pd.Series(term_list).value_counts()

city                      402
rsvp                      393
drawing                   388
notfound                  354
class                     313
short                     261
evenings                  260
series short              260
evenings 710              260
series                    260
toastmasters              223
welcomes                  215
cancelled                 212
cancelled summer          212
summer                    212
breakfast                 195
cost                      174
chicago                   166
job                       166
classes                   164
platform                  163
run                       163
drop                      163
club open                 163
welcomes guests           162
welcome bring             162
platform studios          162
supplies arrive           162
low dont                  162
levels                    162
                         ... 
backgrounds walks          62
drinks monday              62
make power