# Notebook for topic modeling 

# 0. Imports

In [1]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
# !pip install nltk # can install on terminal or by uncommenting this line
# import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
# !pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## visualizing LDA--likely need to install
# !pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.3.1-py3-none-any.whl.metadata (9.7 kB)
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hDownloading FuzzyTM-2.0.9-py3-none-any.whl (31 kB)
Downloading pyFUME-0.3.1-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m108.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading simpful-2.12.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: fst-pso, miniful
  B

# 0. Load data

In [2]:
ab = pd.read_csv("../public_data/airbnb_text.zip")
ab.head()

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# 1. Preprocess documents

In this case, each name/name_upper, or listing title, we're treating as a document

## 1.1 Load stopwords list and augment with our own custom ones

In [7]:
list_stopwords = stopwords.words("english")

custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens', 
                      'staten island']

list_stopwords_new = list_stopwords + custom_words_toadd


## 1.2 Remove stopwords from lowercase version of corpus


In [8]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()
corpus_lower[0:5]

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
nostop_listing = [word for word in wordpunct_tokenize(example_listing) 
                          if word not in list_stopwords_new]
nostop_listing

['clean & quiet apt home by the park',
 'skylit midtown castle',
 'the village of harlem....new york !',
 'cozy entire floor of brownstone',
 'entire apt: spacious studio/loft by central park']

['cozy', 'entire', 'floor', 'brownstone']

## 1.3 stem and remove non-alpha

Other contexts we may want to leave digits in

In [9]:
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]

example_listing_preprocess


['cozi', 'entir', 'floor', 'brownston']

In [10]:
example_listing
example_listing_preprocess

'cozy entire floor of brownstone'

['cozi', 'entir', 'floor', 'brownston']

In [None]:
## Storage

In [None]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()
corpus_lower[0:5]

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
nostop_listing = [word for word in wordpunct_tokenize(example_listing) 
                          if word not in list_stopwords_new]
nostop_listing


## OR 
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]

example_listing_preprocess


## 1.4 Activity 1

# The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

# - Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
# - Apply the function iteratively to preprocess all the texts in `corpus_lower`. Output could either be a list where each list element is a string of a list (e.g., `cozy brownstone apt`), or a list of lists where each element is a tokenized string (e.g., `['cozy', 'brownstone', 'apt'])`

# Output is flexible: it could be a list of lists containing tokenized/stemmed text or a list of strings.

## 1.4 Activity 1

The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

- Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
- Apply the function iteratively to preprocess all the texts in `corpus_lower`. Output could either be a list where each list element is a string of a list (e.g., `cozy brownstone apt`), or a list of lists where each element is a tokenized string (e.g., `['cozy', 'brownstone', 'apt'])`

Output is flexible: it could be a list of lists containing tokenized/stemmed text or a list of strings.

In [18]:
def preprocess_text(text):
    if isinstance(text, str):
        tokens = wordpunct_tokenize(text.lower())
        porter = PorterStemmer()
        cleaned_tokens = [porter.stem(token) for token in tokens 
                          if token.isalpha() and token not in list_stopwords_new 
                          and len(token) > 2]
        return cleaned_tokens
    else:
        return []

# Apply the preprocessing function to all texts in the corpus_lower list
processed_texts = [preprocess_text(text) for text in corpus_lower]
processed_texts


[['clean', 'quiet', 'apt', 'home', 'park'],
 ['skylit', 'midtown', 'castl'],
 ['villag', 'harlem', 'new', 'york'],
 ['cozi', 'entir', 'floor', 'brownston'],
 ['entir', 'apt', 'spaciou', 'studio', 'loft', 'central', 'park'],
 ['larg', 'cozi', 'midtown', 'east'],
 ['blissartsspac'],
 ['larg', 'furnish', 'room', 'near', 'way'],
 ['cozi', 'clean', 'guest', 'room', 'famili', 'apt'],
 ['cute', 'cozi', 'lower', 'east', 'side', 'bdrm'],
 ['beauti', 'upper', 'west', 'side'],
 ['central', 'near', 'broadway'],
 ['love', 'room', 'garden', 'best', 'area', 'legal', 'rental'],
 ['wonder', 'guest', 'bedroom', 'singl'],
 ['west', 'villag', 'nest', 'superhost'],
 ['stop', 'studio'],
 ['perfect', 'parent', 'garden'],
 ['chelsea', 'perfect'],
 ['hip', 'histor', 'brownston', 'backyard'],
 ['huge', 'upper', 'east', 'cental', 'park'],
 ['sweet', 'spaciou', 'loft'],
 ['cbg', 'ctybgd', 'helpshaiti'],
 ['cbg', 'help', 'haiti', 'room'],
 ['cbg', 'help', 'haiti'],
 ['maison', 'de', 'bohemian'],
 ['sunni', 'bedroo

### From class solution

In [19]:
list_stopwords = stopwords.words("english")
custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens',
                      'staten island']
list_stopwords_new = list_stopwords + custom_words_toadd
porter = PorterStemmer()
corpus_lower = ab.name.str.lower().to_list()

# defining function
def process_string(text):
    if pd.isna(text):
        return None
    lower_text = text.lower()
    ## use wordpunct tokenize and filter out with one
    nostop_listing = [word for word in wordpunct_tokenize(lower_text)
                              if word not in list_stopwords_new]
    ## apply to one by iterating
    ## over the tokens in the list
    text_preprocess = [porter.stem(token)
                                for token in nostop_listing
                                if token.isalpha() and
                                len(token) > 2]
    return text_preprocess

# your code here to apply the function
preprocessing = [process_string(text) for text in corpus_lower]
preprocessing[:6]

[['clean', 'quiet', 'apt', 'home', 'park'],
 ['skylit', 'midtown', 'castl'],
 ['villag', 'harlem', 'new', 'york'],
 ['cozi', 'entir', 'floor', 'brownston'],
 ['entir', 'apt', 'spaciou', 'studio', 'loft', 'central', 'park'],
 ['larg', 'cozi', 'midtown', 'east']]

## 2.1 Define the dtm function and select data to transform into a document-term matrix

In [20]:
## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [21]:
## filter out na's
## for shorter runtime, random sampling of 1000
## get metadata for those
## and also renaming price col since it's likely to be corpus word
ab_small = ab.loc[~ab.name.isnull(),
           ['id', 'neighbourhood_group', 'price', 'name']].copy().rename(columns = {'price':
            'price_rawdata'}).sample(n = 1000, random_state = 422)

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
23821,19227560,Queens,100,Super Cozy!,super cozy!
22905,18560625,Brooklyn,30,Beautiful Private Bedroom by Prospect Park,beautiful private bedroom by prospect park
20426,16289576,Manhattan,80,Best Location on the Upper West Side! - Part II,best location on the upper west side! - part ii
2018,893413,Manhattan,2500,Architecturally Stunning Former Synagogue!,architecturally stunning former synagogue!
18790,14882137,Queens,50,"Large, beautiful room near Bushwick","large, beautiful room near bushwick"


## 2.2 Execute the dtm function to create the document-term matrix

In [22]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= ab_small.name_lower,
                      metadata = ab_small[['id', 'neighbourhood_group', 'price_rawdata']])



Sparse matrix form:
   (0, 841)	1
  (0, 281)	1
  (1, 152)	1
  (1, 693)	1
  (1, 157)	1
  (1, 205)	1
  (1, 698)	1
  (1, 653)	1
  (2, 165)	1
  (2, 537)	1
  (2, 637)	1
  (2, 856)	1
  (2, 902)	1
  (2, 939)	1
  (2, 774)	1
  (2, 657)	1
  (2, 471)	1

Dense matrix form:
    001  10  10m  10min  10mins  1100  12mins  14  15  15min  ...  yoga  york  \
0    0   0    0      0       0     0       0   0   0      0  ...     0     0   
1    0   0    0      0       0     0       0   0   0      0  ...     0     0   
2    0   0    0      0       0     0       0   0   0      0  ...     0     0   
3    0   0    0      0       0     0       0   0   0      0  ...     0     0   
4    0   0    0      0       0     0       0   0   0      0  ...     0     0   

   you  your  yu  zen  ღღღsteps  法拉盛中心私人房間獨立衛浴  溫馨大套房  獨一無二的紐約閣樓  
0    0     0   0    0         0              0      0          0  
1    0     0   0    0         0              0      0          0  
2    0     0   0    0         0              0      0  

In [23]:
## show first set of rows/cols
dtm_nopre.head()

## show arbitrary later cols in resulting data
dtm_nopre.shape
dtm_nopre.iloc[0:5, 480:500]

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,001,10,10m,10min,10mins,1100,...,yoga,york,you,your,yu,zen,ღღღsteps,法拉盛中心私人房間獨立衛浴,溫馨大套房,獨一無二的紐約閣樓
0,23821,19227560,Queens,100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22905,18560625,Brooklyn,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20426,16289576,Manhattan,80,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018,893413,Manhattan,2500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18790,14882137,Queens,50,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(1000, 974)

Unnamed: 0,inclusive,incredible,incredibly,indoor,inn,inq,insane,int,interior,international,interns,invincible,inviting,inwood,island,it,italy,its,jefferson,jewel
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 2.3 Use that matrix/column sums to get basic summary stats of top words

In [24]:
## summing each col
top_terms = dtm_nopre[dtm_nopre.columns[4:]].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

in           367
room         244
private      163
bedroom      152
apartment    130
            ... 
24hr           1
16             1
1940           1
19th           1
fulton         1
Length: 970, dtype: int64

## 2.4 Activity 2: repeat the above but using the preprocessed text data

- Stick with the same random sample of 1000 `ab_small`
- Apply the preprocessing steps from activity 1 to create a new column in `ab_small` with the preprocessed text (if you got stuck on that, try just removing stopwords)
- Use the `create_dtm` function to create a document-term matrix from the preprocessed data
- Use colsums to summarize

In [38]:
# Apply preprocessing to create a new column for preprocessed text
ab_small['name_preprocessed'] = ab_small['name'].apply(preprocess_text)
## Same thing -- ab_small['name_preprocessed'] = ab_small.name_lower.apply(preprocess_text)

# Apply the create_dtm function on the preprocessed text data
dtm_preprocessed = create_dtm(list_of_strings=ab_small['name_preprocessed'],
                              metadata=ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

# Display initial rows/columns of the DTM using a DataFrame display for better readability
display(dtm_preprocessed.head())

# Show the shape of the DTM and specific columns in a readable format
print('Shape of DTM:', dtm_preprocessed.shape)
display(dtm_preprocessed.iloc[0:5, 480:500])

# Calculate and display the most frequent terms using DataFrame sorting for clarity
top_terms_preprocessed = dtm_preprocessed[dtm_preprocessed.columns[4:]].sum().sort_values(ascending=False)
display(pd.DataFrame(top_terms_preprocessed.head(20), columns=['Frequency']))

Unnamed: 0,id,neighbourhood_group,price_rawdata,abcd,abod,access,acidot,acogedor,across,address,...,yard,year,yellow,yoga,york,zen,ღღღstep,法拉盛中心私人房間獨立衛浴,溫馨大套房,獨一無二的紐約閣樓
0,19227560,Queens,100,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18560625,Brooklyn,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16289576,Manhattan,80,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,893413,Manhattan,2500,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14882137,Queens,50,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Shape of DTM: (1000, 756)


Unnamed: 0,palac,para,paradis,park,parkway,part,patio,peac,peach,pendulum,penth,penthous,peopl,perfect,perfectli,person,perstay,pet,photo,photograph
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,Frequency
room,246
privat,165
bedroom,159
cozi,97
apt,91
spaciou,88
studio,87
park,73
sunni,68
williamsburg,67


# 3. Use gensim to more automatically preprocess/estimate a topic model

## 3.1 Creating the objects to feed the LDA modeling function

Different outputs described below: 
- Tokenized and preprocessed text 
- Dictionary 
- Corpus 

In [39]:
## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  ab_small.name_lower]


In [40]:
## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


{0: '!', 1: 'cozy', 2: 'super', 3: 'beautiful', 4: 'bedroom'}

In [41]:
## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)


### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering

Filtering out very rare and very common words reduced the length of dictionary from 1047 to 31.


{0: '!', 1: 'cozy', 2: 'beautiful', 3: 'bedroom', 4: 'park'}

In [42]:

## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

Sample of documents represented in dictionary format (with omitted words noted):


[([(0, 1), (1, 1)], {'super': 1}),
 ([(2, 1), (3, 1), (4, 1), (5, 1)], {'by': 1, 'prospect': 1}),
 ([(0, 1), (6, 1), (7, 1)],
  {'best': 1,
   'ii': 1,
   'location': 1,
   'on': 1,
   'part': 1,
   'side': 1,
   'upper': 1,
   'west': 1}),
 ([(0, 1)],
  {'architecturally': 1, 'former': 1, 'stunning': 1, 'synagogue': 1}),
 ([(2, 1), (8, 1), (9, 1), (10, 1), (11, 1)], {'bushwick': 1}),
 ([(4, 1), (8, 1), (9, 1), (12, 1), (13, 2)],
  {'bath': 1, 'bed': 1, 'by': 1, 'central': 1, 'college': 1, 'hunter': 1}),
 ([(9, 1), (11, 1), (14, 1), (15, 1)], {'bohemian': 1, 'brownstone': 1}),
 ([(16, 1)],
  {'fidi': 1, 'huge': 1, 'loft': 1, 'views': 1, 'w': 1, 'water': 1}),
 ([], {'hillside': 1, 'hotel': 1}),
 ([(5, 1), (9, 1), (11, 1), (14, 1), (15, 1)], {'airy': 1})]

##  3.2 Estimating the model

In [43]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True)

print(type(ldamod))



<class 'gensim.models.ldamodel.LdaModel'>


## 3.3  Seeing what topics the estimated model discovers

In [44]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)


(0, '0.167*"studio" + 0.136*"," + 0.089*"apt" + 0.080*"park" + 0.066*"east" + 0.059*"beautiful" + 0.042*"!" + 0.041*"1" + 0.038*"sunny" + 0.035*"/"')
(1, '0.185*"," + 0.132*"to" + 0.099*"cozy" + 0.087*"-" + 0.071*"/" + 0.057*"brooklyn" + 0.054*"2" + 0.046*"room" + 0.032*"!" + 0.032*"."')
(2, '0.181*"in" + 0.137*"apartment" + 0.071*"the" + 0.067*"bedroom" + 0.062*"of" + 0.055*"1" + 0.046*"private" + 0.041*"brooklyn" + 0.039*"sunny" + 0.036*"-"')
(3, '0.151*"bedroom" + 0.104*"in" + 0.100*"!" + 0.085*"spacious" + 0.073*"manhattan" + 0.073*"and" + 0.064*"apt" + 0.054*"with" + 0.041*"large" + 0.037*"-"')
(4, '0.249*"room" + 0.176*"in" + 0.133*"private" + 0.055*"williamsburg" + 0.043*"near" + 0.039*"/" + 0.032*"cozy" + 0.031*"with" + 0.030*"!" + 0.029*"&"')


In [45]:
    
## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l=[ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

[['super', 'cozy', '!'],
 ['beautiful', 'private', 'bedroom', 'by', 'prospect', 'park'],
 ['best',
  'location',
  'on',
  'the',
  'upper',
  'west',
  'side',
  '!',
  '-',
  'part',
  'ii'],
 ['architecturally', 'stunning', 'former', 'synagogue', '!'],
 ['large', ',', 'beautiful', 'room', 'near', 'bushwick']]

[[(0, 0.047678627),
  (1, 0.77147186),
  (2, 0.0712832),
  (3, 0.05103147),
  (4, 0.058534835)],
 [(0, 0.8634634),
  (1, 0.029863533),
  (2, 0.04278396),
  (3, 0.029619064),
  (4, 0.03427003)],
 [(0, 0.035039898),
  (1, 0.037996426),
  (2, 0.5884082),
  (3, 0.29560193),
  (4, 0.04295361)],
 [(0, 0.07455416),
  (1, 0.0805458),
  (2, 0.11128731),
  (3, 0.64231706),
  (4, 0.091295674)],
 [(0, 0.3875047),
  (1, 0.024831645),
  (2, 0.035103787),
  (3, 0.024438087),
  (4, 0.52812177)]]

### Visualizing 

In [None]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

## 3.4 Activity 3

- Preprocess the texts if you haven't already
- Run the topic model with preprocessed texts
- Play around with other parameters like `n_topics` to find a configuration that produces useful topics

If you get stuck on the preprocessing part, you can use below function and example code for applying it. Then continue as above (start with tokenizing).

In [47]:
# your code here
text_raw_tokens = [wordpunct_tokenize(preprocess_text) for preprocess_text in 
                  ab_small.name_lower]
text_raw_tokens

[['super', 'cozy', '!'],
 ['beautiful', 'private', 'bedroom', 'by', 'prospect', 'park'],
 ['best',
  'location',
  'on',
  'the',
  'upper',
  'west',
  'side',
  '!',
  '-',
  'part',
  'ii'],
 ['architecturally', 'stunning', 'former', 'synagogue', '!'],
 ['large', ',', 'beautiful', 'room', 'near', 'bushwick'],
 ['large',
  '1',
  'bed',
  '/',
  '1',
  'bath',
  'by',
  'central',
  'park',
  ',',
  'hunter',
  'college'],
 ['large', 'bohemian', 'room', 'in', 'brownstone', 'apartment'],
 ['sunny', 'huge', 'fidi', 'loft', 'w', 'water', 'views'],
 ['hillside', 'hotel'],
 ['private', 'room', 'in', 'large', 'airy', 'apartment'],
 ['popular', 'area', 'in', 'bk', 'and', 'close', 'to', 'manhattan'],
 ['cozy', 'room', 'in', 'sweet', 'east', 'village', 'apt', '!'],
 ['cozy', 'private', 'room', 'in', 'williamsburg'],
 ['east', 'flatbush', 'charmer'],
 ['clean', 'and', 'warm', 'room', 'in', 'flushing'],
 ['large',
  ',',
  'sunny',
  'room',
  'in',
  '2',
  'story',
  '2',
  'bedroom',
  'corn

In [None]:
# Hint: example code for preprocessing
def processtext(row, colname):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([porter.stem(i.lower()) 
                        for i in wordpunct_tokenize(string_of_col) if 
                        i.lower().isalpha() and len(i) >=3]) #and i not in punctlist]) # optional: remove punctuation too 
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)

# ab_small['text_preprocess'] = ab_small.apply(processtext,
#                             axis = 1,
#                             args = ["name_lower"])
#
# ab_small = ab_small[ab_small.text_preprocess != ""].copy()
#
# ab_small.head()

## GPT

In [56]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Assuming 'ab_small' DataFrame exists and 'name_preprocessed' contains the preprocessed text data

# Vectorize the preprocessed text data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(ab_small['name_preprocessed'])

# Define and fit the LDA model
n_topics = 5  # Change this number to experiment with different amounts of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(dtm)

# Helper function to print the topics found by the LDA model
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])

# Display the topics
lda

# !

# Storage

### Activity two

In [None]:
## ONE WAY I DID IT
# Apply preprocessing to create a new column for preprocessed text
# ab_small['name_preprocessed'] = ab_small['name'].apply(preprocess_text)

# # Using the create_dtm function with the preprocessed texts
# dtm_preprocessed = create_dtm(list_of_strings=ab_small['name_preprocessed'],
#                               metadata=ab_small[['id', 'neighbourhood_group', 'price_rawdata']])



# # Display initial rows/columns of the DTM
# print(dtm_preprocessed.head())
# # dtm_preprocessed.head()

# # Display some arbitrary later columns in the resulting data
# # print('Shape of DTM:', dtm_preprocessed.shape)
# # print(dtm_preprocessed.iloc[0:5, 480:500])

# # Sum each column to find the most frequent terms
# top_terms_preprocessed = dtm_preprocessed[dtm_preprocessed.columns[4:]].sum(axis=0)

# # Sorting from most frequent to least frequent
# top_terms_preprocessed_sorted = top_terms_preprocessed.sort_values(ascending=False)
# # print(top_terms_preprocessed_sorted)
# top_terms_preprocessed_sorted



## ANOTHER WAY I DID IT
# Apply preprocessing to create a new column for preprocessed text
# ab_small['name_preprocessed'] = ab_small['name'].apply(preprocess_text)


# # Apply the create_dtm function on the preprocessed text data
# dtm_preprocessed = create_dtm(list_of_strings=ab_small['name_preprocessed'],
#                               metadata=ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

# # Display initial rows/columns of the DTM using a DataFrame display for better readability
# display(dtm_preprocessed.head())

# # Show the shape of the DTM and specific columns in a readable format
# print('Shape of DTM:', dtm_preprocessed.shape)
# display(dtm_preprocessed.iloc[0:5, 480:500])

# # Calculate and display the most frequent terms using DataFrame sorting for clarity
# top_terms_preprocessed = dtm_preprocessed[dtm_preprocessed.columns[4:]].sum().sort_values(ascending=False)
# display(pd.DataFrame(top_terms_preprocessed.head(20), columns=['Frequency']))
