<a href="https://colab.research.google.com/github/DaiZack/USrealestateTopics/blob/main/USRealEstateTopics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# US Real Estate News Topic Analysis

GitHub link:
[https://github.com/DaiZack/USrealestateTopics](https://github.com/DaiZack/USrealestateTopics)



## 1. Load data


In [None]:
import pandas as pd
from google.colab import files
import warnings
warnings.filterwarnings("ignore")

# Pull the article table (already containing the lemmatized text) straight from GitHub.
df = pd.read_excel('https://github.com/DaiZack/USrealestateTopics/blob/main/data/USrealestatewithLemma.xlsx?raw=true')
# Articles without any body text cannot be analysed, so drop them.
df = df.dropna(subset=['content'])

article_text = df['content']
df['charactor counts'] = article_text.map(len)  # number of characters in each article
df['wordscount'] = article_text.map(lambda text: len(text.split()))  # number of whitespace-separated words in each article
# Average word length = total number of characters / total number of words.
df['average word length'] = df['charactor counts'].div(df['wordscount'])
df.head()  # preview the first 5 rows of the data

Unnamed: 0,title,source,date,Journal,author,wordscount,content,charactor counts,average word length,contentLemma
0,Home Prices Are Soaring. Is That the Fed’s Pro...,The New York Times,31 July 2021,The New York Times,By Jeanna Smialek,1643,Low interest rates are one reason that the hou...,10700,6.512477,low interest rate be one reason that the housi...
1,"A New York City Home for Less Than $350,000",The New York Times,30 July 2021,The New York Times,By C. J. Hughes,2228,Housing prices in New York are notoriously hig...,14629,6.565978,housing price in New York be notoriously high ...
2,The Housing Market Is on Fire. It Doesn’t Need...,The New York Times,29 July 2021,The New York Times,By Steven Rattner,823,On Wednesday the \nFederal Reserve\n \n ...,7728,9.390036,on Wednesday the \n Federal Reserve \n \n ...
3,"Randolph, N.J.: ‘A True New Jersey Hidden Gem’",The New York Times,28 July 2021,The New York Times,By Kathleen Lynn,1287,"Just 40 miles from Midtown, the Morris County ...",8138,6.323232,"just 40 mile from Midtown , the Morris County ..."
4,The Digest,The New York Times,27 July 2021,The New York Times,By The Associated Press and Michael J. de la M...,471,REAL ESTATE\nSales of New Homes\nDeclined 6.6%...,3073,6.524416,real ESTATE \n Sales of New Homes \n decline 6...


# Lemmatization (already run to save time; the result is stored in the `contentLemma` column of the Excel file)

In [None]:
# import spacy
# nlp = spacy.load('en_core_web_sm')
# def lemmatizing(text):
#   doc = nlp(str(text))
#   return ' '.join([w.lemma_ if not w.is_punct else w.text for w in doc])

# df['contentLemma'] = df['content'].apply(lemmatizing)
# df.to_excel('USrealestatewithLemma.xlsx', index=0)

# Download stopwords

In [None]:
# Download the custom stopword list from the project repository into ./stopwords.txt.
# sktopic() reads this file when it exists and otherwise falls back to the
# built-in 'english' stopword list. Because of `nohup`, wget's own output is
# appended to nohup.out instead of being shown in the cell.
!nohup wget -O stopwords.txt https://github.com/DaiZack/USrealestateTopics/blob/main/data/stopwords.txt?raw=true

nohup: ignoring input and appending output to 'nohup.out'


# Sklearn LDA

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from ipywidgets import FloatSlider,VBox, Dropdown, IntSlider, interact_manual,ToggleButton
from IPython.display import clear_output 
from tqdm.notebook import tqdm

# ---------------------------------------------------------------------------
# Interactive controls; each widget below is passed to sktopic() as the
# keyword argument of the same name.
# ---------------------------------------------------------------------------

# Largest n-gram size (in words) a single token may span.
ngrams = Dropdown(
    description='Max gram:',
    options=['1', '2', '3'],
    value='2',
    disabled=False,
    tooltip='max number of words in a token',
)

# Whether tokens may contain digits or only letters.
tokenPattern = Dropdown(
    description='keep numbers',
    options=['only alphabets', 'alphabets and number'],
    value='only alphabets',
    tooltip='whether remove the numbers from text',
)

# Raw term counts vs. TF-IDF weights as the document-term matrix.
vectorizer = Dropdown(
    description='Vectorizer:',
    options=['Counter', 'Tfidf'],
    value='Counter',
    tooltip='Which Vectorizer to Use',
)

# Shortest word (in characters) that is kept as a token.
minwordlength = Dropdown(
    description='MinWordLen',
    options=[2, 3, 4],
    value=4,
    tooltip='Minimum length of words',
)

# Number of LDA topics to fit.
ntopics = IntSlider(
    description='#topics',
    min=5,
    max=20,
    step=1,
    value=10,
    tooltip='how many topics to generate',
)

# Upper document-frequency bound: terms present in more than this share of
# articles are dropped.
maxdf = FloatSlider(
    description='Max DF',
    min=0.5,
    max=1,
    step=0.05,
    value=0.8,
    tooltip='at most how many percentages of articles the words can show up',
)

# Lower document-frequency bound: terms present in less than this share of
# articles are dropped.
mindf = FloatSlider(
    description='Min DF',
    min=0,
    max=0.3,
    step=0.05,
    value=0.05,
    tooltip='at lease how many percentages of articles the words must show up',
)

# Model the lemmatized text ('contentLemma') or the raw text ('content').
lemmatization = Dropdown(
    description='Lemmatization',
    options=[True, False],
    value=True,
    tooltip='whether to lemmatize the text',
)
# hbox = HBox([ngrams,tokenPattern,ntopics])

def sktopic(ngrams,tokenPattern, minwordlength,vectorizer, ntopics, maxdf, mindf,lemmatization):
    """Fit an LDA topic model on the article table and export the results.

    Reads the module-level DataFrame ``df`` (never modified; a copy is used),
    vectorizes one text column, fits ``LatentDirichletAllocation`` and then:

    * prints a table with the top keywords (and their weights) per topic,
    * writes the same keywords to ``topics{ntopics}.txt``,
    * writes ``dataWithTopics_{ntopics}.xlsx`` with an ``articles`` sheet
      (articles + per-topic weights + ``bestTopic``) and a ``keywords`` sheet,
    * triggers a Colab download of the Excel file.

    Parameters
    ----------
    ngrams : str or int
        Largest n-gram size; tokens span 1..ngrams words. The dropdown passes
        a string such as ``'2'``, so it is converted with ``int``.
    tokenPattern : str
        ``'only alphabets'`` keeps purely alphabetic tokens; any other value
        keeps ``\\w`` tokens (letters, digits, underscore).
    minwordlength : int
        Minimum number of characters in a token.
    vectorizer : str
        ``'Counter'`` selects ``CountVectorizer``; anything else selects
        ``TfidfVectorizer``.
    ntopics : int
        Number of topics to fit.
    maxdf, mindf : float
        Passed to the vectorizer as ``max_df`` / ``min_df``.
    lemmatization : bool
        If true the ``contentLemma`` column is modelled, otherwise ``content``.

    Returns
    -------
    None
    """
    import os
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    top_n = 20  # number of keywords reported for each topic

    dfrun = df.copy()  # deep copy, so the shared frame is left untouched
    jobs = tqdm(total = 10, desc='starting topic modeling')

    if tokenPattern == 'only alphabets':
        token_pattern = r'\b[a-zA-Z]{'+str(minwordlength)+r',}\b'
    else:
        token_pattern = r'\w{'+str(minwordlength)+r',}'
    jobs.update(1)
    jobs.set_description('starting job,')

    # Prefer the project stopword list when it has been downloaded.
    if os.path.exists('stopwords.txt'):
        with open('stopwords.txt', encoding='utf-8') as f:
            mystopwords = [word.strip().lower() for word in f.read().split() if word]
    else:
        mystopwords = 'english'

    # Honour the user's choice (previously overwritten by a hard-coded 2).
    max_ngram = int(ngrams)
    vectorizer_cls = CountVectorizer if vectorizer == 'Counter' else TfidfVectorizer
    tfVector = vectorizer_cls(ngram_range=(1, max_ngram), stop_words=mystopwords,
                              token_pattern=token_pattern, max_df=maxdf, min_df=mindf)

    lda = LatentDirichletAllocation(n_components=ntopics, random_state=0)
    textcol = 'contentLemma' if lemmatization else 'content'
    jobs.update(1)
    jobs.set_description('converting text to matrix ...')
    # Missing text would make the vectorizer raise; treat it as an empty document.
    vect = tfVector.fit_transform(dfrun[textcol].fillna('').astype(str))
    jobs.update(2)
    jobs.set_description('Calculating topics ...')
    topicsresult = lda.fit_transform(vect)
    # Reuse dfrun's index: concat(axis=1) aligns on index labels, and dfrun has
    # gaps left by the earlier dropna, so a fresh RangeIndex would attach the
    # topic weights to the wrong rows and create extra all-NaN rows.
    topicdf = pd.DataFrame(topicsresult, columns = [f'topic{n}' for n in range(ntopics)],
                           index=dfrun.index)
    dfrun = pd.concat([dfrun, topicdf],axis=1)
    dfrun['bestTopic'] = topicsresult.argmax(axis=1)
    jobs.update(3)
    jobs.set_description('Showing and saving output:')

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on older releases that lack get_feature_names_out().
    if hasattr(tfVector, 'get_feature_names_out'):
        keywordList = tfVector.get_feature_names_out()
    else:
        keywordList = tfVector.get_feature_names()

    keywordsLists = []
    with open(f'topics{ntopics}.txt', 'w', encoding='utf-8') as tp:
        for index, topic in enumerate(lda.components_):
            # Highest-weighted terms first, each rendered as "<term> <weight>".
            topickeywords = [f'{keywordList[i]} {round(topic[i],2)}' for i in topic.argsort()[::-1][:top_n]]
            tp.write(f'Top {len(topickeywords)} words for Topic #{index}\n')
            tp.write(', '.join(topickeywords) + '\n\n')
            keywordsLists.append(topickeywords)
    jobs.update(1)
    jobs.set_description('generating output file')

    # The vocabulary can be smaller than top_n, so size the columns from it.
    n_keywords = min(top_n, len(keywordList))
    keywordsListsdf = pd.DataFrame(keywordsLists, columns=range(n_keywords),
                                   index=[f'topic{n}' for n in range(ntopics)])
    # A topic that is nobody's best topic is absent from the groupby result;
    # reindex so every topic gets a count (0 when absent).
    articleCounts = (dfrun.groupby('bestTopic')['title'].count()
                     .reindex(range(ntopics), fill_value=0))
    keywordsListsdf['Num of articles'] = articleCounts.tolist()
    print(keywordsListsdf)
    with pd.ExcelWriter(f'dataWithTopics_{ntopics}.xlsx') as w:
        dfrun.to_excel(w, sheet_name='articles',index=0)
        keywordsListsdf.T.to_excel(w, sheet_name = 'keywords')
    jobs.update(1)
    jobs.set_description('downloading file')
    # files.download(f'topics{ntopics}.txt')
    files.download(f'dataWithTopics_{ntopics}.xlsx')
    jobs.update(1)
    jobs.set_description('Finished!')

    

# Show the controls together with a "Run Model" button; with interact_manual
# the model runs when the button is pressed rather than on every widget change.
myinteract = interact_manual.options(manual_name='Run Model')
im = myinteract(
    sktopic,
    ngrams=ngrams,
    tokenPattern=tokenPattern,
    minwordlength=minwordlength,
    vectorizer=vectorizer,
    ntopics=ntopics,
    maxdf=maxdf,
    mindf=mindf,
    lemmatization=lemmatization,
)


interactive(children=(Dropdown(description='Max gram:', index=1, options=('1', '2', '3'), value='2'), Dropdown…