Using the News API to find articles related to rent from different years.

In [1]:
import csv
import os
import re

import pandas as pd
import requests

In [2]:
# Prefer a key supplied through the NEWSAPI_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# it is exposed in source control and should be rotated and then removed.
API_KEY = os.environ.get('NEWSAPI_KEY', '3d0b3cf27776446c885f1e8ca8356c8c')
endpoint = 'https://newsapi.org/v2/everything'

def set_query(search_str, sources=None):
    """Build the query-string parameters for a NewsAPI `everything` request.

    Parameters
    ----------
    search_str : str
        Search expression sent as the `q` parameter.
    sources : str, optional
        Value for the `sources` parameter; omitted from the request when falsy.

    Returns
    -------
    dict
        Parameters ready to pass to `requests.get(..., params=...)`.
    """
    query_params = {
        'apiKey': API_KEY,
        'q': search_str,
        'language': 'en',
        # The endpoint's sort parameter is named `sortBy`; the previous key
        # `order` is not a recognised parameter, so relevancy sorting was
        # never actually requested.
        'sortBy': 'relevancy',
    }
    if sources:
        query_params['sources'] = sources
    return query_params

def get_articles(search_str, sources=None) -> 'pd.DataFrame | None':
    """Fetch articles matching a search expression and return them as a DataFrame.

    Parameters
    ----------
    search_str : str
        Search expression, forwarded to `set_query`.
    sources : str, optional
        Forwarded to `set_query`.

    Returns
    -------
    pandas.DataFrame or None
        The response's `articles` list flattened with `pd.json_normalize`.
        When the response reports status `'error'`, the message is printed
        and None is returned.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(endpoint, params=set_query(search_str, sources), timeout=30)
    resp_json = resp.json()
    if resp_json.get('status') == 'error':
        print(f"Error: {resp_json.get('message')}")
        return None
    # Reuse the payload parsed above instead of decoding the body a second time.
    return pd.json_normalize(resp_json['articles'])

# Words removed from cleaned text. Built once as a frozenset so membership
# checks are constant-time instead of scanning a list rebuilt on every call.
_STOP_WORDS = frozenset({
    'the', 'are', 'and', 'for', 'our', 'but', 'that', 'you', 'your', 'like', 'with', 'was',
    'can', 'when', 'has', 'not', 'this', 'who', 'what', 'their', 'they', 'them', 'one', 'she', 'her',
    'could', 'from', 'his', 'him', 'he',
})


def clean_text(text: str):
    """Normalise free text into lower-case words separated by single spaces.

    Apostrophes are deleted (so "don't" becomes "dont"), every other
    character that is not an ASCII letter acts as a word separator, and words
    of one or two letters or listed in `_STOP_WORDS` are dropped.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. None or NaN) yields an empty string.

    Returns
    -------
    str
        The remaining words, lower-cased and joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # Delete straight and typographic apostrophes before splitting so that
    # contractions and possessives stay a single token in both spellings.
    text = re.sub("['\u2019]", '', text)
    # Maximal runs of ASCII letters are the words; punctuation, digits,
    # whitespace and line breaks all separate them.
    words = re.findall(r'[a-zA-Z]+', text)
    return ' '.join(
        wd.lower() for wd in words
        if len(wd) > 2 and wd.lower() not in _STOP_WORDS
    )

Our NewsAPI plan cannot search articles far back in the past, so we'll search the articles we do have access to.

In [3]:
rent_articles = get_articles(
    '("cost of living" OR "housing cost" OR rent) -vacation -car -movie -streaming -moved'
)
# get_articles returns None when the API reports an error; stop here with a
# clear message instead of failing in a later cell with an AttributeError.
if rent_articles is None:
    raise RuntimeError('NewsAPI request failed; see the error message printed above.')

In [4]:
# Show each column's non-null count and dtype for the fetched articles.
rent_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       84 non-null     object
 1   title        96 non-null     object
 2   description  92 non-null     object
 3   url          96 non-null     object
 4   urlToImage   91 non-null     object
 5   publishedAt  96 non-null     object
 6   content      96 non-null     object
 7   source.id    33 non-null     object
 8   source.name  96 non-null     object
dtypes: object(9)
memory usage: 6.9+ KB


We have 96 articles to go through. We will clean and isolate the title and description of the articles for ARM. Dropping the articles that have no description reduces the number of articles to 92.

In [5]:
# Keep only the rows that have a description *before* cleaning either column,
# so each title is paired with its own description. Filtering the
# descriptions alone would shift every pairing after the first missing one.
articles_with_desc = rent_articles.dropna(subset=['description'])
article_titles = [clean_text(title) for title in articles_with_desc['title']]
article_desc = [clean_text(desc) for desc in articles_with_desc['description']]
articles = [title + ' ' + desc for title, desc in zip(article_titles, article_desc)]

Split the words up into arrays and then save as a CSV file.

In [6]:
# One list of whitespace-separated words per article.
words_mat = list(map(str.split, articles))

In [7]:
# Write one CSV row per article with one cell per word. newline='' lets the
# csv module control line endings, and the with-block closes the file on exit,
# so no explicit close() is needed.
with open('../../data/rent_articles.csv', 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows(words_mat)