In [1]:
import pandas as pd
from omegaconf import OmegaConf
import os
from bs4 import BeautifulSoup
import spacy

# Show up to 200 characters per cell so long text columns stay readable.
pd.set_option("display.max_colwidth", 200)
# Base directory that the config and data paths below are joined onto.
ROOT = ".."


In [2]:
# Load the project configuration; it supplies the data paths used below
# (config.data.raw for reading, config.data.interim for writing).
config = OmegaConf.load(os.path.join(ROOT, "src/config/config.yaml"))


In [3]:
# Read the raw dataset from the CSV path given by config.data.raw.
data = pd.read_csv(os.path.join(ROOT, config.data.raw))


In [4]:
# Alphabetical list of the available column names.
sorted(data.columns)

['author',
 'description',
 'header_image',
 'keywords',
 'published_at',
 'publisher',
 'raw_description',
 'scraped_at',
 'short_description',
 'title',
 'url']

In [5]:
# Inspect one sample row (position 9) vertically, one field per line.
data.iloc[9].to_frame()


Unnamed: 0,9
title,Morgan Stanley Tries to Stave Off Ratings Cut
url,https://www.cnbc.com/2012/04/05/morgan-stanley-tries-to-stave-off-ratings-cut.html
published_at,2012-04-05T06:53:57+0000
author,
publisher,CNBC
short_description,"James Gorman, Morgan Stanley’s chief executive, has been in discussions with Moody’s in an attempt to maintain its credit ratings and stave off a downgrade that could diminish the bank’s ability t..."
keywords,"cnbc, Articles, Morgan Stanley, Goldman Sachs Group Inc, Citigroup Inc, Bank of America Corp, Business News, Economy, World Economy, Europe News, source:tagname:Financial Times"
header_image,https://image.cnbcfm.com/api/v1/image/19414059-Morgan_stanly5_new.jpg?v=1354732729
raw_description,"<div class=""group""><p>James Gorman, Morgan Stanley’s chief executive, has been in discussions with Moody’s in an attempt to maintain its credit ratings and stave off a downgrade that could diminis..."
description,"James Gorman, Morgan Stanley’s chief executive, has been in discussions with Moody’s in an attempt to maintain its credit ratings and stave off a downgrade that could diminish the bank’s ability t..."


In [6]:
# Strip the HTML from raw_description of the sample row and check whether the
# resulting plain text is exactly the description stored for the same row.
soup = BeautifulSoup(data.loc[9, "raw_description"], "html.parser")
soup.text == data.loc[9, "description"]


False

In [7]:
# Show the tag-stripped text so it can be compared by eye with description.
soup.text


'James Gorman, Morgan Stanley’s chief executive, has been in discussions with Moody’s in an attempt to maintain its credit ratings and stave off a downgrade that could diminish the bank’s ability to buy the rest of Citigroup brokerage Smith Barney, according to people familiar with the matter.,Morgan Stanley owns 51 percent of Smith Barney, and holds an option, which kicks in at the end of May, to increase its stake to 65 percent. Taking full control of the brokerage is a centrepiece of Mr Gorman’s strategy. Morgan Stanley declined to comment.People familiar with the bank’s thinking have said Morgan Stanley could consider buying all of Smith Barney outright, but its ultimate decision will depend on price. Analysts have valued Citi’s remaining Smith Barney stake at around $10 billion.Morgan Stanley would most likely have to issue debt to fund the purchase, people say. That would become more expensive if Morgan Stanley is downgraded. Moody’s put Morgan Stanley, along with five other bank

In [8]:
# Number of missing values in each column.
data.isna().sum()


title                  0
url                    0
published_at           0
author               228
publisher              0
short_description     16
keywords               0
header_image           0
raw_description       31
description           32
scraped_at             0
dtype: int64

In [9]:
# Per-column summary statistics, transposed so each column becomes a row.
data.describe().T


Unnamed: 0,count,unique,top,freq
title,625,625,Santoli’s Wednesday market notes: Could September’s stock shakeout tee up strength for the fourth quarter?,1
url,625,625,https://www.cnbc.com/2021/09/29/santolis-wednesday-market-notes-could-septembers-stock-shakeout-tee-up-strength-for-the-fourth-quarter.html,1
published_at,625,625,2021-09-29T17:09:39+0000,1
author,397,201,Lee Brodie,24
publisher,625,1,CNBC,625
short_description,609,609,"This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.",1
keywords,625,581,"cnbc, Articles, CNBC TV, Fast Money, source:tagname:CNBC US Source",10
header_image,625,479,https://sc.cnbcfm.com/applications/cnbc.com/staticcontent/img/cnbc_logo.gif?v=1524171804&w=720&h=405,141
raw_description,594,594,"<div class=""group""><p><em>This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.</em></p><ul><li>A muted, inconclusiv...",1
description,593,593,"This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.A muted, inconclusive bounce that has left the indexes fully wi...",1


In [10]:
# Missing values per column again; data has not been modified since the
# previous count, so the numbers are the same.
data.isna().sum()


title                  0
url                    0
published_at           0
author               228
publisher              0
short_description     16
keywords               0
header_image           0
raw_description       31
description           32
scraped_at             0
dtype: int64

In [11]:
# Compare the frame's shape before and after discarding every row that has at
# least one missing value; the index of the reduced frame is renumbered from 0.
data_dropped = data.dropna().reset_index(drop=True)
print(data.shape)
print(data_dropped.shape)


(625, 11)
(376, 11)


In [12]:
# Restrict to rows without a description and count what else they are missing.
data[data["description"].isna()].isna().sum()


title                 0
url                   0
published_at          0
author               15
publisher             0
short_description     2
keywords              0
header_image          0
raw_description      31
description          32
scraped_at            0
dtype: int64

In [13]:
# Restrict to rows without a raw_description and count what else they are missing.
data[data["raw_description"].isna()].isna().sum()


title                 0
url                   0
published_at          0
author               14
publisher             0
short_description     1
keywords              0
header_image          0
raw_description      31
description          31
scraped_at            0
dtype: int64

In [14]:
# Print the short and the full description of the first row, separated by a
# blank line, to see how the two relate.
print(data.loc[0, "short_description"], data.loc[0, "description"], sep="\n\n")


This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.

This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.A muted, inconclusive bounce that has left the indexes fully within yesterday's low-to-high range all morning so far.


In [15]:
# Display the full description of the first row as a single string value.
data.loc[0, "description"]


"This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics.A muted, inconclusive bounce that has left the indexes fully within yesterday's low-to-high range all morning so far."

In [16]:
# Naive first-sentence extraction: everything before the first "." character.
# The closing period is dropped, and any earlier "." (e.g. in an abbreviation
# such as "U.S.") would cut the sentence short.
data.loc[0, "description"].split(".")[0]


"This is the daily notebook of Mike Santoli, CNBC's senior markets commentator, with ideas about trends, stocks and market statistics"

Steps for preprocessing:
1. If description is empty and raw_description exists, strip all HTML tags from raw_description and use the result as description.
2. If short_description is empty and description exists, take the first sentence of description.
3. If author is empty, replace it with "Unknown".

# Description is empty but raw description exists

In [17]:
# Rows that lack a description even though a raw_description is present.
idx = data.loc[data["description"].isna() & data["raw_description"].notna()].index
data.loc[idx, ["description", "raw_description"]]


Unnamed: 0,description,raw_description
372,,"<div class=""group""></div>"


As we can see, description is empty here because raw_description consists only of a tag with no content in it, so there is no text to recover.

# Short description is empty but description exists

In [18]:
# Rows whose short_description is missing although a description is available.
idx = data.loc[data["short_description"].isna() & data["description"].notna()].index
print(len(idx))
data.loc[idx, ["short_description", "description"]]


14


Unnamed: 0,short_description,description
4,,"President Donald Trump hailed the U.S.-led intervention in Syria as ""perfectly executed,"" adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished i..."
19,,"In Monday’s Web Extra, Pete Najarian reveals where he’s seeing put buying. Also why stocks are plunging in Japan. This content is only available online - you won't find these trades on TV. _______..."
84,,"COOPERSTOWN, N.Y., Oct. 1, 2012 /PRNewswire/ -- The National Baseball Hall of Fame and Museum is adding to its art collection with the donation of a portrait depicting one of the sport's most famo..."
157,,"What was Wall Street saying about earnings season, Googlehitting an all-time high, Facebook’s1 billion users and European bank stocks? Find out in this week’s CNBC.com Stock Blog Roundup.Third-qua..."
184,,"Ireland's High Court on Thursday ruled that a 850 million euro ($1 billion) data center planned by Apple in the west of Ireland may proceed, dismissing an environmental challenge made by three peo..."
281,,"ARCHBOLD, Ohio, Oct. 1, 2012 /PRNewswire/ -- Paul S. Siebenmorgen, President and CEO of Farmers & Merchants State Bank in Archbold, Ohio, announced that construction will begin in the fall on the ..."
371,,An Air India passenger plane flying to the United States was escorted by British fighter jets to land in London on Thursday after a bomb threat.Air India said flight AI 191 from Mumbai to Newark h...
408,,"TEMPLE CITY, Calif., Oct. 1, 2012 /PRNewswire/ -- Temple City Unified School District and Chevron Energy Solutions today announced the completion of a transformative solar and energy efficiency pr..."
473,,"SAN FRANCISCO, Oct. 8, 2012 /PRNewswire/ -- TalentBin, the talent search engine that turbo-charges talent discovery across the web, today announced the launch of TalentBin Scores, a new addition t..."
482,,"Britain is drawing up contingency plans for the unlikely event it has to walk away from divorce talks with the European Union without a deal, Brexit minister David Davis said on Sunday.Prime Minis..."


In [19]:
# Only sentence boundaries are needed later, so every component except
# tok2vec and the parser is switched off. The components are disabled at load
# time via `disable=` because Language.disable_pipes() is deprecated in
# spaCy v3; the resulting pipe_names are the same.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["tagger", "attribute_ruler", "lemmatizer", "ner"],
)
nlp.pipe_names


['tok2vec', 'parser']

In [20]:
def get_summary(text: str, nlp: spacy.Language) -> str:
    """Return the first sentence of the text as segmented by spaCy.

    Sentence boundaries are taken from ``nlp(text).sents`` rather than from a
    plain split on ".", so the returned sentence keeps its closing punctuation.

    Parameters
    ----------
    text : str
        full text
    nlp : spacy.Language
        loaded spaCy pipeline used to split the text into sentences

    Returns
    -------
    str
        the first sentence of the full text, or an empty string when no
        sentence is found (e.g. the text is empty)
    """

    # Only the first sentence is consumed from the sentence iterator; the
    # default guards against a bare StopIteration when there is none.
    first_sentence = next(iter(nlp(text).sents), None)
    return "" if first_sentence is None else first_sentence.text


# Quick checks of get_summary: a text without a period, a two-sentence text,
# and a real excerpt that contains abbreviations ("U.S.") and a missing space
# after the first sentence.
print(get_summary("Hello", nlp))
print(get_summary("Hello. World", nlp))
text = """President Donald Trump hailed the U.S.-led intervention in Syria as "perfectly executed," adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished its goals.Less than a day after U.S., British and French forces targeted suspected chemical weapons sites in retaliation"""
print(get_summary(text, nlp))


Hello
Hello.
President Donald Trump hailed the U.S.-led intervention in Syria as "perfectly executed," adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished its goals.


In [21]:
# Show the first affected description and, after a blank line, the sentence
# that get_summary extracts from it.
text = data.loc[idx[0], "description"]
print(text, get_summary(text, nlp), sep="\n\n")


President Donald Trump hailed the U.S.-led intervention in Syria as "perfectly executed," adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished its goals.Less than a day after U.S., British and French forces targeted suspected chemical weapons sites in retaliation to an attack that left dozens of civilians dead last week, Trump thanked the U.S. coalition partners.Yet in an echo of former president George W. Bush, Trump used words that ultimately came back to haunt his predecessor, by pronouncing "Mission Accomplished." That characterization raised questions about whether Western forces would intervene again if Assad used chemical weapons again, or if the conflict escalated amid Russia's growing bellicosity."A perfectly executed strike last night. Thank you to France and the United Kingdom for their wisdom and the power of their fine Military. Could not have had a better result. Mission Accomplished!" Trump said in a Twitter post.Defens

In [22]:
# Fill each missing short_description with the first sentence of its description.
data.loc[idx, "short_description"] = data.loc[idx, "description"].apply(get_summary, nlp=nlp)


In [23]:
# Re-display the same rows to verify the filled-in short_description values.
data.loc[idx, ["short_description", "description"]]


Unnamed: 0,short_description,description
4,"President Donald Trump hailed the U.S.-led intervention in Syria as ""perfectly executed,"" adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished i...","President Donald Trump hailed the U.S.-led intervention in Syria as ""perfectly executed,"" adding that the military campaign to degrade Bashar Assad's chemical weapons capability had accomplished i..."
19,"In Monday’s Web Extra, Pete Najarian reveals where he’s seeing put buying.","In Monday’s Web Extra, Pete Najarian reveals where he’s seeing put buying. Also why stocks are plunging in Japan. This content is only available online - you won't find these trades on TV. _______..."
84,"COOPERSTOWN, N.Y., Oct. 1, 2012 /PRNewswire/ -- The National Baseball Hall of Fame and Museum is adding to its art collection with the donation of a portrait depicting one of the sport's most famo...","COOPERSTOWN, N.Y., Oct. 1, 2012 /PRNewswire/ -- The National Baseball Hall of Fame and Museum is adding to its art collection with the donation of a portrait depicting one of the sport's most famo..."
157,"What was Wall Street saying about earnings season, Googlehitting an all-time high, Facebook’s1 billion users and European bank stocks?","What was Wall Street saying about earnings season, Googlehitting an all-time high, Facebook’s1 billion users and European bank stocks? Find out in this week’s CNBC.com Stock Blog Roundup.Third-qua..."
184,"Ireland's High Court on Thursday ruled that a 850 million euro ($1 billion) data center planned by Apple in the west of Ireland may proceed, dismissing an environmental challenge made by three peo...","Ireland's High Court on Thursday ruled that a 850 million euro ($1 billion) data center planned by Apple in the west of Ireland may proceed, dismissing an environmental challenge made by three peo..."
281,"ARCHBOLD, Ohio, Oct. 1, 2012 /PRNewswire/ -- Paul S. Siebenmorgen, President and CEO of Farmers & Merchants State Bank in Archbold, Ohio, announced that construction will begin in the fall on the ...","ARCHBOLD, Ohio, Oct. 1, 2012 /PRNewswire/ -- Paul S. Siebenmorgen, President and CEO of Farmers & Merchants State Bank in Archbold, Ohio, announced that construction will begin in the fall on the ..."
371,An Air India passenger plane flying to the United States was escorted by British fighter jets to land in London on Thursday after a bomb threat.,An Air India passenger plane flying to the United States was escorted by British fighter jets to land in London on Thursday after a bomb threat.Air India said flight AI 191 from Mumbai to Newark h...
408,"TEMPLE CITY, Calif., Oct. 1, 2012 /PRNewswire/ -- Temple City Unified School District and Chevron Energy Solutions today announced the completion of a transformative solar and energy efficiency pr...","TEMPLE CITY, Calif., Oct. 1, 2012 /PRNewswire/ -- Temple City Unified School District and Chevron Energy Solutions today announced the completion of a transformative solar and energy efficiency pr..."
473,"SAN FRANCISCO, Oct. 8, 2012 /PRNewswire/ -- TalentBin, the talent search engine that turbo-charges talent discovery across the web, today announced the launch of TalentBin Scores, a new addition t...","SAN FRANCISCO, Oct. 8, 2012 /PRNewswire/ -- TalentBin, the talent search engine that turbo-charges talent discovery across the web, today announced the launch of TalentBin Scores, a new addition t..."
482,"Britain is drawing up contingency plans for the unlikely event it has to walk away from divorce talks with the European Union without a deal, Brexit minister David Davis said on Sunday.","Britain is drawing up contingency plans for the unlikely event it has to walk away from divorce talks with the European Union without a deal, Brexit minister David Davis said on Sunday.Prime Minis..."


# If author is empty then replace with `Unknown`

In [24]:
# Absolute number of rows without an author, followed by their share of all rows.
authors_nan_count = data["author"].isna().sum()
print(authors_nan_count, authors_nan_count / len(data), sep="\n")


228
0.3648


In [25]:
# Assign the filled column back rather than calling fillna(..., inplace=True)
# on data["author"]: that in-place call operates on an intermediate Series,
# which pandas warns about and which does not update `data` under copy-on-write.
data["author"] = data["author"].fillna("Unknown")


In [26]:
# Confirm that no missing authors remain.
print(data["author"].isna().sum())


0


# Overall result

In [27]:
# Missing values per column after the preprocessing steps above.
data.isna().sum()


title                 0
url                   0
published_at          0
author                0
publisher             0
short_description     2
keywords              0
header_image          0
raw_description      31
description          32
scraped_at            0
dtype: int64

In [28]:
# Column dtypes of the frame at this point.
data.dtypes


title                object
url                  object
published_at         object
author               object
publisher            object
short_description    object
keywords             object
header_image         object
raw_description      object
description          object
scraped_at           object
dtype: object

# Date

In [29]:
# Parse every timestamp and store it back as an ISO 8601 formatted string.
data["published_at"] = data["published_at"].map(
    lambda stamp: pd.to_datetime(stamp).isoformat()
)

# Saving

In [30]:
# Columns kept in the exported dataset; publisher, header_image,
# raw_description and scraped_at are left out.
columns_to_save = [
    "title",
    "url",
    "published_at",
    "author",
    "short_description",
    "description",
    "keywords",
]


In [31]:
# Take an independent copy restricted to the exported columns and report how
# many values are still missing in each of them.
data_to_save = data.loc[:, columns_to_save].copy()
print(data_to_save.isna().sum())
# Rows that still lack any exported field are removed before saving.
data_to_save = data_to_save.dropna()

title                 0
url                   0
published_at          0
author                0
short_description     2
description          32
keywords              0
dtype: int64


In [32]:
# Write the cleaned frame to the CSV path given by config.data.interim,
# without the index column.
data_to_save.to_csv(os.path.join(ROOT, config.data.interim), index=False)