# Mid Presentation

## Setup

### Imports

In [2]:
# Local Modules
from etl_pipeline.link_extractor import Google, Bing, Yahoo, get_all_links
from etl_pipeline.content_extractor import get_content

# Other Imports
import nltk
import pandas as pd
import string

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
from wordcloud import WordCloud

### Functions

In [3]:
# Fetch the NLTK resources used by clean_text: the stop-word list and WordNet.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Shared helpers used by clean_text:
# the lemmatizer is backed by WordNet, the tokenizer keeps runs of word characters.
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# clean text
def clean_text(t):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: drop digit characters, turn newlines into spaces,
    strip ASCII punctuation, lowercase and tokenize with the module-level
    ``tokenizer``, drop English stop words, and lemmatize each token with
    the module-level ``lemmatizer``.

    Parameters
    ----------
    t : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Load the stop-word list once per call and keep it in a set, instead of
    # re-reading the list for every single token.
    stop_words = set(stopwords.words("english"))

    # remove numbers
    t = "".join(ch for ch in t if not ch.isdigit())

    # Replace newlines with a space (not an empty string) so the last word of
    # one line is not fused with the first word of the next; then trim the ends.
    t = t.replace("\n", " ").strip()

    # remove punctuation
    t = "".join(ch for ch in t if ch not in string.punctuation)

    # tokenization
    tokens = tokenizer.tokenize(t.lower())

    # remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnbergmann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnbergmann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Link Extraction

### Single Search Engines

In [4]:
# One client per search engine, all created with the same company name.
google, bing, yahoo = (
    engine_cls(company="Credit Suisse") for engine_cls in (Google, Bing, Yahoo)
)

#### Google

In [5]:
# Wrap the result of google.get_links(20) in a DataFrame and preview the first rows.
google_links = pd.DataFrame(google.get_links(20))
google_links.head()

Unnamed: 0,engine,se_link,se_title,se_source
0,Google,https://www.nytimes.com/2023/04/18/us/credit-s...,Beleaguered Swiss Bank Accused of Impeding Hun...,The New York Times
1,Google,https://www.reuters.com/business/finance/ubs-m...,UBS makes changes to buyback programme followi...,Reuters
2,Google,https://www.bloomberg.com/news/articles/2023-0...,UBS Gets Approval to Use Buyback Shares for Cr...,Bloomberg.com
3,Google,https://finance.yahoo.com/news/ubs-gets-approv...,UBS Gets Approval to Use Buyback Shares for Cr...,Yahoo Finance
4,Google,https://www.bloomberg.com/news/articles/2023-0...,Deutsche Bank Targets Asia's Rich as Credit Su...,Bloomberg.com


In [13]:
# Show the headline column of the Google results.
google_links.loc[:, "se_title"]

0     Beleaguered Swiss Bank Accused of Impeding Hun...
1     UBS makes changes to buyback programme followi...
2     UBS Gets Approval to Use Buyback Shares for Cr...
3     UBS Gets Approval to Use Buyback Shares for Cr...
4     Deutsche Bank Targets Asia's Rich as Credit Su...
5     Swiss government awards $9.7 mln contract rela...
6     Investors rethink risk after Credit Suisse bon...
7     A Lesson From Credit Suisse On The Unintended ...
8     Credit Suisse Failed to Probe Nazi Past, Senat...
9     DEADLINE ALERT: Award Winning Firm Labaton Suc...
10    Breakingviews - Rivals can feast on Credit Sui...
11    Credit Suisse to See Fifth of Wealth Assets Le...
12    Stricken Credit Suisse Brings Forward Quarterl...
13    CS LAWSUIT ALERT: Levi & Korsinsky Reminds Cre...
14    After SVB and Credit Suisse: Whither the finan...
15    UBS modifies share buyback program following C...
16    UBS to Apply Repurchased Shares Toward Credit ...
17    UBS Makes Changes To Buyback Program Follo

In [None]:
# Join all Google headlines into one string (str.cat omits missing titles),
# clean it, and render the result as a word cloud.
# clean_text requires the text as its argument; calling it with none raises TypeError.
titles_joined = google_links["se_title"].str.cat(sep=" ")
text = clean_text(titles_joined)
wordcloud = WordCloud().generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#### Bing

In [6]:
# Wrap the result of bing.get_links(20) in a DataFrame and preview the first rows.
bing_links = pd.DataFrame(bing.get_links(20))
bing_links.head()

Unnamed: 0,engine,se_link,se_title,se_source
0,Bing,https://www.bloomberg.com/news/articles/2023-0...,UBS Gets Approval to Use Buyback Shares for Cr...,Bloomberg L.P.
1,Bing,https://www.bloomberg.com/news/articles/2023-0...,Credit Suisse’s Chris McMillan Exits for Role ...,Bloomberg L.P.
2,Bing,https://www.msn.com/en-us/money/markets/ubs-mo...,UBS modifies share buyback program following C...,Seeking Alpha on MSN
3,Bing,https://www.msn.com/en-us/money/markets/ubs-ma...,UBS makes changes to buyback programme followi...,Reuters on MSN
4,Bing,https://www.msn.com/en-us/money/savingandinves...,Credit Suisse fund outflows widen to $5.6 bill...,MarketWatch on MSN


#### Yahoo

In [8]:
# Wrap the result of yahoo.get_links(20) in a DataFrame and preview the first rows.
yahoo_links = pd.DataFrame(yahoo.get_links(20))
yahoo_links.head()

Unnamed: 0,engine,se_link,se_title,se_source
0,Yahoo,https://www.wsj.com/articles/credit-suisse-fai...,"Credit Suisse Failed to Probe Nazi Past, Senat...",The Wall Street Journal
1,Yahoo,https://www.nytimes.com/2023/04/18/us/credit-s...,Beleaguered Swiss Bank Accused of Impeding Hun...,New York Times
2,Yahoo,https://seekingalpha.com/article/4594672-credi...,Credit Suisse And UBS Merger: Not Without Cons...,Seeking Alpha
3,Yahoo,https://money.usnews.com/investing/news/articl...,"C.Suisse to Move Forward Q1 Earnings, Report B...",US News & World Report
4,Yahoo,https://finance.yahoo.com/news/ubs-makes-chang...,UBS Makes Changes To Buyback Program Following...,Benzinga via Yahoo Finance


### Multi Search Engines

In [None]:
# Query Yahoo and Bing together in one call, with max_articles=20.
# The variable is named after the engines actually passed in (Yahoo, Bing).
yahoo_bing_links = get_all_links(engines=[Yahoo, Bing], max_articles=20)

### Filter by Dates

**Note**: Date filtering only works for the Google engine.

In [None]:
# Show only Google Search Engine

## Content Extraction

### Single Engine

### All Engines

## Full Search