# Capturing text data

In [12]:
import os

with open(os.path.join('data', 'hieroglyph.txt'), 'r') as f:
  text = f.read()
  print(text)

Hieroglyphic writing dates from c. 3000 BC, and is composed of hundreds of symbols. A hieroglyph can represent a word, a sound, or a silent determinative; and the same symbol can serve different purposes in different contexts. Hieroglyphs were a formal script, used on stone monuments and in tombs, that could be as detailed as individual works of art.



In [3]:
import pandas as pd

df = pd.read_csv(os.path.join('data','news.csv'))

df['title'] = df['title'].str.lower()
df.head()[['publisher', 'title']]

Unnamed: 0,publisher,title
0,Livemint,fed's charles plosser sees high bar for change...
1,IFA Magazine,us open: stocks fall after fed official hints ...
2,IFA Magazine,"fed risks falling 'behind the curve', charles ..."
3,Moneynews,fed's plosser: nasty weather has curbed job gr...
4,NASDAQ,plosser: fed may have to accelerate tapering pace


In [50]:
import requests

r = requests.get('https://quotes.rest/qod.json')

res = r.json()

q = res['contents']['quotes'][0]
print(q['quote'], '\n--', q['author'])

Feeling grateful to or appreciative of someone or something in your life actually attracts more of the things that you appreciate and value into your life. 
-- Christiane Northrup


# Cleaning

In [67]:
import requests

# Fetch a web page
r = requests.get("https://news.ycombinator.com")
print(r.text)

<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?sCHxrvQkfxYHN7U5AkST">
        <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
                            <a href="newest">new</a> | <a href="front">past<

In [68]:
import re

# Remove HTML tags using RegEx
pattern = re.compile(r'<.*?>')  # tags look like <...>
print(pattern.sub('', r.text))  # replace them with blank


        
          
        Hacker News
        
                  Hacker News
                            new | past | comments | ask | show | jobs | submit            
                              login
                          
              

            
      1.      Open Assistant – project meant to give everyone access to a great chat based LLM (github.com/laion-ai)
          238 points by pps 2 hours ago  | hide | 91&nbsp;comments        
              
      
                
      2.      ESP32 Buyer’s Guide: Different Chips, Firmware, Sensors (eitherway.io)
          229 points by eitherway 5 hours ago  | hide | 102&nbsp;comments        
              
      
                
      3.      Open Assistant: Conversational AI for Everyone (open-assistant.io)
          108 points by chriskanan 3 hours ago  | hide | 11&nbsp;comments        
              
      
                
      4.      Protocol Labs is laying off 21% of staff (89 people) (protocol.ai)
          74 poin

In [69]:
from bs4 import BeautifulSoup

# Remove HTML tags using Beautiful Soup library
soup = BeautifulSoup(r.text, "html5lib")
print(soup.get_text())


        
          
        Hacker News
        
                  Hacker News
                            new | past | comments | ask | show | jobs | submit            
                              login
                          
              

            
      1.      Open Assistant – project meant to give everyone access to a great chat based LLM (github.com/laion-ai)
          238 points by pps 2 hours ago  | hide | 91 comments        
              
      
                
      2.      ESP32 Buyer’s Guide: Different Chips, Firmware, Sensors (eitherway.io)
          229 points by eitherway 5 hours ago  | hide | 102 comments        
              
      
                
      3.      Open Assistant: Conversational AI for Everyone (open-assistant.io)
          108 points by chriskanan 3 hours ago  | hide | 11 comments        
              
      
                
      4.      Protocol Labs is laying off 21% of staff (89 people) (protocol.ai)
          74 points by throwaway

In [70]:
# Find all articles
summaries = soup.find_all("tr", class_="athing")
summaries[0]

<tr class="athing" id="34654937">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=34654937&amp;how=up&amp;goto=news" id="up_34654937"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><span class="titleline"><a href="https://github.com/LAION-AI/Open-Assistant">Open Assistant – project meant to give everyone access to a great chat based LLM</a><span class="sitebit comhead"> (<a href="from?site=github.com/laion-ai"><span class="sitestr">github.com/laion-ai</span></a>)</span></span></td></tr>

In [72]:
# Extract title
# summaries[0].find("a", class_="title").get_text().strip()

 # Normalisation


In [73]:
text = "The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?"
text = text.lower()
print(text)

the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?


In [74]:
import re
text = re.sub(r"[^a-zA-Z0-9]", " ", text)
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


# Tokenisation + NLTK

In [84]:
import os
import nltk
nltk.data.path.append(os.path.join(os.getcwd(), 'nltk_data'))
text = "Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers."

In [85]:
from nltk.tokenize import word_tokenize

words = word_tokenize(text)
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [86]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


In [102]:
!git push

To https://github.com/CalesSla/NLPIntroduction.git
   2fc1e8c8..d9765670  main -> main


In [103]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [105]:
words = [w for w in words if w not in stopwords.words('english')]
print(words)

['Dr.', 'Smith', 'graduated', 'University', 'Washington', '.', 'He', 'later', 'started', 'analytics', 'firm', 'called', 'Lux', ',', 'catered', 'enterprise', 'customers', '.']


# Part of Speech Tagging

In [107]:
from nltk import pos_tag

sentence = word_tokenize('I shot an elephan in my pajamas.')
pos_tag(sentence)

[('I', 'PRP'),
 ('shot', 'VBP'),
 ('an', 'DT'),
 ('elephan', 'NN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN'),
 ('.', '.')]

In [None]:
!git add .
!git commit -m "remove stopwords and implement POS ta"