### Working with Microsoft Word using python-docx
### for more information, visit: https://python-docx.readthedocs.io/en/latest/

In [20]:
#!pip install python-docx

In [8]:
from docx import Document

document = Document('sample.docx')

#print(document.paragraphs)

for p in document.paragraphs:
    print(p.text)


Template for Preparation of Papers for IEEE Sponsored Conferences & Symposia
Frank Anderson, Sam B. Niles, Jr., and Theodore C. Donald, Member, IEEE
Abstract—These instructions give you guidelines for preparing papers for IEEE conferences. Use this document as a template if you are using Microsoft Word 6.0 or later. Otherwise, use this document as an instruction set. Instructions about final paper and figure submissions in this document are for IEEE journals; please use this document as a “template” to prepare your manuscript. For submission guidelines, follow instructions on paper submission system as well as the Conference website. Do not delete the blank line immediately above the abstract; it sets the footnote at the bottom of this column.
INTRODUCTION
T
HIS document is a template for Microsoft Word versions 6.0 or later. If you are reading a paper version of this document, please download the electronic file, ieeeconf_letter.dot (for letter sized paper: 8.5” x 11”) or ieeeconf_A4

### Working with Adobe PDF using PyPDF2
### for more information, visit: https://realpython.com/pdf-python/

In [1]:
#!pip install pypdf2

In [2]:
from PyPDF2 import PdfFileReader, PdfFileWriter

def extract_information(pdf_path):
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        page = pdf.getPage(0)
        page.rotateClockwise(90)
        full_text = page.extractText()
    
    pdfWriter = PdfFileWriter()
    pdfWriter.addPage(page)
    with open('rotate_pages.pdf', 'wb') as fh:
        pdfWriter.write(fh)
    

    txt = f"""
    Information about {pdf_path}: 

    Author: {information.author}
    Creator: {information.creator}
    Producer: {information.producer}
    Subject: {information.subject}
    Title: {information.title}
    Number of pages: {number_of_pages}
    """
    
    
    print(txt)
    print(full_text)
    
    return information

if __name__ == '__main__':
    path = '/Users/javid/somefile.pdf'
    extract_information(path)
    
    


    Information about /Users/javid/somefile.pdf: 

    Author: None
    Creator: None
    Producer: None
    Subject: None
    Title: None
    Number of pages: 2
    
!"#$%&'
()&'*+,#"-'
.%/'0(1%)%-+
2345&"&'67")#'()&'67%875)3'0%95:';"#":#5%)'5)'<"=,'
*#/"(-,
!;5,:8%,"&'5,'('-"#$%&'()&',+,#"-'.%/'#(1%)%-+
2345&"&'"7")#'()&'"7%875)3'#%95:'
&"#":#5%)'5)')"=,',#/"(-,'/"8"7()#'#%'('357")'")#5#+'%7"/'('&".5)"&'#5-"'/()3">'
'?534/"'
@'5884,#/(#",'()'%7"/75"='%.'#$"'=%/A5)3'%.'#$"'-"#$%&'()&',+,#"-'#%'&"#":#'()&'
:8(,,5.+'"
7")#,'()&'"7%875)3'#%95:,'.%/'('357")'")#5#+'%7"/'('357")'#5-"'/()3">
''?534/"
'@'0$"'-"#$%&',#(/#,'=5#$':/"(#5)3'()'"),"-B8"'%.')"4/(8')"#=%/A'#/(5)"&'=%/&'7":#%/'
-%&"8,'
#$(#':(9#4/"'#$"'-"()5)3'()&',"-()#5:'/"8(#5%),$59,'B"#="")'A"+'9$/(,",'
5)'&(#(,"#'%.')"=,'()&'="B':/(=8,>''
0$"/"(.#"/C'#$"'-"#$%&'&"7"8%9,'('#(1%)%-+'%.'"7")#'
#%95:':
8(,,",'
4,5)3'
)23/(-,'()&'(,,%:5(#"&'7":#%/,
>''!%75)3'%)C'.%/'"(:$')"=,C'#$"'-"#$%&'"1#/(:#,'
,#/4:#4/"&'5).%/-(#5%)'
,4:$'(,'
")#

### Working with character encodings and HTML

In [13]:
from urllib import request
url = "https://en.wikipedia.org/wiki/Tesla,_Inc."
html = request.urlopen(url).read().decode('utf8')
html



In [15]:
from nltk import word_tokenize
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
print(raw)
#tokens = word_tokenize(raw)
#tokens






Tesla, Inc. - Wikipedia
document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c4e90d69-b6e6-4cff-8a5a-8a067ea25664","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Tesla,_Inc.","wgTitle":"Tesla, Inc.","wgCurRevisionId":960254708,"wgRevisionId":960254708,"wgArticleId":5533631,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 German-language sources (de)","CS1 French-language sources (fr)","CS1 Danish-language sources (da)","CS1 Dutch-language sources (nl)","CS1 Norwegian-language sources (no)","CS1 Swedish-language sources (sv)","Articles containing potentially dated statements from December 2017",
"All

In [9]:
"résumé".encode("utf-8")

b'r\xc3\xa9sum\xc3\xa9'

In [10]:
b"r\xc3\xa9sum\xc3\xa9".decode("utf-8")

'résumé'

In [11]:
"El Niño".encode("utf-8")

b'El Ni\xc3\xb1o'

In [12]:
b'El Ni\xc3\xb1o'.decode("utf-8")

'El Niño'

### Working with Python requests (REST API) & JSON libraries

In [35]:
#!pip install requests
import requests, json
from pprint import pprint

In [56]:
response = requests.get('https://api.opencorporates.com/companies/nl/17087985')
#response.content

output = json.loads(response.text)

output

{'api_version': '0.4.8',
 'results': {'company': {'name': 'Bover B.V.',
   'company_number': '17087985',
   'jurisdiction_code': 'nl',
   'incorporation_date': None,
   'dissolution_date': None,
   'company_type': 'Besloten Vennootschap',
   'registry_url': 'https://www.kvk.nl/zoeken/handelsregister/#!uitgebreid-zoeken&handelsnaam=&kvknummer=17087985&straat=&postcode=&huisnummer=&plaats=&hoofdvestiging=true&rechtspersoon=true&nevenvestiging=false&zoekvervallen=1&zoekuitgeschreven=1&start=0&initial=0&searchfield=uitgebreidzoeken',
   'branch': None,
   'branch_status': None,
   'inactive': False,
   'current_status': 'Active',
   'created_at': '2011-01-12T21:50:57+00:00',
   'updated_at': '2019-09-02T15:31:37+00:00',
   'retrieved_at': '2019-08-10T01:17:34+00:00',
   'opencorporates_url': 'https://opencorporates.com/companies/nl/17087985',
   'source': {'publisher': 'Kamer van Koophandel (KvK)',
    'url': 'https://www.kvk.nl/zoeken/handelsregister/#!uitgebreid-zoeken&handelsnaam=&kvknu