In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last
# one. Several cells below rely on this to show intermediate values.
InteractiveShell.ast_node_interactivity = "all"

# Requests to arXiv

In [None]:
import os 
import sys 

In [None]:
import re
import json
import bs4
import requests

In [None]:
class Paper():
    """ A class that holds the information for an Arxiv paper.

    Attributes set by the constructor (or restored by ``load``):
        number      -- arXiv identifier, e.g. '1908.04905'
        title       -- paper title
        authors     -- list of author display names, or None
        author_ids  -- list of author keys, parallel to ``authors``, or None
        author_dict -- mapping of author key -> display name, or None
        abstract    -- abstract text
        link        -- abstract-page URL, or None when no number was given
    """

    def __init__(self, number=None, title=None, auths=None,abstract=None,fromfile=None):
        """ Initialize a paper with the arxiv number, title, authors, and abstract.

        auths is a mapping of author key -> display name. When fromfile is
        given (a JSON file path or an already-loaded mapping) every other
        argument is ignored and the attributes are restored from it.
        """

        if fromfile is not None:
            self.load(fromfile)

        else:
            self.number = number
            self.title = title
            if auths is not None:
                self.authors = list(auths.values())
                self.author_ids = list(auths.keys())
                self.author_dict = auths.copy()
            else:
                self.authors = None
                self.author_ids = None
                self.author_dict = None

            self.abstract = abstract
            # Concatenating directly raised TypeError for the default
            # number=None (and for any non-string identifier).
            if number is None:
                self.link = None
            else:
                self.link = u'http://arxiv.org/abs/{0}'.format(number)

    def format_line(self,strval, maxlength,pad_left,pad_right):
        """ Function to format a line of a given length.
        Used by the __str__ routine.

        The text is cut into rows that are exactly maxlength characters wide
        between pad_left and pad_right. A row that continues on the next one
        ends in a hyphen, which counts towards maxlength so that the right
        border stays aligned; the last row is padded with spaces.
        """
        text = strval.replace('\n', '').strip()
        width = max(int(maxlength), 1)
        # Leave room for the continuation hyphen (no room when width is 1).
        hyphen = u'-' if width > 1 else u''
        step = width - len(hyphen)

        rows = []
        while len(text) > width:
            rows.append(text[:step] + hyphen)
            text = text[step:]
        rows.append(text)

        rows = [row.ljust(width) for row in rows]
        return pad_left + (pad_right + '\n' + pad_left).join(rows) + pad_right

    def get_search_string(self):
        """ Return one lower-cased string with abstract, title, number,
        author keys and author names, separated by two spaces.
        Missing (None) fields contribute an empty entry instead of raising. """
        number = u'' if self.number is None else u'{0}'.format(self.number)
        fields = [(self.abstract or u'').lower(), (self.title or u'').lower(), number]
        fields += [a.lower() for a in (self.author_ids or [])]
        fields += [a.lower() for a in (self.authors or [])]
        return '  '.join(fields)

    def save(self,filename):
        """ Write the paper's attributes to filename as a single JSON object.

        The file is overwritten: appending a second object (the previous
        behaviour) produced a file that load() could not parse.
        """
        with open(filename,"w") as f:
            json.dump(vars(self),f)

    def load(self,filename):
        """ Restore attributes from a JSON file path or from a mapping.

        Anything exposing .items() is used directly; everything else is
        treated as a path, so a missing file raises the usual IOError /
        FileNotFoundError rather than an AttributeError on a string.
        """
        if hasattr(filename, 'items'):
            dat = filename
        else:
            with open(filename, 'r') as f:
                dat = json.load(f)
        for key,val in dat.items():
            setattr(self,key,val)


    def __eq__(self,paper):
        """ Papers are equal when their arXiv numbers are equal. """
        try:
            return (self.number == paper.number)
        except AttributeError:
            # Not paper-like: let Python fall back to its default comparison.
            return NotImplemented

    def __ne__(self,paper):
        result = self.__eq__(paper)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Defining __eq__ alone makes the class unhashable on Python 3;
        # hash on the same field that equality uses.
        return hash(self.number)

    # Ordering compares the numeric value of the arXiv number, so it only
    # works for identifiers that float() accepts (e.g. '1908.04905').
    def __le__(self,paper):
        return float(self.number) <= float(paper.number)
    def __ge__(self,paper):
        return float(self.number) >= float(paper.number)
    def __lt__(self,paper):
        return float(self.number) <  float(paper.number)
    def __gt__(self,paper):
        return float(self.number) >  float(paper.number)

    def __str__(self):
        """ Display the paper in a somewhat nice looking way:
        title, link and authors inside a box drawn with '%' characters. """

        maxlen = 80
        pad_char = u"\u0025"
        newline_char = u"\u000A"
        space_char = u"\u0020"
        tab_char = space_char * 4
        comma_char = u"\u002C"
        and_char = u"\u0026"

        pad_left = pad_char * 3 + tab_char
        pad_right = tab_char + pad_char * 3

        # Tolerate papers without authors, title or link.
        authors = list(self.authors or [])
        if not authors:
            authstr = u''
        elif len(authors) == 1:
            authstr = authors[0]
        else:
            authstr = (comma_char + space_char).join(authors[:-1])
            authstr += comma_char + space_char + and_char + space_char + authors[-1]

        authstr  = self.format_line(authstr,  maxlen, pad_left, pad_right)
        titlestr = self.format_line(self.title or u'', maxlen, pad_left, pad_right)
        linkstr  = self.format_line(self.link or u'', maxlen, pad_left, pad_right)
        border = pad_char * (maxlen + len(pad_left) + len(pad_right))
        blank_line = pad_left + space_char * maxlen + pad_right

        rows = [border, blank_line, titlestr, blank_line, linkstr,
                blank_line, authstr, blank_line, border]
        strbody = newline_char + newline_char.join(rows) + newline_char + newline_char

        # Check for python 2 to convert from unicode
        if sys.version_info < (3,):
            strbody = strbody.encode("utf8","ignore")
        return strbody

In [None]:
def authors_list_to_dict(author_list):
    """Turn a list of author names into {"<Last>_<Initial>": "<display name>"}.

    For each entry, everything from the first "(" onwards is dropped as an
    affiliation. The key is built from the last name token and the upper-cased
    first letter of the first token; the value is the whitespace-normalised
    name. A single token such as "J.Smith" is split on dots instead.

    Entries that contain no name at all (empty strings, affiliation-only
    fragments) are skipped. When two different authors would get the same key,
    later ones receive a numeric suffix ("Wang_J", "Wang_J_2", ...) so that no
    author is silently overwritten.
    """
    authors_dict = {}
    for raw_name in author_list:
        # We may have an affiliation: keep only the text before it.
        tokens = raw_name.split('(')[0].split()

        if len(tokens) == 1:
            # Just one name, probably a spacing error ("J.Smith").
            # Empty pieces from leading/trailing dots are discarded.
            tokens = [piece for piece in tokens[0].split('.') if piece]

        if not tokens:
            # Nothing usable (blank entry or affiliation-only fragment).
            continue

        # One, two or more tokens: first and last token identify the author.
        first, last = tokens[0], tokens[-1]
        display_name = ' '.join(tokens)

        base_key = last + '_' + first[0].upper()
        key = base_key
        suffix = 2
        # An identical repeated name reuses its key; a different author with
        # the same surname and initial gets a distinct one.
        while key in authors_dict and authors_dict[key] != display_name:
            key = '{0}_{1}'.format(base_key, suffix)
            suffix += 1

        authors_dict[key] = display_name
    return authors_dict

In [None]:
def read_paper_from_url(number, timeout=30):
    """Download an arXiv abstract page and build a Paper from it.

    Parameters
    ----------
    number : arXiv identifier; it is converted with str() and appended to
        'http://arxiv.org/abs/'. Pass it as a string to keep it exact.
    timeout : seconds to wait for the server before giving up (requests
        waits indefinitely when no timeout is given).

    Returns
    -------
    Paper built from the page's title, author list and abstract.

    Raises
    ------
    requests.HTTPError if the server answers with an error status;
    IndexError if the page lacks one of the expected elements.
    """
    response = requests.get('http://arxiv.org/abs/' + str(number), timeout=timeout)
    # Fail here on 404/5xx instead of with an IndexError while parsing below.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    # Each field is rendered as "<Label>: <text>"; keep what follows the label.
    title_tag = soup.find_all('h1', attrs={'class': 'title mathjax'})[0]
    title = title_tag.text.split('Title:')[-1].strip()

    authors_tag = soup.find_all('div', attrs={'class': 'authors'})[0]
    authors = [
        name.strip()
        for name in authors_tag.text.split('Authors:')[-1].split(',')
    ]

    abstract_tag = soup.find_all('blockquote', attrs={'class': 'abstract mathjax'})[0]
    abstract = abstract_tag.text.split('Abstract:')[-1].strip()

    return Paper(number, title, authors_list_to_dict(authors), abstract)

In [None]:
# Fetch one paper by its arXiv identifier (performs an HTTP request).
read_paper_from_url('1908.04905')

In [None]:
# `bowl` holds the Response object returned by requests.get().
# The identifier is written as a string: a float literal only round-trips by
# luck (e.g. 1908.04900 would become '1908.049').
bowl = requests.get('http://arxiv.org/abs/' + '1908.04905')

In [None]:
# Inspect the response fetched above. The body views are left commented out;
# only the headers are displayed.
#bowl.content;
#bowl.text;
bowl.headers

In [None]:
# Step-by-step version of the parsing done in read_paper_from_url, applied to
# the response stored in `bowl`; each extracted field is displayed in turn.
soup = bs4.BeautifulSoup(bowl.text, 'html.parser')

title_tag = soup.find_all('h1', attrs={'class': 'title mathjax'})[0]
title = title_tag.text.split('Title:')[-1].strip()
title

authors_tag = soup.find_all('div', attrs={'class': 'authors'})[0]
authors = [
    name.strip()
    for name in authors_tag.text.split('Authors:')[-1].split(',')
]
authors

abstract_tag = soup.find_all('blockquote', attrs={'class': 'abstract mathjax'})[0]
abstract = abstract_tag.text.split('Abstract:')[-1].strip()
abstract

# Requesting player statistics

In [None]:
import requests
# Imported under its own name: aliasing the class to `soup` overwrote the
# `soup` document variable that other cells in this notebook use.
from bs4 import BeautifulSoup
from pprint import pprint

player_name = [
    'Bernd Leno', 'Emiliano Martínez', 'Matt Macey', 'Héctor Bellerín'
]

# Maps each player name to a {stat label: stat value} dict scraped from the page.
player = {}
for name in player_name:
    # NOTE(review): the numeric id 10483 is the same for every name, so all
    # four requests may resolve to the same player page — confirm and supply
    # one id per player if so.
    player_page = requests.get(
        'https://www.premierleague.com/players/10483/{}/stats'.format(name),
        timeout=30)  # without a timeout a stalled connection blocks the cell
    cont = BeautifulSoup(player_page.content, 'lxml')

    # Pair each stat container (its first child is the label text) with the
    # nested <span> selected by the second query.
    labels = cont.select('.topStat span.stat, .normalStat span.stat')
    values = cont.select(
        '.topStat span.stat > span, .normalStat span.stat > span')
    player[name] = {
        label.contents[0].strip(): value.get_text(strip=True)
        for label, value in zip(labels, values)
    }

pprint(player)

In [None]:
# Shell escape: list the current directory, then print its path.
!ls &&pwd 
# NOTE(review): the path ends in "/" and the same folder is walked as a
# directory in the next cell (there without the "/Applications" prefix), so
# isfile() presumably returns False here — confirm whether isdir() and the
# "/Users/..." path were intended.
os.path.isfile("/Applications/Users/wangmiao/Playground/GH/IPython_training/basic/WiderKnowledge/")

In [None]:
# A dot followed by exactly five word characters, e.g. ".ipynb".
pattern = re.compile(r"\.\w{5}")

# Print the name of every file found under the notebook folder.
# (Filtering on pattern.findall(name)[0] == '.ipynb' was tried and left disabled.)
notebook_dir = "/Users/wangmiao/Playground/GH/IPython_training/basic/WiderKnowledge/"
for _dirpath, _dirnames, filenames in os.walk(notebook_dir):
    for filename in filenames:
        print(filename)

In [None]:
from os import listdir
from os.path import isfile, join

# Names of the regular files sitting directly inside `mypath`
# (sub-directories are left out, nothing is searched recursively).
mypath = "/Users/wangmiao/Playground/GH/IPython_training/basic/WiderKnowledge/"
list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))

In [None]:
# First match of `pattern` (compiled in an earlier cell) in a sample file name.
pattern.findall("Decorators.ipynb")[0]

# `bs4`

In [None]:
import requests, bs4

# Download the player page and print the <td> texts of every row of the
# table whose id is "per_game".
url = 'https://www.basketball-reference.com/players/a/abrinal01.html'
res = requests.get(url)
res.raise_for_status()

soup = bs4.BeautifulSoup(res.text, 'html.parser')
elems = soup.select('#per_game')

table = soup.find("table", attrs={"id": "per_game"})
table_rows = table.find_all('tr')

for tr in table_rows:
    # Rows without any <td> print as an empty list.
    print([cell.text for cell in tr.find_all('td')])

In [None]:
import requests, bs4

# Download the player page and print every row of the "per_game" table,
# header cells (<th>) included.
url = 'https://www.basketball-reference.com/players/a/abrinal01.html'
res = requests.get(url)
res.raise_for_status()

soup = bs4.BeautifulSoup(res.text, 'html.parser')

table = soup.find("table", { "id" : "per_game" })
if table is None:
    # Fail with a readable message instead of "'NoneType' has no attribute".
    raise ValueError('no <table id="per_game"> found at ' + url)
table_rows = table.find_all('tr')

for tr in table_rows:
    # One query for both tag names keeps the cells in document order.
    # Concatenating two separate queries moved every <th> to the end of its row.
    cells = tr.find_all(['th', 'td'])
    row = [cell.text for cell in cells]
    print(row)

# Checking for a line in a file
Source: https://stackoverflow.com/questions/57078822/how-to-check-to-see-if-a-certain-line-is-found-before-a-certain-point-in-a-txt-f

In [None]:
find = 'Is this found'

# Read the whole file once. readlines() consumes the file handle, so iterating
# the handle afterwards (as this cell used to) yields nothing; every later
# step works on the `lines` list instead.
with open('xx.txt') as old_file:
    lines = old_file.readlines()

if len(lines) > 2:
    # Third line, shown only when the file is long enough.
    print(lines[2])
print(lines)

for line in lines:
    print(line)
    if line.startswith("This"):
        print("line")

# Unfinished sketch kept from the original cell (not runnable as written):
#
#     with open(endfile1, "w") as new_file:
#         for num, line in enumerate(lines,1):
#             #if line "This is the" in line:
#                 line_base = num
#         for line in lines:
#             if not find in line.range(1:num):
#                 if line.startswith("This is the"):
#                     line = newbasecase + line

In [None]:
# Split every "key = value" line of test.txt into two parallel lists.
with open('test.txt') as f:
    lines = f.read().splitlines()

# Lines without "=" (blank lines, or the empty string a trailing newline used
# to produce) are skipped instead of raising IndexError. Only the first "="
# separates key from value, so values may themselves contain "=".
pairs = [line.split('=', 1) for line in lines if '=' in line]
left = [key.strip() for key, _ in pairs]
right = [value.strip() for _, value in pairs]
print(left)
print(right)



# Extracting data from `xml`

In [None]:
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and is bound to the same ET alias.
from xml.etree import ElementTree as ET
xmlstr = """<poll title="User Suggested Number of Players" totalvotes="0" name="suggested_numplayers">
<results numplayers="3+"> 
</results></poll>
"""
root = ET.fromstring(xmlstr)
# <poll> is the root element itself, so findall('poll') (which searches the
# root's children) found nothing. iter() includes the root when it matches.
levels = list(root.iter('poll'))
for level in levels:
    # totalvotes is an attribute of <poll>, not a child element.
    totalvotes = level.get('totalvotes')
    print('totalvotes', totalvotes)

In [None]:
# Plain string alternative: cut xmlstr at the first 'totalvotes=' and take the
# first whitespace-delimited token of what follows. The token keeps whatever
# quote characters surround the value in the source text.
xmlstr
a, b , c= xmlstr.partition("totalvotes=")
c.split()[0]

In [None]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without one BeautifulSoup picks whichever parser
# is installed (and warns), so results can differ between machines.
# 'html.parser' is the parser used by the other cells of this notebook.
soup = BeautifulSoup(xmlstr, 'html.parser')

# Attribute of the first <poll> element ...
soup.find('poll').get('totalvotes')
# ... and of every <poll> element in the document.
for poll in soup.find_all('poll'):
    print(poll.get('totalvotes'))

In [None]:
# Three <level> records, each with <name>, <age> and <gender> children.
source = '''<root>
<level>
  <name>Matthias</name>
  <age>23</age>
  <gender>Male</gender>
</level>
<level>
  <name>Foo</name>
  <age>24</age>
  <gender>Male</gender>
</level>
<level>
  <name>Bar</name>
  <age>25</age>
  <gender>Male</gender>
</level>
</root>'''

root = ET.fromstring(source)
# './/level' matches <level> elements at any depth below the root.
levels = root.findall('.//level')
for level in levels:
    name, age = (level.find(tag).text for tag in ('name', 'age'))
    print(name, age)

In [None]:
largerList = [10,10,10,10,10,0,10,10,10,10,15,15,15,15,15,10,10,0,10,10,12,12,12,0]

# Collapse each run of equal neighbouring values to a single value:
# start with the first element, then keep every element that differs from
# the one right before it.
sublist = largerList[:1]
for prev_item, item in zip(largerList, largerList[1:]):
    if item != prev_item:
        sublist.append(item)

# Check against the expected result.
sublist==[10,0,10,15,10,0,10,12,0]

In [None]:
from itertools import groupby

LargerList = [10,10,10,10,10,0,10,10,10,10,15,15,15,15,15,10,10,0,10,10,12,12,12,0]

# groupby() yields one (key, run) pair per run of equal neighbours;
# collecting the keys collapses every run to a single value.
sublists = []
for run_value, _run in groupby(LargerList):
    sublists.append(run_value)
sublists

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the forex performance page and collect its <div class="content"> blocks.
page = requests.get('https://finviz.com/forex_performance.ashx')
# Surface a blocked or failed request instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The class filter is compared against individual class names (and their
# space-joined form), so the old value "content " with a trailing space could
# never match and the result was always empty.
# NOTE(review): confirm "content" is the class actually present on the page.
forex = soup.find_all("div", {"class": "content"})
print(forex)

# Listing files

In [None]:
from os import listdir
from os.path import isdir

# Print the name of every entry found directly inside the Desktop folder.
# (A directories-only variant based on isdir() over '../storage' was tried
# and left disabled.)
desktop_dir = '/Users/wangmiao/Desktop/'
for entry in listdir(desktop_dir):
    print(entry)

In [None]:
# Shell escape: append a CONFIG_PATH assignment to test.py (">>" appends, so
# re-running this cell adds the line again).
!echo "CONFIG_PATH = '/usr/local/emarking/config/config.ini'" >> test.py

In [None]:
# Shell escape: show the current contents of test.py.
!cat test.py

In [None]:
# Shell escape: edit test.py in place, inserting the CONFIG_PATH assignment
# at line 33.
# NOTE(review): "-i" without a backup suffix and the one-line "33i<text>" form
# look like GNU sed syntax; the /Users/... paths elsewhere in this notebook
# suggest macOS — confirm this runs there. Presumably nothing is inserted when
# test.py has fewer than 33 lines — verify.
!sed -i "33iCONFIG_PATH = '/usr/local/emarking/config/config.ini'" test.py

In [None]:
# File-name filter: the name must end in .xls, .xlsx, .csv or .txt, and the
# part before that extension must be non-empty and contain no "~$" sequence.
regex_ = r'^((?!~\$).)+\.(?:xlsx?|csv|txt)$'

In [None]:
# Paths of every file below the current directory whose name matches regex_.
# os.walk() yields directory paths without a trailing separator (except for
# the './' root itself), so plain concatenation glued sub-directory and file
# name together ('./sub' + 'a.csv' -> './suba.csv'); os.path.join inserts the
# separator where it is needed.
all_files = [
    os.path.join(f_path, filename)
    for f_path, _, filenames in os.walk('./')
    for filename in filenames
    if re.search(regex_, filename)
]
