# Ch2. Core Python for Data Science excerpt

<div id="toc"></div>

## Unit4_Understanding Basic String Functions

In [1]:
"    Hello, world! \t\t\n" .strip()

'Hello, world!'

In [2]:
"Hello, world!" .split() # Two spaces!

['Hello,', 'world!']

In [4]:
"Hello, world!" .split(" ") # Two spaces!

['Hello,', 'world!']

In [5]:
"www.networksciencelab.com" .split(".")

['www', 'networksciencelab', 'com']

In [6]:
", " .join([ "alpha" , "bravo" , "charlie" , "delta" ])

'alpha, bravo, charlie, delta'

In [7]:
"-" .join( "1.617.305.1985" .split( "." ))

'1-617-305-1985'

In [8]:
" " .join( "This string\n\r has many\t\tspaces" .split())

'This string has many spaces'

In [9]:
"www.networksciencelab.com" .find( ".com" )

21

In [10]:
"www.networksciencelab.com" .count( "." )

2

## Unit5_Choosing the Right Data Structure

In [None]:
# Remove duplicates from myList: a set keeps a single copy of each item
uniqueItems = set(myList)
myList = list(uniqueItems)

In [12]:
# Build a list of ten million numeric strings
bigList = [str(i) for i in range(10000000)]
# Membership in a list is checked by comparing the items one by one
"abc" in bigList # Takes 0.2 sec

False

In [13]:
# Convert the list to a set; membership in a set is checked by hashing
bigSet = set(bigList)
"abc" in bigSet # Takes 15–30 μsec—10000 times faster!

False

In [14]:
# Pair every item with its position and use the positions as dictionary keys
seq = ["alpha", "bravo", "charlie", "delta"]
{position: item for position, item in enumerate(seq)}

{0: 'alpha', 1: 'bravo', 2: 'charlie', 3: 'delta'}

In [15]:
# Build a dictionary from a sequence of keys and a sequence of values
kseq = "abcd" # A string is a sequence, too
vseq = ["alpha", "bravo", "charlie", "delta"]
{key: value for key, value in zip(kseq, vseq)}

{'a': 'alpha', 'b': 'bravo', 'c': 'charlie', 'd': 'delta'}

## Unit6_Comprehending Lists through List Comprehension

In [None]:
# Copy myList item by item into a new list;
# same as myList.copy() or myList[:], but less efficient
[x for x in myList]

In [None]:
# Extract non-negative items (zeros are kept, negative items are dropped)
[x for x in myList if x >= 0]

In [None]:
# Build a list of squares, one for each item of myList
[x**2 for x in myList]

In [None]:
# Build a list of valid reciprocals;
# zeros are filtered out, so 1/x never divides by zero
[1/x for x in myList if x != 0]

In [None]:
# Collect all non-empty lines from the open file infile,
# with trailing and leading whitespaces removed.
# Note: l.strip() is evaluated twice for every kept line—
# once in the filter and once to produce the item
[l.strip() for l in infile if l.strip()]

In [None]:
# Collect all non-empty lines from the open file infile, stripped of
# leading and trailing whitespace. Each line is stripped only once: the inner
# generator expression hands over stripped lines one at a time, so no
# intermediate list of all lines is built
[line for line in (l.strip() for l in infile) if line]

In [None]:
# Parentheses instead of brackets make this a generator expression:
# the squares are computed lazily, one at a time, when the result is iterated
(x**2 for x in myList) # Evaluates to <generator object <genexpr> at 0x...>

## Unit7_Counting with Counters

In [17]:
from collections import Counter
# Split the phrase into words and tally how often each word occurs
phrase = "a man a plan a canal panama"
cntr = Counter()
cntr.update(phrase.split())
# List the (word, count) pairs, most frequent first
cntr.most_common()

[('a', 3), ('man', 1), ('plan', 1), ('canal', 1), ('panama', 1)]

In [18]:
# Turn the (word, count) pairs into a plain dictionary
cntrDict = {word: count for word, count in cntr.most_common()}

In [19]:
# Look up how many times the word 'a' was counted
cntrDict[ 'a' ]

3

## Unit8_Working with Files

In [None]:
# Open the file, read it, and close it explicitly.
# If reading raises an exception, f.close() is never reached
f = open(name, mode="r" )
«read the file»
f.close()

In [None]:
# The with statement closes f automatically when the block ends,
# even if reading raises an exception
with open(name, mode="r" ) as f:
    «read the file»

In [None]:
f.read() # Read all remaining data as a string (text mode) or bytes (binary mode)
f.read(n) # Read at most n characters (text mode) or n bytes (binary mode) from the current position
f.readline() # Read the next line, including its trailing newline if there is one
f.readlines() # Read all remaining lines as a list

In [None]:
f.write(line) # Write a string (text mode) or bytes (binary mode)
f.writelines(lines) # Write a list of strings; no line separators are added

## Unit9_Reaching the Web

In [None]:
import sys
import urllib.request

url = "http://www.networksciencelab.com"
try:
    with urllib.request.urlopen(url) as doc:
        html = doc.read()
        # If reading was successful, the connection is closed automatically
except OSError:
    # URLError and HTTPError are subclasses of OSError, as are low-level
    # connection failures. Report the URL rather than doc: doc is never
    # bound when urlopen() itself fails
    print( "Could not open %s" % url, file=sys.stderr)
    # Do not pretend that the document has been read!
    # Execute an error handler here

In [20]:
import urllib.parse
# Break a URL into scheme, network location, path, parameters, query, and fragment
URL = "http://networksciencelab.com/index.html;param?foo=bar#content"
urllib.parse.urlparse(url=URL)

ParseResult(scheme='http', netloc='networksciencelab.com', path='/index.html', params='param', query='foo=bar', fragment='content')

## Unit10_Pattern Matching with Regular Expressions

In [None]:
# Template call: pattern stands for a regular expression string,
# and module re must be imported before this line is run
compiledPattern = re.compile(pattern, flags=0)

### Understanding Regular Expression Language

To define a raw string, put the character r immediately in front of the opening quotation mark. The following two strings are equal, and neither of them
contains a newline character:
```py
"\\n"
r"\n"
```

```py
r"\w[-\w\.]*@\w[-\w]*(\.\w[-\w]*)+"
```
An email address.
```py
r"<TAG\b[^>]*>(.*?)</TAG>"
```
Specific HTML tag with a matching closing tag.
```py
r"[-+]?((\d*\.?\d+)|(\d+\.))([eE][-+]?\d+)?"
```
A floating point number.

### Searching, Splitting, and Replacing with Module re

In [24]:
import re

In [25]:
# Split at every single non-word character
re.compile(r"\W").split("Hello, world!")

['Hello', '', 'world', '']

In [26]:
# Treat each run of adjacent non-word characters (anything other than
# letters, digits, and underscores) as a single separator
re.split( r"\W+" , "Hello, world!" )

['Hello', 'world', '']

In [27]:
# match() succeeds only if the pattern matches at the very start of the string
mo = re.compile(r"\d+").match("067 Starts with a number")

In [28]:
# Show the substring that the pattern matched
mo.group()

'067'

In [29]:
# The string does not begin with a digit, so the result is None
# and the cell displays nothing
re.match(pattern=r"\d+", string="Does not start with a number")

In [30]:
# Find the first run of letters anywhere in the string, ignoring case
re.search(r"[a-z]+", "0010010 Has at least one 010 letter 0010010", flags=re.IGNORECASE)

<_sre.SRE_Match object; span=(8, 11), match='Has'>

In [31]:
# Case-sensitive version: the uppercase "H" cannot match [a-z],
# so the first match starts one character later
re.compile(r"[a-z]+").search("0010010 Has at least one 010 letter 0010010")

<_sre.SRE_Match object; span=(9, 11), match='as'>

In [32]:
# Collect every run of letters in the string, ignoring case
re.findall(r"[a-z]+", "0010010 Has at least one 010 letter 0010010", flags=re.IGNORECASE)

['Has', 'at', 'least', 'one', 'letter']

In [33]:
# Replace every run of lowercase letters and spaces with "[...]"
re.sub(pattern=r"[a-z ]+", repl="[...]", string="0010010 has at least one 010 letter 0010010")

'0010010[...]010[...]0010010'

## Unit11_Globbing File Names and Other Strings

In [None]:
import glob
# List the names in the current directory that match the pattern *.txt
glob.glob( "*.txt" )

## Unit12_Pickling and Unpickling Data

In [None]:
import pickle
# Dump an object into a file
# (object stands for the value to be saved; the file is opened in binary mode)
with open( "myData.pickle" , "wb" ) as oFile:
    pickle.dump(object, oFile)

In [None]:
import pickle
# Load the same object back.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
# NOTE: the name object shadows the built-in type of the same name;
# choose a more specific name in real code
with open( "myData.pickle" , "rb" ) as iFile:
    object = pickle.load(iFile)

## Your Turn
 

 http://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers