In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling, then switch to the 'poster' plotting context.
sns.set()
sns.set_context('poster')

import ipywidgets as widgets
from ipywidgets import interact
import nbinteract as nbi

# Keep DataFrame output compact: display at most 10 rows and 8 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 8

# Lecture 10: Working with Text

## Part 1: Cleaning Text

In [None]:
# Issues: the two lists appear to name the same four places, but the spellings
# differ, so the strings do not match as written:
#   - counties1 carries a 'County' / 'Parish' suffix; counties2 does not
#   - capitalization differs ('qui' vs 'Qui')
#   - spacing differs ('De Witt' vs 'DeWitt', plus trailing whitespace in counties2)
#   - 'and' vs '&'
#   - punctuation differs ('St' vs 'St.')

# Spellings with the 'County' / 'Parish' suffix.
counties1 = [
    'De Witt County',
    'Lac qui Parle County',
    'Lewis and Clark County',
    'St John the Baptist Parish',
]

# Spellings without the suffix; some entries have trailing whitespace.
counties2 = [
    'DeWitt  ',
    'Lac Qui Parle',
    'Lewis & Clark ',
    'St. John the Baptist',
]

Python string methods are useful: https://docs.python.org/3/library/stdtypes.html#string-methods

In particular, string replacements, deletions, and transformations are easy to do.

## Part 2: Extracting Fields


In [None]:
# Peek at the start of the log file (`head` prints the first 10 lines by default).
!head data/smallLog.txt

In [None]:
# The log is small, so pull every line into memory in one go.
with open('data/smallLog.txt') as log_file:
    log = list(log_file)  # one string per line, trailing newline kept
log

Works, but what if I want the individual components? E.g.

    ['26', 'Jan', '2014', '10', '47', '58', '-0800']


Wouldn't it be great to tell Python to just match this general pattern?

    [(day)/(month)/(year):(hour):(min):(sec) (timezone)]

### Regular Expressions

Regular expressions let you specify a pattern for a string. Follow attentively!

**Discussion question:** What happens if we remove the brackets?

## Survey Question:

In [None]:
# Raw HTML markup for a single tweet; presumably the input text for the survey question.
html = '<div class="js-tweet-text-container"><p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" lang="en" data-aria-label-part="0">Today, I was honored to be joined by Republicans and Democrats from both the House and Senate, as well as members of my Cabinet - to discuss the urgent need to rebuild and restore America’s depleted infrastructure. <a href="https://t.co/8ByoQJsjTT" rel="nofollow noopener" dir="ltr" data-expanded-url="http://45.wh.gov/UDL9yE" class="twitter-timeline-link" target="_blank" title="http://45.wh.gov/UDL9yE"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">45.wh.gov/UDL9yE</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a><a href="https://t.co/BVBRDvHfcC" class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr">pic.twitter.com/BVBRDvHfcC</a></p></div>'

# Placeholder for the regular expression to run against `html`; intentionally left empty here.
html_re = ''

In [None]:
# Sample URLs with a mix of schemes (ftp, https, file, market); some include a
# port, a path, a fragment, or percent-encoded characters.
urls = [
    'ftp://file_server.com:21/top_secret/life_changing_plans.pdf',
    'https://regexone.com/lesson/introduction#section',
    'file://localhost:4040/zip_file',
    'https://s3cur3-server.com:9999/',
    'market://search/angry%20birds',
]

# Placeholder for the regular expression to run against `urls`; intentionally left empty here.
url_re = ''

## Part 3: Creating Features

In [None]:
# Load the violations table; header=0 together with `names` swaps the file's
# own header row for the short column names given here.
vio = pd.read_csv(
    'data/violations.csv',
    header=0,
    names=['id', 'date', 'desc'],
)
desc = vio['desc']  # the description column on its own, used by the cells below
vio.head()

In [None]:
# Number of occurrences of each distinct description, most frequent first.
# The result is only assigned here; the cell displays nothing.
counts = desc.value_counts()

### `pandas` String Functions

In [None]:
# Length of every description, computed with the vectorized `.str` accessor.
# Equivalent to
# pd.Series([len(violation) for violation in desc])
desc.str.len()

In [None]:
# First character of every description; `.str[...]` indexes into each string.
# Equivalent to
# pd.Series([violation[0] for violation in desc])
desc.str[0]

In [None]:
# Strip the trailing bracketed annotation (and any whitespace before it) from
# each description. `only_desc` is used by every cell that follows, so it has
# to be defined here for the notebook to run top to bottom.
# Equivalent to
# pd.Series([re.sub(r'\s*\[.*\]$', '', violation) for violation in desc])
# regex=True is passed explicitly because newer pandas releases treat the
# pattern as a literal string by default.
only_desc = desc.str.replace(r'\s*\[.*\]$', '', regex=True)
only_desc.head()

In [None]:
# Print the 20 most common cleaned descriptions, one per line.
top_descriptions = only_desc.value_counts().head(20).index
for description in top_descriptions:
    print(description)

Let's define some features.

In [None]:
# Overwrite the `desc` column with `only_desc`. This mutates `vio` in place,
# so earlier displays of `vio` no longer reflect its current contents.
vio['desc'] = only_desc
vio.head()

In [None]:
# Add one boolean column per feature: True where the description matches the
# feature's regex (alternatives separated by `|`).
# NOTE(review): the patterns are lowercase and matching is case-sensitive --
# confirm the descriptions are lowercased, or capitalized text will not match.
with_features = vio.assign(**{
    feature: only_desc.str.contains(pattern)
    for feature, pattern in [
        ('is_clean', 'clean|sanit'),
        ('is_high_risk', 'high risk'),
        ('is_vermin', 'vermin'),
        ('is_surface', 'wall|ceiling|floor|surface'),
        ('is_human', 'hand|glove|hair|nail'),
        ('is_permit', 'permit|certif'),
    ]
})
with_features.head()

Now let's see which violations are most detrimental to the inspection scores:

In [None]:
# Load the inspections table: keep only the first three columns of the file
# and replace its header row with short column names.
ins = pd.read_csv(
    'data/inspections.csv',
    header=0,
    usecols=[0, 1, 2],
    names=['id', 'score', 'date'],
)
ins.head()

In [None]:
# Count, for each inspection (id, date), how many of its violations fall under
# each feature. Summing the boolean columns counts the True values.
# The free-text `desc` column is dropped first: summing it is meaningless, and
# pandas 2.x concatenates the strings instead of discarding the column, which
# would then show up as a bogus "feature" when the table is melted.
count_features = (with_features
 .drop(columns=['desc'])
 .groupby(['id', 'date'])
 .sum()
 .reset_index()
)
count_features.head()

In [None]:
# `ins` was already loaded from data/inspections.csv in an earlier cell and has
# not been modified since, so re-display it here rather than reading the file again.
ins.head()

In [None]:
# Reshape the per-inspection counts to long form (one row per inspection and
# feature), then attach each inspection's score by joining on (id, date).
long_counts = count_features.melt(
    id_vars=['id', 'date'],
    var_name='feature',
    value_name='num_vios',
)
with_scores = long_counts.merge(ins, on=['id', 'date'])
with_scores.head()

In [None]:
# One box-plot panel per feature: distribution of inspection scores for each
# number of violations, laid out three panels per row.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and the old name is not
# available in recent releases; fall back to it only on older installs.
facet_plot = getattr(sns, 'catplot', None) or sns.factorplot
facet_plot(x='num_vios', y='score',
           col='feature', col_wrap=3,
           kind='box',
           data=with_scores)