In [91]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from collections import Counter

In [92]:
#only download this the first time you run the code. After that the packages will already be installed on your comp.
# nltk.download()

In [93]:
#read in the CSVs made in a previous notebook subdividing the 
violations_section = pd.read_csv("../data/violations_section.csv")
violations_TCA = pd.read_csv("../data/violations_TCA.csv")
violations_MCL = pd.read_csv("../data/violations_MCL.csv")
violations_description_id = pd.read_csv("../data/violations_description_id.csv")
violations_remainder = pd.read_csv("../data/violations_remainder.csv")

In [94]:
#group all the CSVs together into a list to make similar functions apply to all
violations_list = [violations_section, violations_TCA, violations_MCL, violations_description_id, violations_remainder]

In [95]:
#next two cells establish an index that will maintain original placement for concatenating dfs downstream
for violation in violations_list:
    assert 'Unnamed: 0' in violation.columns

In [96]:
for violation in violations_list:
    violation.set_index('Unnamed: 0', inplace=True)
    violation.index.rename('Original_Index', inplace=True)

In [97]:
#moving placed text from "id" column to "text" column
violations_description_id.head()

Unnamed: 0_level_0,id,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,(B). It is unlawful for any person to dispose ...,,N,,1.0
438,he or she shall serve notice of the risk and t...,,,,
439,tree at the expense of the property owner,,N,,2.0
707,"human occupation or use, upon failure or refus...",,,,
708,"remove or demolish, such dwelling or structure...",,N,,2.0


In [98]:
violations_description_id['text'] = violations_description_id['id'].values

In [99]:
violations_description_id.head()

Unnamed: 0_level_0,id,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,(B). It is unlawful for any person to dispose ...,,N,(B). It is unlawful for any person to dispose ...,1.0
438,he or she shall serve notice of the risk and t...,,,he or she shall serve notice of the risk and t...,
439,tree at the expense of the property owner,,N,tree at the expense of the property owner,2.0
707,"human occupation or use, upon failure or refus...",,,"human occupation or use, upon failure or refus...",
708,"remove or demolish, such dwelling or structure...",,N,"remove or demolish, such dwelling or structure...",2.0


In [100]:
#replace NaN values in "desc" and "text" columns with empty strings
for violation in violations_list:
    violation.drop('id', axis='columns', inplace=True)
    
    violation['desc'].replace(np.nan, '', regex=True, inplace=True)
    violation['text'].replace(np.nan, '', regex=True, inplace=True)

In [101]:
violations_section.head()

Unnamed: 0_level_0,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS
1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER
2,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT
3,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT
4,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE


In [102]:
violations_TCA.head()

Unnamed: 0_level_0,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
106,LANDREG,Landlord Registration Required,Tennessee Code Annotated (TCA) 66-28-101 - Fai...,CAAR_LAND_REG
376,CAAR_LAND_REG,LANDLORD REGISTRATION REQUIRED,Tennessee Code Annotated (TCA) 66-28-101 - Fai...,
807,BT575101A2,Not Manufacturer and Wholesale Retail,T.C.A 57-5-101(a)(2): Except as otherwise prov...,
808,BT575101A3,Not Wholesale and Manufacturer Retail,T.C.A 57-5-101(a)(3): Except as otherwise prov...,
809,BT575103A1,Operating without county city permit,T.C.A 57-5-103(a)(1): It is unlawful to operat...,


In [103]:
violations_MCL.head()

Unnamed: 0_level_0,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
481,CAGA_DEX,DOG EXCREMENT,MCL Chapter 8.04.180 – Removal of Excrement. ...,
482,CAGB_HAZ_W,HAZARDOUS WASTE,"MCL Chapter 10.20.150 - Hazardous, pathogenic ...",
483,CAGC_GCL,GARBAGE CONTAINER LIDS,MCL Chapter 10.20.160 – Container Requirements...,
484,CAGD_ID,ILLEGAL DUMPING,MCL Chapter 10.20.320 – Illegal Dumping. It i...,
485,CAGE_LPR,LITTER ON PRIVATE PROPERTY,MCL Chapter 10.24.070 - Litter on private prop...,


In [104]:
violations_description_id.head()

Unnamed: 0_level_0,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15,,N,(B). It is unlawful for any person to dispose ...,1.0
438,,,he or she shall serve notice of the risk and t...,
439,,N,tree at the expense of the property owner,2.0
707,,,"human occupation or use, upon failure or refus...",
708,,N,"remove or demolish, such dwelling or structure...",2.0


In [105]:
violations_remainder.head()

Unnamed: 0_level_0,type,desc,text,remedial
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
126,MVENVEND,Mobile Vending,Ordinance BL2006-1283 - Mobile Vendor: The ped...,CAAZ_MVEN_VENDING
196,TRASHCANS,Trash Cans,Ordinance 89-826 - Trash Cans: Refuse containe...,CAAH_TRASH_CANS
209,SWVIOLPTP,Violations - Post Construction Treatment,See SW Code,
210,SWVIOLWQ,Violations for Water Quality Impacts,See SW Code,
211,SWENVCRT,Environmental Court Proceedings,See SW Code,


In [106]:
#grouping the dfs into one 
#Though these steps seem unnecessary it was useful in identifying and removing errors or incomplete entries
master_violations = pd.concat(violations_list)

In [107]:
#viewing entries in each column. 
#"type" column appears to have correct number of values, extra are errors that will be removed downstream
master_violations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 809 entries, 0 to 933
Data columns (total 4 columns):
type        788 non-null object
desc        809 non-null object
text        809 non-null object
remedial    220 non-null object
dtypes: object(4)
memory usage: 31.6+ KB


In [108]:
#these next cells end up with new columns that contain the tokenized words from the "desc" and "text" columns
#this cell writes functions to create the tokens for the two columns
def tokenize_column(column_values):
    return word_tokenize(column_values)
    
def tokenize_row(index, row):
    desc = row['desc']
    text = row['text']
    
    desc_tokenized, text_tokenized = [], []
    
    if desc:
        desc_tokenized = tokenize_column(desc)
        
    if text:
        text_tokenized = tokenize_column(text)

    return (index, desc_tokenized, text_tokenized)

In [109]:
#this code calls the previously made functions to iterate over each row in the dataframe and create tokens
tokenized_data = list()

for index, row in master_violations.iterrows():
    tokenized_row_data = tokenize_row(index, row)
    tokenized_data.append(tokenized_row_data)

In [110]:
#this code places the tokens into a dataframe to merge back to the original
#"Original_index" is important hear for downstream merging
tokenized_df = pd.DataFrame(tokenized_data, columns=['Original_Index', 'tokenized_desc', 'tokenized_text'])
tokenized_df.set_index('Original_Index', inplace=True)

tokenized_df.head()

Unnamed: 0_level_0,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,[Banners],"[Section, 17.32.060, &, 17.32.070, -, Banners,..."
1,"[Proper, Maintenance, Req, -, Owner]","[Section, 16.16.030, (, B, ), -, Proper, maint..."
2,"[Proper, Maintenance, Req]","[Section, 16.16.030, (, A, ), -, Proper, maint..."
3,"[Building, Permit, Required]","[Section, 16.28.010, -, Building, Permit, Requ..."
4,"[Scope, of, Building, Code]","[Section, 16.16.020, (, B, ), -, The, provisio..."


In [111]:
#joining the tokens to the original df
master_violations = master_violations.join(tokenized_df)
master_violations.head()

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,BANNERS,Banners,Section 17.32.060 & 17.32.070 - Banners: Bann...,CAAZ_BANNERS,[Banners],"[Section, 17.32.060, &, 17.32.070, -, Banners,..."
1,BGMATOWNER,Proper Maintenance Req - Owner,Section 16.16.030 (B) - Proper maintenance req...,CAAH_BG_MATOWNER,"[Proper, Maintenance, Req, -, Owner]","[Section, 16.16.030, (, B, ), -, Proper, maint..."
2,BLDGMAINT,Proper Maintenance Req,Section 16.16.030 (A) - Proper maintenance req...,CAAH_BLDG_MAINT,"[Proper, Maintenance, Req]","[Section, 16.16.030, (, A, ), -, Proper, maint..."
3,BLDGPERMIT,Building Permit Required,Section 16.28.010 - Building Permit Required: ...,CAAB_BLDG_PERMIT,"[Building, Permit, Required]","[Section, 16.28.010, -, Building, Permit, Requ..."
4,BLDGSCOPE,Scope of Building Code,Section 16.16.020 (B) - The provisions of this...,CAAH_BLDG_SCOPE,"[Scope, of, Building, Code]","[Section, 16.16.020, (, B, ), -, The, provisio..."


In [112]:
#This is creating subsets previously described to subdivide the work of establishing buckets for the violations
#The subsets were determined based on the "text" column
#They were created by subtracting observable patterns from the large df 
#and grouping the "remainder" as one to be processed similarly

#subset that contains rows with the regular expression "Section #." in text column
master_section = master_violations[master_violations.text.str.match(r"(S|s)ection\s(\d+)\.", na=False)]

#new df where rows containing "Section #" are removed
master_nosection = master_violations.drop(master_section.index)

#subset that contains rows with variations of TCA in text column
master_TCA = master_nosection[master_nosection.text.str.match("(T.C.A|Tennessee Code Annotated)", na=False)]

#new df where rows containing "T.C.A" or Tennessee Code Annotated" are removed
master_noTCA = master_nosection.drop(master_TCA.index)

#subset that contains variations of MCL in text column 
#Be aware MCL and MCL Chapter appear to be different types of codes
master_MCL = master_noTCA[master_noTCA.text.str.match("(M.C.L|MCL Chapter)", na=False)]

#new df where rows containing "M.C.L" or "MCL Chapter" are removed
master_remainder = master_noTCA.drop(master_MCL.index)
#master_remainder

In [113]:
#export csv files containing subsets for section, TCA, MCL, descriptions in id column, and remaining entries.
master_section.to_csv("../data/master_section.csv")
master_TCA.to_csv("../data/master_TCA.csv")
master_MCL.to_csv("../data/master_MCL.csv")
master_remainder.to_csv("../data/master_remainder.csv")

In [114]:
#My task was to find buckets within the MCL df
#At first glance I noticed two different kinds of entries in the "text" column: "MCL Chapter" and "M.C.L"
master_MCL

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
481,CAGA_DEX,DOG EXCREMENT,MCL Chapter 8.04.180 – Removal of Excrement. ...,,"[DOG, EXCREMENT]","[MCL, Chapter, 8.04.180, –, Removal, of, Excre..."
482,CAGB_HAZ_W,HAZARDOUS WASTE,"MCL Chapter 10.20.150 - Hazardous, pathogenic ...",,"[HAZARDOUS, WASTE]","[MCL, Chapter, 10.20.150, -, Hazardous, ,, pat..."
483,CAGC_GCL,GARBAGE CONTAINER LIDS,MCL Chapter 10.20.160 – Container Requirements...,,"[GARBAGE, CONTAINER, LIDS]","[MCL, Chapter, 10.20.160, –, Container, Requir..."
484,CAGD_ID,ILLEGAL DUMPING,MCL Chapter 10.20.320 – Illegal Dumping. It i...,,"[ILLEGAL, DUMPING]","[MCL, Chapter, 10.20.320, –, Illegal, Dumping,..."
485,CAGE_LPR,LITTER ON PRIVATE PROPERTY,MCL Chapter 10.24.070 - Litter on private prop...,,"[LITTER, ON, PRIVATE, PROPERTY]","[MCL, Chapter, 10.24.070, -, Litter, on, priva..."
486,CAGF_LPU,LITTER ON PUBLIC PROPERTY,MCL Chapter 10.24.140 - Litter on vacant lots....,,"[LITTER, ON, PUBLIC, PROPERTY]","[MCL, Chapter, 10.24.140, -, Litter, on, vacan..."
487,CAGG_LVH,LITTER FROM VEHICLES,MCL Chapter 10.24.150 - Litter from vehicles--...,,"[LITTER, FROM, VEHICLES]","[MCL, Chapter, 10.24.150, -, Litter, from, veh..."
488,CAGH_HWG,EXCESS VEGETATION,MCL Chapter 10.26.010 – Every premise includin...,,"[EXCESS, VEGETATION]","[MCL, Chapter, 10.26.010, –, Every, premise, i..."
489,CAGI_GCAN,GARBAGE APPROVED CONTAINERS,MCL Chapter 10.32.120 - All garbage and trash ...,,"[GARBAGE, APPROVED, CONTAINERS]","[MCL, Chapter, 10.32.120, -, All, garbage, and..."
490,CAGJ_GT_RATS,RAT HARBORAGE GARBAGE,MCL Chapter 10.32.130 – Permitting garbage and...,,"[RAT, HARBORAGE, GARBAGE]","[MCL, Chapter, 10.32.130, –, Permitting, garba..."


In [115]:
#new df with only MCL entries in it
MCL_MCL = master_MCL[master_MCL.text.str.match(r"M.C.L")]
MCL_MCL.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 735 to 881
Data columns (total 6 columns):
type              76 non-null object
desc              76 non-null object
text              76 non-null object
remedial          0 non-null object
tokenized_desc    76 non-null object
tokenized_text    76 non-null object
dtypes: object(6)
memory usage: 4.2+ KB


In [116]:
#new df with only MCL Chapter in it.
MCL_Chapter = master_MCL[master_MCL.text.str.match(r"MCL Chapter")]

In [117]:
#check to verify that there are no more entries remaining in master_MCL.
MCL_remainder = master_MCL.drop((MCL_MCL|MCL_Chapter).index)
MCL_remainder

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [118]:
#This code is circuitous and I am sure there is a better way. Refactor if time permits.
#It is taking the tokenized words and putting them into a flat list, creating a dictionary of counts for each token,
#converting dictionary to a series and then to a df, resetting the index and adding column titles.
#The df is then sorted based on the "freq" column to observe the most common tokens.

MCL_list = MCL_MCL["tokenized_desc"].tolist()
#MCL_list
MCL_flat_list = [item for sublist in MCL_list for item in sublist]

MCL_kw_freq = dict(Counter(MCL_flat_list))
#MCL_kw_freq

MCL_kw_s = pd.Series(MCL_kw_freq, name="freq")

MCL_kw = pd.DataFrame(data = MCL_kw_s, columns = ["freq"])
MCL_kw = MCL_kw.reset_index()
MCL_kw.columns = ["kw", "freq"]
#MCL_kw.head()
MCL_kw_sorted = MCL_kw.sort_values(by = "freq", ascending = False)
MCL_kw_sorted.head()

Unnamed: 0,kw,freq
12,BB,51
127,permit,9
119,on,8
0,",",6
153,sales,6


In [119]:
#This list results from repetitively removing entries observing remaining rows based on strings in the "desc" column.
#Start with highest ranked kw and continue down the list until the list become manageable to look at each entry.
#Then add words on a case by case basis. Pay close attention to the words selected to minimize possible overlap with
#other categories.
#In the end this list will constitue a "category" and will be used to identift violations in the other df.
searchfor = [
    'BB ', 'consume', 'caterer', 'beer', 'moral', 'application', "issuance requirements", 
    "Distance", "patron", "attire", "privilege", "suspension", "reproductions", "exempt", 
    "liquor"]
MCL_drop = MCL_MCL.drop(MCL_MCL[MCL_MCL["desc"].str.contains('|'.join(searchfor), case = False)].index)
MCL_drop

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
735,BB708020,Sales without a permit,M.C.L 7.08.020: No person shall sell beer with...,,"[Sales, without, a, permit]","[M.C.L, 7.08.020, :, No, person, shall, sell, ..."
739,BB708030E,Special Event - sales on premises,M.C.L 7.08.030E: A retailer's special events p...,,"[Special, Event, -, sales, on, premises]","[M.C.L, 7.08.030E, :, A, retailer, 's, special..."


In [120]:
#Process for MCL_MCL is repeated here for MCL_Chapter. Since it is repetitive perhaps can create a function or loop here.
#Refactor if time permits.

Chapter_list = MCL_Chapter["tokenized_desc"].tolist()
Chapter_flat_list = [item for sublist in Chapter_list for item in sublist]


Chapter_kw_freq = dict(Counter(Chapter_flat_list))
#Chapter_kw_freq

Chapter_kw_s = pd.Series(Chapter_kw_freq, name="freq")

Chapter_kw = pd.DataFrame(data = Chapter_kw_s, columns = ["freq"])
Chapter_kw = Chapter_kw.reset_index()
Chapter_kw.columns = ["kw", "freq"]
#MCL_kw.head()
Chapter_kw_sorted = Chapter_kw.sort_values(by = "freq", ascending = False)
Chapter_kw_sorted.head()

Unnamed: 0,kw,freq
19,GARBAGE,5
39,SEWAGE,4
36,RAT,4
21,HARBORAGE,4
0,",",3


In [121]:
#MCL_Chapter[MCL_Chapter["desc"].str.contains("chicken", case = False)]

Junk = [
    "garbage", "sew", "vermin", "water", "plumbing", "excrement", 
    "litter", "junk", "waste", "dumping", "vegetation", "chickens"]
Chapter_drop = MCL_Chapter.drop(
    MCL_Chapter[MCL_Chapter["desc"].str.contains('|'.join(Junk), case = False)].index)
MCL_Chapter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 481 to 724
Data columns (total 6 columns):
type              25 non-null object
desc              25 non-null object
text              25 non-null object
remedial          0 non-null object
tokenized_desc    25 non-null object
tokenized_text    25 non-null object
dtypes: object(6)
memory usage: 2.0+ KB


In [122]:
#Begin work on remainder df.
master_remainder

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15,,N,(B). It is unlawful for any person to dispose ...,1,[N],"[(, B, ), ., It, is, unlawful, for, any, perso..."
438,,,he or she shall serve notice of the risk and t...,,[],"[he, or, she, shall, serve, notice, of, the, r..."
439,,N,tree at the expense of the property owner,2,[N],"[tree, at, the, expense, of, the, property, ow..."
707,,,"human occupation or use, upon failure or refus...",,[],"[human, occupation, or, use, ,, upon, failure,..."
708,,N,"remove or demolish, such dwelling or structure...",2,[N],"[remove, or, demolish, ,, such, dwelling, or, ..."
895,,N,4. A valid identification card showing a re...,10635,[N],"[4, ., A, valid, identification, card, showing..."
897,,,A. Department of Health,,[],"[A, ., Department, of, Health]"
898,,,B. Fire Marshal Approval,,[],"[B, ., Fire, Marshal, Approval]"
899,,,C. Certificate of Registration or Tax Exempt...,,[],"[C., Certificate, of, Registration, or, Tax, E..."
900,,,D. Letter from charitable organization (only ...,,[],"[D., Letter, from, charitable, organization, (..."


In [123]:
master_remainder.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205 entries, 15 to 933
Data columns (total 6 columns):
type              184 non-null object
desc              205 non-null object
text              205 non-null object
remedial          15 non-null object
tokenized_desc    205 non-null object
tokenized_text    205 non-null object
dtypes: object(6)
memory usage: 11.2+ KB


In [124]:
#Groups in remainder can be grouped by type column. Subset remainder based on first two letters of type column.
#BB appears to be similar to MCL_MCL. Append remainder_BB to MCL_MCL, then add to the list of search terms.
remainder_BB = master_remainder[master_remainder.type.str.match(r"BB*", na = False)]
remainder_BB
remainder_noBB = master_remainder.drop(master_remainder[master_remainder.type.str.match(r"BB*", na = False)].index)
#remainder_noBB

In [125]:
MCL_BB = MCL_MCL.append(remainder_BB)

In [126]:
#Find top ranked keywords from "desc" column to help find entries not captured by previous keyword list

remBB_list = remainder_BB["tokenized_desc"].tolist()
remBB_flat_list = [item for sublist in remBB_list for item in sublist]


remBB_kw_freq = dict(Counter(remBB_flat_list))
#Chapter_kw_freq

remBB_kw_s = pd.Series(remBB_kw_freq, name="freq")

remBB_kw = pd.DataFrame(data = remBB_kw_s, columns = ["freq"])
remBB_kw = remBB_kw.reset_index()
remBB_kw.columns = ["kw", "freq"]
#MCL_kw.head()
remBB_kw_sorted = remBB_kw.sort_values(by = "freq", ascending = False)
remBB_kw_sorted.head(10)

Unnamed: 0,kw,freq
31,No,4
19,Event,3
40,Special,3
0,",",2
12,Can,2
74,or,2
78,penalties,2
79,permit,2
80,permits,2
36,Permits,2


In [127]:
#More comprehensive list of keywords to capture more BB entries.
#Remaining entries pertain to permits, difficult to describe without much overlap.
#Perhaps should belong in another category about permits.

BeerBoard = [
    'BB ', 'consume', 'caterer', 'beer', 'moral', 'application', "issuance requirements", 
    "Distance", "patron", "attire", "privilege", "suspension", "reproductions", "exempt", 
    "liquor", "post laws", "license", "special event", "drive-through", "sealed containers", "under 18", "no delivery", 
    "owners and partners", "over 21", "scertain", "payment types", "health department", "fire marshal", 
    "sports authority", "llc", "past violations"]
MCLBB_drop = MCL_BB.drop(MCL_BB[MCL_BB["desc"].str.contains('|'.join(BeerBoard), case = False)].index)
MCLBB_drop

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
735,BB708020,Sales without a permit,M.C.L 7.08.020: No person shall sell beer with...,,"[Sales, without, a, permit]","[M.C.L, 7.08.020, :, No, person, shall, sell, ..."
891,BBR12,Permittee responsible,"In disciplinary proceedings, it shall be no de...",,"[Permittee, responsible]","[In, disciplinary, proceedings, ,, it, shall, ..."
892,BBR13,Lease or Ownership required,"When applying for an on or off-sale permit, al...",,"[Lease, or, Ownership, required]","[When, applying, for, an, on, or, off-sale, pe..."
909,BBR21,Surrender permits,"Permittees shall surrender their permits, in p...",,"[Surrender, permits]","[Permittees, shall, surrender, their, permits,..."
910,BBR22,Complaints filed in writing,All complaints requested to be heard by the Me...,,"[Complaints, filed, in, writing]","[All, complaints, requested, to, be, heard, by..."
911,BBR24,Old or New Location,An old location is any location that has had a...,,"[Old, or, New, Location]","[An, old, location, is, any, location, that, h..."
915,BBR28,Cannot surrender to avoid penalties,A permit holder may not avoid a hearing or pro...,,"[Can, not, surrender, to, avoid, penalties]","[A, permit, holder, may, not, avoid, a, hearin..."
916,BBR29,Cannot temporary permits with complaints,Temporary permits shall not be issued by the E...,,"[Can, not, temporary, permits, with, complaints]","[Temporary, permits, shall, not, be, issued, b..."
918,BBR30,Grandfathered,1. The grandfather clause of Ordinance 092-246...,,[Grandfathered],"[1, ., The, grandfather, clause, of, Ordinance..."
921,BBR31,Expanded premises,No permittee shall expand the premises for whi...,,"[Expanded, premises]","[No, permittee, shall, expand, the, premises, ..."


In [128]:
#Begin processing entries that begin with CA
#Some of these might belong in MCL_Chapter (Garbage, Trash, Vermin, Sewage). 
remainder_CA = remainder_noBB[remainder_noBB.type.str.match(r"CA*", na = False)]
remainder_CA
remainder_noCA = remainder_noBB.drop(remainder_CA.index)
#remainder_noCA

In [129]:
#Top ranked keywords for CA.
#The keywords seem diverse so print whole list.

remCA_list = remainder_CA["tokenized_desc"].tolist()
remCA_flat_list = [item for sublist in remCA_list for item in sublist]


remCA_kw_freq = dict(Counter(remCA_flat_list))
#Chapter_kw_freq

remCA_kw_s = pd.Series(remCA_kw_freq, name="freq")

remCA_kw = pd.DataFrame(data = remCA_kw_s, columns = ["freq"])
remCA_kw = remCA_kw.reset_index()
remCA_kw.columns = ["kw", "freq"]
#MCL_kw.head()
remCA_kw_sorted = remCA_kw.sort_values(by = "freq", ascending = False)
remCA_kw_sorted

Unnamed: 0,kw,freq
216,THE,35
2,",",21
238,WATER,17
144,ON,14
94,FOR,14
59,DEFECTIVE,13
13,AND,13
139,MISSING,12
66,DRAIN,11
58,DAMAGED,11


In [130]:
#CA keywords.

#remainder_CA.info()
#remainder_CA[remainder_CA["desc"].str.contains("missing", case = False)]
ExteriorRepair = [
    "water", "defective", "damaged", "drain", "vent", "wash", "trap", "electric", "outlet", "conduit", "wir", 
    "switch", "knob", "fuse", "porch", "cords", "junction", "flush", "cover", "window", "seal", "missing", "wall", 
    "floor", "ceiling", "roof", "plumbing", "stair", "handrail", "paint", "repair", "heat", "detector", "bath", 
    "kitchen", "chimney", "overhaul", "exhaust"]
remCA_drop = remainder_CA.drop(
    remainder_CA[remainder_CA["desc"].str.contains('|'.join(ExteriorRepair), case = False)].index)
remCA_drop

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
215,CAAA_BOAT,BOAT STORAGE IN FRONT YARD PROHIBITED,No trailer or watercraft shall be stored in th...,,"[BOAT, STORAGE, IN, FRONT, YARD, PROHIBITED]","[No, trailer, or, watercraft, shall, be, store..."
226,CAAA_LANDLORD_REGSTR,LANDLORD REGISTRATION,Failure to register as a landlord. T.C.A. 66-...,,"[LANDLORD, REGISTRATION]","[Failure, to, register, as, a, landlord, ., T...."
236,CAAA_TRASH_CANS,TRASH CANS,Ord. 89-826 - Trash Cans: Refuse containers sh...,,"[TRASH, CANS]","[Ord, ., 89-826, -, Trash, Cans, :, Refuse, co..."
350,CAAH_TRASH_CANS,TRASH CANS,Ordinance 89-826 - Trash Cans: Refuse containe...,,"[TRASH, CANS]","[Ordinance, 89-826, -, Trash, Cans, :, Refuse,..."
380,CAAZ_CERT_COMPLY,CERTIFICATE COMPLIANCE REQ,"Following issuance of a zoning permit, no stru...",,"[CERTIFICATE, COMPLIANCE, REQ]","[Following, issuance, of, a, zoning, permit, ,..."
394,CAAZ_MVEN_VENDING,MOBILE VENDING,Ordinance BL2006-1283 - Mobile Vendor: The ped...,,"[MOBILE, VENDING]","[Ordinance, BL2006-1283, -, Mobile, Vendor, :,..."
520,CAGZ_OTH,OTHER VIOLATIONS,Other:,,"[OTHER, VIOLATIONS]","[Other, :]"
523,CALL_LANDLORD_REGSTR,LANDLORD REGISTRATION,Failure to register as a landlord. T.C.A. 66-...,,"[LANDLORD, REGISTRATION]","[Failure, to, register, as, a, landlord, ., T...."
658,CAST23,"UNDERPINNING, HATCHWAY, EXTERIOR DOORS","-provide (underpinning, hatchway, exterior doo...",,"[UNDERPINNING, ,, HATCHWAY, ,, EXTERIOR, DOORS]","[-provide, (, underpinning, ,, hatchway, ,, ex..."
692,CAST42,"RODENTS, INFESTATION AND VERMIN","-Rid premises of rodents, infestation and verm...",,"[RODENTS, ,, INFESTATION, AND, VERMIN]","[-Rid, premises, of, rodents, ,, infestation, ..."


In [131]:
#Some entries seem to pertain to MCL_Chapter. Append dfs
CA_append_list = ["remove", "vermin", "garbage", "trash", "storage", "yard"]
CA_append = remCA_drop[remCA_drop["desc"].str.contains('|'.join(CA_append_list), case = False)]

Chapter_CA = MCL_Chapter.append(CA_append)
#Chapter_CA

In [132]:
#Investigate HD subset.
#These might be duplicates for MCL_Chapter or remainder_CA. Append to Chapter_CA.
remainder_HD = remainder_noCA[remainder_noCA.type.str.match(r"HD*", na = False)]
#remainder_HD
remainder_noHD = remainder_noCA.drop(remainder_HD.index)
#remainder_noHD
Chapter_CA_HD = Chapter_CA.append(remainder_HD)
#Chapter_CA_HD

In [134]:
#Find keywords describe Chapter_CA_HD. Start with 
Junk = [
    "garbage", "sew", "vermin", "water", "plumbing", "excrement", "animal", "lumber", "remove", 
    "litter", "junk", "waste", "dumping", "vegetation", "chickens", "trash", "storage", "yard"]
Chapter_CA_HD_drop = Chapter_CA_HD.drop(
    remainder_HD[remainder_HD["desc"].str.contains('|'.join(Junk), case = False)].index)
Chapter_CA_HD_drop
#Remaining entry contains a typo. AMINAL should be ANIMAL.

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
481,CAGA_DEX,DOG EXCREMENT,MCL Chapter 8.04.180 – Removal of Excrement. ...,,"[DOG, EXCREMENT]","[MCL, Chapter, 8.04.180, –, Removal, of, Excre..."
482,CAGB_HAZ_W,HAZARDOUS WASTE,"MCL Chapter 10.20.150 - Hazardous, pathogenic ...",,"[HAZARDOUS, WASTE]","[MCL, Chapter, 10.20.150, -, Hazardous, ,, pat..."
483,CAGC_GCL,GARBAGE CONTAINER LIDS,MCL Chapter 10.20.160 – Container Requirements...,,"[GARBAGE, CONTAINER, LIDS]","[MCL, Chapter, 10.20.160, –, Container, Requir..."
484,CAGD_ID,ILLEGAL DUMPING,MCL Chapter 10.20.320 – Illegal Dumping. It i...,,"[ILLEGAL, DUMPING]","[MCL, Chapter, 10.20.320, –, Illegal, Dumping,..."
485,CAGE_LPR,LITTER ON PRIVATE PROPERTY,MCL Chapter 10.24.070 - Litter on private prop...,,"[LITTER, ON, PRIVATE, PROPERTY]","[MCL, Chapter, 10.24.070, -, Litter, on, priva..."
486,CAGF_LPU,LITTER ON PUBLIC PROPERTY,MCL Chapter 10.24.140 - Litter on vacant lots....,,"[LITTER, ON, PUBLIC, PROPERTY]","[MCL, Chapter, 10.24.140, -, Litter, on, vacan..."
487,CAGG_LVH,LITTER FROM VEHICLES,MCL Chapter 10.24.150 - Litter from vehicles--...,,"[LITTER, FROM, VEHICLES]","[MCL, Chapter, 10.24.150, -, Litter, from, veh..."
488,CAGH_HWG,EXCESS VEGETATION,MCL Chapter 10.26.010 – Every premise includin...,,"[EXCESS, VEGETATION]","[MCL, Chapter, 10.26.010, –, Every, premise, i..."
489,CAGI_GCAN,GARBAGE APPROVED CONTAINERS,MCL Chapter 10.32.120 - All garbage and trash ...,,"[GARBAGE, APPROVED, CONTAINERS]","[MCL, Chapter, 10.32.120, -, All, garbage, and..."
490,CAGJ_GT_RATS,RAT HARBORAGE GARBAGE,MCL Chapter 10.32.130 – Permitting garbage and...,,"[RAT, HARBORAGE, GARBAGE]","[MCL, Chapter, 10.32.130, –, Permitting, garba..."


In [135]:
#SW appears to be final group. Subset and define with keywords.
remainder_SW = remainder_noBB[remainder_noBB.type.str.match(r"SW*", na = False)]
#remainder_SW
remainder_noSW = remainder_noHD.drop(remainder_SW.index)
#The remaining entries I think are part of other entries, duplicates, or headings. No real remaining entries left here.
remainder_noSW

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15,,N,(B). It is unlawful for any person to dispose ...,1,[N],"[(, B, ), ., It, is, unlawful, for, any, perso..."
438,,,he or she shall serve notice of the risk and t...,,[],"[he, or, she, shall, serve, notice, of, the, r..."
439,,N,tree at the expense of the property owner,2,[N],"[tree, at, the, expense, of, the, property, ow..."
707,,,"human occupation or use, upon failure or refus...",,[],"[human, occupation, or, use, ,, upon, failure,..."
708,,N,"remove or demolish, such dwelling or structure...",2,[N],"[remove, or, demolish, ,, such, dwelling, or, ..."
895,,N,4. A valid identification card showing a re...,10635,[N],"[4, ., A, valid, identification, card, showing..."
897,,,A. Department of Health,,[],"[A, ., Department, of, Health]"
898,,,B. Fire Marshal Approval,,[],"[B, ., Fire, Marshal, Approval]"
899,,,C. Certificate of Registration or Tax Exempt...,,[],"[C., Certificate, of, Registration, or, Tax, E..."
900,,,D. Letter from charitable organization (only ...,,[],"[D., Letter, from, charitable, organization, (..."


In [137]:
#remainder_SW[remainder_SW["desc"].str.contains("treatment", case = False)]

Environment = [
    "post construction", "quality", "environment", "Construction Activit"]
remSW_drop = remainder_SW.drop(
    remainder_SW[remainder_SW["desc"].str.contains('|'.join(Environment), case = False)].index)
remSW_drop

Unnamed: 0_level_0,type,desc,text,remedial,tokenized_desc,tokenized_text
Original_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


Here are the final categories, along with keywords to describe all entries contained within, I have defined from the data I observed:

BeerBoard = [
    'BB ', 'consume', 'caterer', 'beer', 'moral', 'application', "issuance requirements", 
    "Distance", "patron", "attire", "privilege", "suspension", "reproductions", "exempt", 
    "liquor", "post laws", "license", "special event", "drive-through", "sealed containers", "under 18", 
    "no delivery", "owners and partners", "over 21", "scertain", "payment types", "health department", "fire marshal", 
    "sports authority", "llc", "past violations"]

BuildingRepair = [
    "water", "defective", "damaged", "drain", "vent", "wash", "trap", "electric", "outlet", "conduit", "wir", 
    "switch", "knob", "fuse", "porch", "cords", "junction", "flush", "cover", "window", "seal", "missing", "wall", 
    "floor", "ceiling", "roof", "plumbing", "stair", "handrail", "paint", "repair", "heat", "detector", "bath", 
    "kitchen", "chimney", "overhaul", "exhaust", "wood"]

Garbage = [
    "garbage", "sew", "vermin", "water", "plumbing", "excrement", "animal", "lumber", "remove", 
    "litter", "junk", "waste", "dumping", "vegetation", "chickens", "trash", "storage", "yard"]

Environment = [
    "post construction", "quality", "environment", "Construction Activit"]