In [126]:
import pandas as pd

# Load the violations export and summarise every column, numeric or not.
df = pd.read_csv(filepath_or_buffer="../data/violations.csv")
df.describe(include="all")

Unnamed: 0,case_no,status_dttm,status,code,value,description,violation_stno,violation_sthigh,violation_street,violation_suffix,...,ward,contact_addr1,contact_addr2,contact_city,contact_state,contact_zip,sam_id,latitude,longitude,location
count,24340,24340,24340,24340.0,0.0,24118,24340.0,4998.0,24340,24152,...,24340.0,24263,4334,24275,24271,24251.0,24340.0,24340.0,24340.0,24340
unique,17950,17940,2,586.0,,492,1452.0,810.0,1978,38,...,23.0,13143,2094,1076,124,1576.0,,,,11395
top,HVIOL-650250,2022-12-14 14:57:06,Closed,105.1,,Failure to Obtain Permit,9.0,,Washington,ST,...,14.0,546 East Broadway,Unit 1,Boston,MA,2128.0,,,,"(42.26731000042018, -71.10906000146936)"
freq,36,36,17801,3431.0,,3431,394.0,1208.0,588,17289,...,2360.0,77,143,2442,21878,1672.0,,,,47
mean,,,,,,,,,,,...,,,,,,,141599.583155,42.321076,-71.082249,
std,,,,,,,,,,,...,,,,,,,101801.421976,0.033561,0.032181,
min,,,,,,,,,,,...,,,,,,,22.0,42.2321,-71.18183,
25%,,,,,,,,,,,...,,,,,,,63883.5,42.29507,-71.0975,
50%,,,,,,,,,,,...,,,,,,,118380.0,42.31882,-71.078661,
75%,,,,,,,,,,,...,,,,,,,199384.0,42.348986,-71.061391,


In [None]:
# Preview the first ten rows of the raw data.
df.iloc[:10]

In [None]:
# Count rows that are exact repeats of an earlier row.
# `duplicates` keeps only the flagged (True) entries of the duplicated() mask.
duplicates = df.duplicated().loc[lambda is_dup: is_dup]
duplicates.shape[0]

In [None]:
# Group the records by violation code and show summary statistics per group.
groupby_code = df.groupby(by="code")
groupby_code.describe()

In [127]:
# Hazardous conditions of interest, hard-coded as
# {human-readable condition name: violation code substring}.
violations = dict(
    [
        ("Hot Water", "410.190"),
        ("Potable Water", "410.180"),
        ("Heating Facilities Required", "410.200"),
        ("Temperature Requirements", "410.201"),
        ("Asbestos Material", "410.353"),
        ("Use of Lead Paint Prohibited", "410.502"),
    ]
)

In [128]:
# Convert to lower case so that the same address/city written with different
# capitalisation is not counted as separate entries.
# The vectorised .str.lower() keeps missing values as NaN; the previous
# str(x).lower() turned them into the literal string 'nan', which was then
# counted as if it were a real address/city by value_counts().
df['contact_addr1'] = df['contact_addr1'].str.lower()
df['contact_city'] = df['contact_city'].str.lower()

# Build the grouping only after normalisation so its keys are the lower-cased addresses.
groupby_address = df.groupby('contact_addr1')


In [None]:
import matplotlib.pyplot as plt
# The counts returned by value_counts() are converted to a DataFrame, and the
# DataFrame for each violation is stored in a dictionary.
violations_by_building = {}  # key: violation name, value: df with the location and frequency of the violations

# Go through each of the hazardous-condition violations
for name, code in violations.items():
    # Values in the 'code' column can carry text beyond the bare code number,
    # so match by substring. regex=False makes the match literal: the codes
    # contain '.', which as a regex would match any character.
    mask = df['code'].str.contains(code, case=False, na=False, regex=False)

    # Count the number of violations for each address
    count = df.loc[mask, 'contact_addr1'].value_counts()

    # Convert the value counts into a new DataFrame to save the data
    df_violation = pd.DataFrame(count).reset_index()
    df_violation.columns = ['Location', 'Counts']
    violations_by_building[name] = df_violation

    if count.empty:
        # No matching records: there is nothing to draw, so skip the chart
        # rather than hand an empty Series to the bar plot.
        continue

    # Plot the count
    count.plot(kind='bar', rot=0)

    plt.locator_params(axis='x', nbins=10)
    plt.xticks(rotation='vertical')

    plt.title(name)
    plt.xlabel('Building Location')
    plt.ylabel('Number of Violations')
    plt.show()

    # TODO: Save counts and use that to create a ranking
  

In [None]:
# Plot tables for each violation with frequency by building.
# The loop variable is deliberately NOT called `df`: reusing that name would
# overwrite the violations DataFrame that later cells still read.
for violation, table_df in violations_by_building.items():
    if table_df.empty:
        # No rows for this violation, so there is no table to render.
        continue

    fig, ax = plt.subplots()
    ax.axis('off')
    ax.axis('tight')
    # The title's y position grows with the row count so it stays above the
    # table (0.037 per row is an empirical spacing factor).
    ax.set_title(violation, fontsize=8, y=(0.037 * len(table_df.values)), pad=-14)
    t = ax.table(cellText=table_df.values, colWidths=[0.4] * len(table_df.columns),
                 colLabels=table_df.columns, loc='center')
    # Use a fixed font size instead of matplotlib's automatic sizing.
    t.auto_set_font_size(False)
    t.set_fontsize(8)
    #fig.tight_layout()
    plt.show()
    
#TODO: fix formatting issues with tables

In [None]:
#Use city and create table
import matplotlib.pyplot as plt
# The counts returned by value_counts() are converted to a DataFrame, and the
# DataFrame for each violation is stored in a dictionary.
violations_by_city = {}  # key: violation name, value: df with the city and frequency of the violations

# Go through each of the hazardous-condition violations
for name, code in violations.items():
    # Values in the 'code' column can carry text beyond the bare code number,
    # so match by substring. regex=False makes the match literal: the codes
    # contain '.', which as a regex would match any character.
    mask = df['code'].str.contains(code, case=False, na=False, regex=False)

    # Count the number of violations for each city
    count = df.loc[mask, 'contact_city'].value_counts()

    # Convert the value counts into a new DataFrame to save the data
    df_violation = pd.DataFrame(count).reset_index()
    df_violation.columns = ['Location', 'Counts']
    # Store under the per-city dictionary. This previously wrote to
    # violations_by_building, which left violations_by_city empty and
    # overwrote the per-building results.
    violations_by_city[name] = df_violation

    if count.empty:
        # No matching records: there is nothing to draw, so skip the chart
        # rather than hand an empty Series to the bar plot.
        continue

    # Plot the count
    count.plot(kind='bar', rot=0)

    plt.locator_params(axis='x', nbins=10)
    plt.xticks(rotation='vertical')

    plt.title(name)
    plt.xlabel('City')
    plt.ylabel('Number of Violations')
    plt.show()


In [136]:
# Plot tables for each violation with frequency by city.
# The loop variable is deliberately NOT called `df`: reusing that name would
# overwrite the violations DataFrame, breaking any cell re-run afterwards.
for violation, table_df in violations_by_city.items():
    if table_df.empty:
        # No rows for this violation, so there is no table to render.
        continue

    fig, ax = plt.subplots()
    ax.axis('off')
    ax.axis('tight')
    # The title's y position grows with the row count so it stays above the
    # table (0.037 per row is an empirical spacing factor).
    ax.set_title(violation, fontsize=8, y=(0.037 * len(table_df.values)), pad=-14)
    t = ax.table(cellText=table_df.values, colWidths=[0.4] * len(table_df.columns),
                 colLabels=table_df.columns, loc='center')
    # Use a fixed font size instead of matplotlib's automatic sizing.
    t.auto_set_font_size(False)
    t.set_fontsize(8)
    #fig.tight_layout()
    plt.show()
    


