In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import Normalizer

### Project Outline - Shared Document
https://docs.google.com/document/d/1gZ5QkT95D7WJ8sPUyC0qURUVcKZQrPub2nR4gyRYPkc/edit?usp=sharing

### Exploratory Data Analysis

In [2]:
# Load the hate crime dataset from the CSV file in the working directory.
hate_crime = pd.read_csv('hate_crime.csv')

# Report the (rows, columns) shape, then preview the first five records.
print(hate_crime.shape)
hate_crime.head()

FileNotFoundError: [Errno 2] No such file or directory: 'hate_crime.csv'

In [None]:
# Summarize each column's dtype and non-null count to spot missing data.
hate_crime.info()

In [None]:
# Year range covered by the dataset: latest and earliest DATA_YEAR.
# Aggregations are named by string rather than passed as np.max / np.min,
# because handing NumPy callables to .agg is deprecated in recent pandas.
hate_crime['DATA_YEAR'].agg(['max', 'min'])

In [None]:
# Build a code -> description lookup for population groups, to use later
# (POPULATION_GROUP_DESC is dropped from hate_crime during preprocessing).
group_cols = ['POPULATION_GROUP_CODE', 'POPULATION_GROUP_DESC']
population_groups = hate_crime[group_cols].value_counts()

# One row per distinct (code, description) pair, ordered by code.
population_groups_df = (
    pd.DataFrame(population_groups)
    .sort_values('POPULATION_GROUP_CODE')
    .reset_index()
    .loc[:, group_cols]
)
population_groups_df

In [None]:
## Location / Population Data Exploration
# STATE_ABBR holds labels (it is listed among the columns to encode later), so
# the number of records per state is drawn as a bar chart. A histogram with the
# default 10 bins would pool several states into each bar.
# plt.subplots is used so that `fig` is a real Figure; plt.plot() with no
# arguments returns an empty list.
fig, ax = plt.subplots(figsize=(10, 10))
hate_crime['STATE_ABBR'].value_counts().plot.bar(ax=ax, rot=90)
ax.set_xlabel('STATE_ABBR')
ax.set_ylabel('Number of records')
fig.suptitle('STATE_ABBR')
plt.show()


# One histogram per location/population column, laid out on a 2x2 grid
# (panels are filled row by row, in the order listed).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,20))

location_columns = ['DATA_YEAR', 'POPULATION_GROUP_CODE', 'DIVISION_NAME', 'REGION_NAME']
for panel, column_name in zip(axes.flat, location_columns):
    hate_crime[column_name].hist(ax=panel)
    # Make the panel current so the pyplot tick/title calls apply to it.
    plt.sca(panel)
    plt.xticks(rotation='vertical', fontsize=18)
    plt.title(column_name, fontsize=20)

plt.show()


# Observations - States, Regions and Divisions are likely skewed by population;
# look at the correlation with the population descriptions.
# US Territories and Other can be dropped from Region/Division due to small sample size.

In [None]:
# OFFENDER / VICTIM DATA EXPLORATION
# One histogram per offender/victim column on a 3x2 grid (filled row by row).

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20,20))

offender_victim_columns = [
    'TOTAL_OFFENDER_COUNT', 'VICTIM_COUNT',
    'MULTIPLE_OFFENSE', 'MULTIPLE_BIAS',
    'OFFENDER_RACE', 'TOTAL_INDIVIDUAL_VICTIMS',
]
for panel, column_name in zip(axes.flat, offender_victim_columns):
    hate_crime[column_name].hist(ax=panel)
    # Make the panel current so the pyplot tick/title calls apply to it.
    plt.sca(panel)
    plt.xticks(rotation='vertical', fontsize=18)
    plt.title(column_name, fontsize=20)


plt.show()

## Observations - Offender and victim counts that are much larger than average should be considered outliers and removed.
# Multiple Offense/Multiple Bias have too few observations in the multiple category & should be removed.
# TOTAL_INDIVIDUAL_VICTIMS and VICTIM_COUNT have conflicting meanings; VICTIM_COUNT seems more accurate, more exploration needed.

In [None]:
# VICTIM_TYPES holds labels (it is listed among the columns to encode later), so
# the number of records per label is drawn as a bar chart. A histogram with the
# default 10 bins would pool several labels into each bar.
# plt.subplots is used so that `fig` is a real Figure; plt.plot() with no
# arguments returns an empty list.
fig, ax = plt.subplots(figsize=(10, 10))
hate_crime['VICTIM_TYPES'].value_counts().plot.bar(ax=ax, rot=90, fontsize=10)
ax.set_xlabel('VICTIM_TYPES')
ax.set_ylabel('Number of records')
fig.suptitle('VICTIM_TYPES')
plt.show()

## Observations - The vast majority of categories for VICTIM_TYPE, LOCATION NAME and BIAS DESCRIPTION
# are redundant, many categories can be combined into the top 10-15 for each column

In [None]:
# Side-by-side histograms for the three high-cardinality label columns.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20,20))

label_columns = ['VICTIM_TYPES', 'LOCATION_NAME', 'BIAS_DESC']
for panel, column_name in zip(axes, label_columns):
    hate_crime[column_name].hist(ax=panel)
    # Make the panel current so the pyplot tick/title calls apply to it.
    plt.sca(panel)
    plt.xticks(rotation='vertical', fontsize=18)
    plt.title(column_name, fontsize=20)

plt.show()

## Observations - The vast majority of categories for VICTIM_TYPE, LOCATION NAME and BIAS DESCRIPTION
# are redundant, many categories can be combined into the top 10-15 for each column

In [None]:
# Load the political climate dataset from political_climate.csv (expected in
# the working directory) and display it.
political = pd.read_csv('political_climate.csv')
political

### Preprocessing

In [None]:
# Remove duplicate columns (e.g. the state name) and unnecessary ones
# (e.g. the agency name) before further preprocessing.
redundant_columns = [
    'STATE_NAME', 'POPULATION_GROUP_DESC', 'PUB_AGENCY_UNIT',
    'ORI', 'PUB_AGENCY_NAME', 'AGENCY_TYPE_NAME',
    'TOTAL_INDIVIDUAL_VICTIMS', 'DIVISION_NAME', 'INCIDENT_ID',
    'MULTIPLE_OFFENSE', 'MULTIPLE_BIAS',
]
hate_crime = hate_crime.drop(columns=redundant_columns)
hate_crime.head(3)

In [None]:
# Percentage of missing values per column, shown from least to most missing.
percent_missing = hate_crime.isnull().sum() *100/len(hate_crime)
missing_values_df = pd.DataFrame(
    {'column_name': hate_crime.columns, 'percent_missing': percent_missing}
).sort_values('percent_missing')
display(missing_values_df)

# Drop sparse columns (threshold: perc = 70% missing). dropna keeps a column
# only when it has at least `min_count` non-null values.
perc = 70.0
keep_fraction = (100-perc)/100
min_count = int(keep_fraction*hate_crime.shape[0]+1)
hate_crime = hate_crime.dropna(axis='columns', thresh=min_count)

# Missing values still present in the remaining columns.
hate_crime.isnull().sum()

In [None]:
# Fill the gaps in OFFENDER_RACE with an explicit 'Unknown' label.
offender_race = hate_crime['OFFENDER_RACE']

# Distinct values before the replacement (NaN shows up here if present).
unique_race_cat = offender_race.unique()
print(unique_race_cat)

# Swap NaN for the 'Unknown' label and confirm the resulting set of values.
hate_crime['OFFENDER_RACE'] = offender_race.replace(np.nan, 'Unknown')
hate_crime['OFFENDER_RACE'].unique()

In [None]:
# Count the null values left in each column; every count should be 0 if the
# preceding cleaning steps removed all missing data.
hate_crime.isnull().sum()

### Transforming Datatypes

In [None]:
# Parse INCIDENT_DATE into pandas datetimes, then list the column dtypes to
# confirm the conversion.
hate_crime["INCIDENT_DATE"] = pd.to_datetime(hate_crime["INCIDENT_DATE"])
hate_crime.dtypes

### Reducing Categories

In [None]:
# Print the distinct values of each column whose categories will be reduced,
# with a blank line between columns.
category_columns = ['VICTIM_TYPES', 'LOCATION_NAME', 'BIAS_DESC']
print(*(hate_crime[name].unique() for name in category_columns), sep='\n\n')

In [None]:
# Condense VICTIM_TYPES: a value matching the regex `.*<label>.*` is rewritten
# to `<label>`. The labels are listed in the order their patterns are applied.
victim_labels = [
    'Law Enforcement Officer',
    'Religious Organization',
    'Business',
    'Government',
    'Individual',
    'Society/Public',
]
replacements = {'VICTIM_TYPES': {rf'.*{label}.*': label for label in victim_labels}}
hate_crime = hate_crime.replace(replacements, regex=True)

# Frequency of each condensed label.
hate_crime['VICTIM_TYPES'].value_counts()

In [None]:
# Condense LOCATION_NAME: each regex below maps any value it matches to a
# single label. Patterns are listed in the order they are applied.
location_patterns = {
    r'.*Highway/Road/Alley/Street/Sidewalk.*': 'Highway/Road/Alley/Street/Sidewalk',
    r'.*College.*': 'School-College/University',
    r'.*Residence/Home.*': 'Residence/Home',
    r'.*Drug Store/Doctor.*': 'Drug Store/Doctor',
    r'.*Commercial/Office Building.*': 'Commercial/Office Building',
    r'.*Restaurant.*': 'Restaurant',
    r'.*Government/Public Building.*': 'Government/Public Building',
    r'.*Grocery/Supermarket.*': 'Grocery/Supermarket',
    r'.*Parking/Drop Lot/Garage.*': 'Parking/Drop Lot/Garage',
    r'.*Jail/Prison/Penitentiary/Corrections Facility.*': 'Jail/Prison/Penitentiary/Corrections Facility',
    r'.*School-Elementary/Secondary.*': 'School-Elementary/Secondary',
    r'.*Church/Synagogue/Temple/Mosque.*': 'Church/Synagogue/Temple/Mosque',
    r'.*Amusement Park.*': 'Amusement Park',
    r'.*Bar/Nightclub.*': 'Bar/Nightclub',
    r'.*Air/Bus/Train Terminal.*': 'Air/Bus/Train Terminal',
    r'.*Department/Discount Store.*': 'Department/Discount Store',
    r'.*Auto Dealership New/Used.*': 'Auto Dealership New/Used',
}
replacements = {'LOCATION_NAME': location_patterns}
hate_crime = hate_crime.replace(replacements, regex=True)

# Frequency of each condensed label.
hate_crime['LOCATION_NAME'].value_counts()

In [None]:
# Condense BIAS_DESC: each regex below maps any value it matches to a single
# label. Patterns are listed in the order they are applied, so for a value that
# names several biases the earliest matching pattern decides the label.
#
# The Anti-Bisexual pattern must end in `.*` like the others. With `,*` it only
# rewrites the text up to "Anti-Bisexual" and leaves the rest of the value in
# place, letting a later pattern claim the row. The key is listed once.
replacements = {'BIAS_DESC':{r'.*Anti-Black.*':'Anti-Black or African American', 
                             r'.*Anti-Jewish.*': 'Anti-Jewish', 
                             r'.*Anti-Gay.*': 'Anti-Gay (Male)',
                             r'.*Anti-Lesbian.*': 'Anti-Lesbian (Female)', 
                             r'.*Anti-Islamic.*': 'Anti-Islamic (Muslim)',
                             r'.*Anti-Hispanic.*': 'Anti-Hispanic or Latino',
                             r'.*Anti-Transgender.*': 'Anti-Transgender', 
                             r'.*Anti-Gender Non-Conforming.*': 'Anti-Gender Non-Conforming',
                             r'.*Anti-Asian.*': 'Anti-Asian',
                             r'.*Anti-Bisexual.*': 'Anti-Bisexual',
                             r'.*Anti-American Indian.*': 'Anti-Native American',
                             r'.*Anti-Mental Disability.*': 'Anti-Mental Disability',
                             r'.*Anti-Physical Disability.*': 'Anti-Physical Disability',
                             r'.*Anti-Other Religion.*': 'Anti-Other Religion', 
                             r'.*Anti-Multiple Races, Group.*': 'Anti-Multiple Races, Group', 
                             r'.*Anti-Hindu.*': 'Anti-Hindu', 
                             r'.*Anti-Catholic.*': 'Anti-Catholic', 
                             r'.*Anti-Arab.*': 'Anti-Arab', 
                             r'.*Anti-Jehovah.*': 'Anti-Jehovahs Witness', 
                             r'.*Anti-White.*': 'Anti-White',
                             r'.*Anti-Multiple Religions.*': 'Anti-Multiple Religions',
                             r'.*Anti-Protestant.*': 'Anti-Protestant',
                             r'.*Anti-Native Hawaiian.*': 'Anti-Native Hawaiian or Other Pacific Islander',
                             r'.*Anti-Female.*': 'Anti-Female'
                            }}
hate_crime = hate_crime.replace(replacements, regex=True)

# Frequency of each condensed label.
hate_crime['BIAS_DESC'].value_counts()

### Categorical Encoding

In [None]:
# To be encoded:
# STATE_ABBR, REGION_NAME, POPULATION_GROUP_CODE, OFFENDER_RACE, 
# OFFENSE_NAME, LOCATION_NAME, BIAS_DESC, VICTIM_TYPES

In [None]:
# List every column whose dtype is still `object`, together with that dtype.
for col in hate_crime.columns:
    col_dtype = hate_crime[col].dtypes
    if col_dtype == object:
        print(col, col_dtype)

In [None]:
# Label-encode every remaining object column as integer category codes.
# The code -> label lookup of each column is kept in `category_labels` so the
# encoded values can still be decoded afterwards. pandas assigns the code -1
# to missing values, which therefore have no entry in the lookup.
# The column list is computed up front so the frame is not modified while it
# is being iterated.
category_labels = {}
object_columns = [name for name in hate_crime if hate_crime[name].dtypes == object]
for col in object_columns:
    as_category = hate_crime[col].astype('category')
    category_labels[col] = dict(enumerate(as_category.cat.categories))
    hate_crime[col] = as_category.cat.codes

In [None]:
# Show the dtype of every column to confirm the result of the encoding step.
hate_crime.dtypes

### Normalizing

In [None]:
# Summary statistics of the numeric columns, to compare their scales.
hate_crime.describe()

In [None]:
#hate_crime_backup_df = hate_crime

In [None]:
#hate_crime.drop('INCIDENT_DATE', axis=1, inplace=True)

In [None]:
#normalized = Normalizer().fit_transform(hate_crime)

#turn the output array into a dataframe
#normalized_df = pd.DataFrame(normalized, columns = hate_crime_backup_df.columns)
#normalized_df

In [None]:
#normalized_df.describe()