# This is an analysis of NYC OpenData's "Inmate Discharges" dataset. 
### Disclaimer: Does not included Sealed cases. 
dataset can be found here: https://data.cityofnewyork.us/Public-Safety/Inmate-Discharges/94ri-3ium

help with querying: https://support.socrata.com/hc/en-us/articles/202949268-How-to-query-more-than-1000-rows-of-a-dataset ; https://docs.python-requests.org/en/latest/

My goal is to predict how long inmates are held based on the other features in the dataset. I will define "Time Held" as (DISCHARGED_DT – ADMITTED_DT)

Normalizes the data, tries different models on it. 

In [8]:
'''
Note: inmate_status_code meanings
    CS= City Sentenced
    CSP= City Sentenced - with VP Warrant
    DE= Detainee
    DEP= Detainee - with Open Case & VP Warrant
    DNS= Detainee- Newly Sentenced to State Time
    DPV= Detainee- Technical Parole Violator
    SCO= State Prisoner- Court Order
    SSR= State Ready
'''

'\nNote: inmate_status_code meanings\n    CS= City Sentenced\n    CSP= City Sentenced - with VP Warrant\n    DE= Detainee\n    DEP= Detainee - with Open Case & VP Warrant\n    DNS= Detainee- Newly Sentenced to State Time\n    DPV= Detainee- Technical Parole Violator\n    SCO= State Prisoner- Court Order\n    SSR= State Ready\n'

In [9]:
'''Building a Pipeline'''

# one hot encode categorical variables
# do that in the pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# get the data through the API
import pandas as pd
import requests

# takes url of the dataset, returns a Pandas DataFrame of the dataset
def getNYCdata(url):
    key = 'g2ZZTkxY9ZQEVVddSnAwJvTdO' # TO DO: this should be secret
    response = requests.get(url, headers={'X-App-Token': key})
    data = response.text
    df = pd.read_json(data)
    return df
    
# Inmate Discharges
url = 'https://data.cityofnewyork.us/resource/94ri-3ium.json?$limit=50000&$offset=50000'
discharges = getNYCdata(url)

# create time held metric
discharges['time_held'] = pd.to_datetime(discharges['discharged_dt'])-pd.to_datetime(discharges['admitted_dt'])

# drop columns that I don't need
discharges = discharges.drop('top_charge', axis=1)
discharges = discharges.dropna()
# drop duplicates
discharges = discharges.drop_duplicates(subset=['inmateid'])

discharges = discharges.set_index('inmateid')
# discharges = discharges.drop('inmateid', axis=1)

# add columns with time of day, time of week admitted/discharged
discharges['admitted_dt'] = pd.to_datetime(discharges['admitted_dt'])
discharges['discharged_dt'] = pd.to_datetime(discharges['discharged_dt'])

discharges['admitted_hour'] = discharges['admitted_dt'].dt.hour
discharges['discharged_hour'] = discharges['discharged_dt'].dt.hour

discharges['admitted_dayofweek'] = discharges['admitted_dt'].dt.dayofweek
discharges['discharged_dayofweek'] = discharges['discharged_dt'].dt.dayofweek

# one-hot encode the categorical variables

#cat_cols = ['gender', 'inmate_status_code']
gender_dummies = discharges['gender'].str.get_dummies()
inmate_status_dummies = discharges['inmate_status_code'].str.get_dummies()
race_dummies = discharges['race'].str.get_dummies()

# use pd.concat to join the new columns with your original dataframe
discharges = pd.concat([discharges,pd.get_dummies(discharges['gender'], prefix='gender')],axis=1)
discharges = pd.concat([discharges,pd.get_dummies(discharges['race'], prefix='race')],axis=1)
discharges = pd.concat([discharges,pd.get_dummies(discharges['inmate_status_code'], prefix='inmate_status')],axis=1)

# drop the non-encoded columns
discharges = discharges.drop(['race','gender','inmate_status_code'], axis=1)


### TO DO: process categorical variables this way eventually

'''
# preprocess categorical data
categorical_cols = ['inmate_status_code']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), drop='first')
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])
    
# drop certain columns from analysis - ['admitted_dt','discharged_dt', 'inmateid', 
'discharged_dayofweek','discharged_hour', 'gender_F ', 'gender_M ','inmate_status_CSP', 
'inmate_status_CSP', 'inmate_status_DNS', 'race_BLACK', 'race_UNKNOWN', 'race_ASIAN', 
'inmate_status_DEP', 'inmate_status_SSR']
'''

"\n# preprocess categorical data\ncategorical_cols = ['inmate_status_code']\ncategorical_transformer = Pipeline(steps=[\n    ('onehot', OneHotEncoder(handle_unknown='ignore'), drop='first')\n])\n\n# Bundle preprocessing for numerical and categorical data\npreprocessor = ColumnTransformer(\n    transformers=[\n        ('cat', categorical_transformer, categorical_cols)\n    ])\n    \n# drop certain columns from analysis - ['admitted_dt','discharged_dt', 'inmateid', \n'discharged_dayofweek','discharged_hour', 'gender_F ', 'gender_M ','inmate_status_CSP', \n'inmate_status_CSP', 'inmate_status_DNS', 'race_BLACK', 'race_UNKNOWN', 'race_ASIAN', \n'inmate_status_DEP', 'inmate_status_SSR']\n"

In [10]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y)

mi_scores

NameError: name 'X' is not defined

In [11]:
# NOW start modeling
'''Building a Pipeline - Not using ColumnTransformer b/c it does not help me'''

# one hot encode categorical variables
# do that in the pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# get the data through the API
import pandas as pd
import requests

# takes url of the dataset, returns a Pandas DataFrame of the dataset
def getNYCdata(url):
    key = 'g2ZZTkxY9ZQEVVddSnAwJvTdO' # TO DO: this should be secret
    response = requests.get(url, headers={'X-App-Token': key})
    data = response.text
    df = pd.read_json(data)
    return df
    
# Inmate Discharges
url = 'https://data.cityofnewyork.us/resource/94ri-3ium.json?$limit=50000&$offset=50000'
discharges = getNYCdata(url)

# create time held metric
discharges['time_held'] = pd.to_datetime(discharges['discharged_dt'])-pd.to_datetime(discharges['admitted_dt'])


discharges = discharges.set_index('inmateid')
# discharges = discharges.drop('inmateid', axis=1)

# add columns with time of day, time of week admitted
discharges['admitted_dt'] = pd.to_datetime(discharges['admitted_dt'])
discharges['admitted_hour'] = discharges['admitted_dt'].dt.hour
discharges['admitted_dayofweek'] = discharges['admitted_dt'].dt.dayofweek
discharges = discharges.reset_index()

# drop duplicates
discharges = discharges.drop_duplicates(subset=['inmateid'])

# drop columns that I don't need and the nulls
discharges = discharges.drop(['top_charge','race','gender', 
                              'admitted_dt','discharged_dt', 'inmateid'], axis=1)
discharges = discharges.dropna()

# one-hot encode the categorical variables

#cat_cols = ['gender', 'inmate_status_code']
inmate_status_dummies = discharges['inmate_status_code'].str.get_dummies()

# use pd.concat to join the new columns with your original dataframe
discharges = pd.concat([discharges,pd.get_dummies(discharges['inmate_status_code'], prefix='inmate_status')],axis=1)

# drop the inmate_status_codes that I'm not using
discharges = discharges.drop(['inmate_status_CSP', 'inmate_status_CSP', 'inmate_status_DNS',
                              'inmate_status_DEP', 'inmate_status_SSR', 'inmate_status_code'], axis=1)


In [12]:
# fit the models
from sklearn.model_selection import train_test_split

X = discharges.copy()
y = X.pop('time_held')

# clean up types
X['age'] = X['age'].astype('int64')
y = y.apply(lambda x: x.value)

rs = 302

# Normalization
#X=(X-np.min(X))/(np.max(X) - np.min(X))

from sklearn import preprocessing
 
scaler = preprocessing.StandardScaler().fit(X)   # X is an array with all our features
X = scaler.transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=rs)

# Random Forest, Decision Tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_valid, y_valid, random_state, model):
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

def printCVScore(scores):
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    

A mean error squared:
'time_held' is in nanoseconds, so MAE of 5721308068400939.0 means the estimate was about a week off

In [9]:
from sklearn.linear_model import LinearRegression

# Linear Regression
lr = LinearRegression()
lr.fit(X_train,y_train)

# RandomForestc
rf = RandomForestRegressor(n_estimators=100, random_state=rs)
rf.fit(X_train, y_train)

# SVM
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)

# Naive Bayes
#from sklearn.naive_bayes import GaussianNB
#gnb = GaussianNB
#gnb.fit(X_train, y_train)

TypeError: fit() missing 1 required positional argument: 'y'

In [14]:
# SVM
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=3)

In [None]:
scores = []
maes = []
for model in [lr, rf, svc, knn]:
    print("next one")
    mae = score_dataset(X_test, y_test,rs, model)
    maes.append(mae)
    cvscores = cross_val_score(model, X, y, cv=5)
    scores.append(cvscores)
    printCVScore(cvscores)

next one


So this means that the data got predicted correctly about 23 percent of the time? I'm still not sure

array([0.21910657, 0.23074301, 0.2204047 , 0.23948893, 0.27446025])

Thank you
https://stackoverflow.com/questions/52611498/need-help-understanding-cross-val-score-in-sklearn-python

In [None]:
def printCVScore(scores):
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
printCVScore(scores)