In [40]:
import re
import pandas as pd
import pickle
import json
import numpy as np
import scipy.linalg as la
import statsmodels.api as sm
from sklearn import linear_model, model_selection, metrics
import sklearn
import plotly.graph_objs as go
import matplotlib.pyplot as plt

In [2]:
import sys
# Raise numpy's print threshold to the largest possible value so that arrays
# are printed in full instead of being summarised with "...".
np.set_printoptions(threshold=sys.maxsize)

In [3]:
# Load the inmate records; the path is relative to the notebook's working
# directory.
df = pd.read_csv('Sentenced_Inmates_in_Correctional_Facilities.csv')

In [4]:
# Show the column names. Note that the last one ('SPECIAL PAROLE END DATE')
# is followed by trailing spaces in the raw file.
df.columns

Index(['DOWNLOAD DATE', 'IDENTIFIER', 'LATEST ADMISSION DATE', 'RACE',
       'GENDER', 'AGE', 'END SENTENCE DATE', 'OFFENSE', 'FACILITY', 'DETAINER',
       'SENTENCE DAYS',
       'SPECIAL PAROLE END DATE                                                        '],
      dtype='object')

In [41]:
# df

In [None]:
# Short aliases for the column names used in this notebook.
esd = 'END SENTENCE DATE'
nesd = 'N END SENTENCE DATE'
lad = 'LATEST ADMISSION DATE'
sd = 'SENTENCE DAYS'

df[lad] = pd.to_datetime(df[lad])

# Rows whose recorded end-sentence date is missing, captured before the column
# is overwritten below. (`series is np.nan` is an identity test on the Series
# object itself and is always False; `.isna()` is the element-wise check.)
mask = df[esd].isna()

# Sentence length in days, with extreme values replaced.
# NOTE(review): the threshold is expressed in years (> 300 years) but the
# replacement value is 300 *days*, so the longest sentences end up with an end
# date less than a year after admission. Kept as-is to preserve results; a
# literal 300-year cap would exceed pandas' nanosecond datetime range, so the
# intended cap needs to be confirmed.
to_add = df[sd].astype(float)
to_add = to_add.mask(to_add / 365 > 300, 300.)

# Recompute the end-sentence date as admission date + sentence length for all
# rows at once (missing sentence lengths yield NaT).
new_end = df[lad] + pd.to_timedelta(to_add, unit='D')
df[esd] = new_end

In [None]:
# One-hot encode RACE. 'WHITE' occurs both bare and padded with trailing
# spaces; neither column is kept, so WHITE is the omitted category.
race = pd.get_dummies(df['RACE']).drop(columns=['WHITE', 'WHITE   '])

# One-hot encode GENDER and omit the 'F' column.
gender = pd.get_dummies(df['GENDER']).drop(columns='F')

# One-hot encode OFFENSE.
#
# Because of bad encoding, the raw column mixes clean labels with the same
# labels right-padded with spaces, which would otherwise become separate dummy
# columns. Stripping trailing whitespace before encoding merges every such
# pair in one step, so no padded column (including the padded '53A021') can be
# left behind.
offence_labels = df['OFFENSE'].str.rstrip()

# First-degree robbery also appears with one fewer space before the 'BF'
# suffix; fold that spelling into the full-width one.
offence_labels = offence_labels.replace({
    'ROBBERY, FIRST DEGREE                BF': 'ROBBERY, FIRST DEGREE                 BF',
})

offence = pd.get_dummies(offence_labels)

# Drop one category ('53A021') to fix the linear dependence among the dummy
# columns.
offence = offence.drop(columns='53A021')

In [None]:
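# Scratch check (disabled): prints each `offence` column whose name is a
# prefix of the next column's name, i.e. a candidate padded duplicate.
# for i,item in enumerate(offence.columns):
# #     print(item)
#     if i == len(offence.columns)-1:
#         continue
#     if item == offence.columns[i+1][:len(item)]:
#         print(item)
# #     print(item == offence.columns[i+1][:len(item)])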