# In [20]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

def propensity_scores_florida(fpath):
    """
    Estimate per-stop propensity scores for being searched, cited and arrested.

    Reads the traffic-stop CSV at ``fpath``, keeps the columns listed in
    ``ps_cols``, drops every row that has a missing value in any of them, and
    fits three logistic regressions on the remaining rows. Each model is then
    scored on the same rows it was trained on (in-sample probabilities).

    Features used by the models:
        * search model: subject_age, subject_sex (female=0, male=1) and
          one-hot encoded county_name / violation.
        * citation and arrest models: the same features plus
          search_conducted.

    Parameters:
        fpath: path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns:
        A list with one tuple per retained stop, in file order:
            (p(searched), p(cited), p(arrested))

    Raises:
        ValueError: if ``subject_sex`` contains a value other than
            'female' or 'male' after rows with missing values are dropped.
    """
    ps_cols = ['county_name', 'subject_age', 'subject_sex', 'violation',
               'arrest_made', 'citation_issued', 'search_conducted']
    outcome_cols = ['arrest_made', 'citation_issued', 'search_conducted']

    fl = pd.read_csv(fpath)[ps_cols]
    fl_valid_stops = fl.dropna(subset=ps_cols).reset_index(drop=True)

    # Encode sex as 0/1. An unexpected category is reported right away rather
    # than being replaced by a string placeholder, which would only surface
    # later as an opaque conversion error inside the model fit.
    sex_codes = fl_valid_stops['subject_sex'].map({'female': 0, 'male': 1})
    unmapped = sex_codes.isna()
    if unmapped.any():
        bad_values = sorted(
            set(fl_valid_stops.loc[unmapped, 'subject_sex'].astype(str))
        )
        raise ValueError(
            "Unrecognized subject_sex value(s) {}; expected 'female' or "
            "'male'".format(bad_values)
        )
    fl_valid_stops['subject_sex'] = sex_codes.astype(int)

    # Outcome flags become 1 for truthy values and 0 otherwise.
    for col in outcome_cols:
        fl_valid_stops[col] = fl_valid_stops[col].apply(
            lambda flag: 1 if flag else 0
        )

    # One-hot encode the remaining categorical columns.
    fl_ohe = pd.get_dummies(
        fl_valid_stops,
        prefix=['county', 'violation'],
        columns=['county_name', 'violation'],
    )

    # Logistic regression to compute propensity scores. The estimator is
    # refitted for each outcome.
    lr = LogisticRegression(n_jobs=1, solver='liblinear')

    # Probability searched: none of the three outcomes is used as a feature.
    search_features = fl_ohe.drop(outcome_cols, axis=1)
    lr.fit(search_features, fl_ohe['search_conducted'])
    # Column 1 of predict_proba is the probability of the positive class.
    probs_searched = lr.predict_proba(search_features)[:, 1]

    # The citation and arrest models share one feature matrix, which keeps
    # search_conducted as a predictor.
    # NOTE(review): presumably intentional (a search precedes the citation /
    # arrest decision) -- confirm.
    post_search_features = fl_ohe.drop(['arrest_made', 'citation_issued'], axis=1)

    # Probability cited
    lr.fit(post_search_features, fl_ohe['citation_issued'])
    probs_cited = lr.predict_proba(post_search_features)[:, 1]

    # Probability arrested
    lr.fit(post_search_features, fl_ohe['arrest_made'])
    probs_arrested = lr.predict_proba(post_search_features)[:, 1]

    propensity_scores = list(zip(probs_searched, probs_cited, probs_arrested))
    return propensity_scores


def propensity_scores_sc(fpath):
    """
    Estimate per-stop propensity scores for being searched, cited and arrested.

    Reads the traffic-stop CSV at ``fpath`` and keeps the columns listed in
    ``ps_cols``. Missing ``violation`` values are replaced by the string
    'None' so those stops are kept as their own category; rows with a missing
    value in any other listed column are dropped. Three logistic regressions
    are fitted on the remaining rows and scored on those same rows (in-sample
    probabilities).

    Features used by the models:
        * search model: subject_age, subject_sex (female=0, male=1) and
          one-hot encoded county_name / violation.
        * citation and arrest models: the same features plus
          search_conducted.

    Parameters:
        fpath: path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns:
        A list with one tuple per retained stop, in file order:
            (p(searched), p(cited), p(arrested))

    Raises:
        ValueError: if ``subject_sex`` contains a value other than
            'female' or 'male' after rows with missing values are dropped.
    """
    ps_cols = ['county_name', 'subject_age', 'subject_sex', 'violation',
               'arrest_made', 'citation_issued', 'search_conducted']
    outcome_cols = ['arrest_made', 'citation_issued', 'search_conducted']

    sc = pd.read_csv(fpath)[ps_cols]
    # Fill violation as 'None' where it is missing (per the original author:
    # stops where no search, citation or arrest was made) so that these rows
    # survive the dropna below.
    sc['violation'] = sc['violation'].fillna('None')
    sc_valid_stops = sc.dropna(subset=ps_cols).reset_index(drop=True)

    # Encode sex as 0/1. An unexpected category is reported right away rather
    # than being replaced by a string placeholder, which would only surface
    # later as an opaque conversion error inside the model fit.
    sex_codes = sc_valid_stops['subject_sex'].map({'female': 0, 'male': 1})
    unmapped = sex_codes.isna()
    if unmapped.any():
        bad_values = sorted(
            set(sc_valid_stops.loc[unmapped, 'subject_sex'].astype(str))
        )
        raise ValueError(
            "Unrecognized subject_sex value(s) {}; expected 'female' or "
            "'male'".format(bad_values)
        )
    sc_valid_stops['subject_sex'] = sex_codes.astype(int)

    # Outcome flags become 1 for truthy values and 0 otherwise.
    for col in outcome_cols:
        sc_valid_stops[col] = sc_valid_stops[col].apply(
            lambda flag: 1 if flag else 0
        )

    # One-hot encode the remaining categorical columns.
    sc_ohe = pd.get_dummies(
        sc_valid_stops,
        prefix=['county', 'violation'],
        columns=['county_name', 'violation'],
    )

    # Logistic regression to compute propensity scores. The estimator is
    # refitted for each outcome.
    lr = LogisticRegression(n_jobs=1, solver='liblinear')

    # Probability searched: none of the three outcomes is used as a feature.
    search_features = sc_ohe.drop(outcome_cols, axis=1)
    lr.fit(search_features, sc_ohe['search_conducted'])
    # Column 1 of predict_proba is the probability of the positive class.
    probs_searched = lr.predict_proba(search_features)[:, 1]

    # The citation and arrest models share one feature matrix, which keeps
    # search_conducted as a predictor.
    # NOTE(review): presumably intentional (a search precedes the citation /
    # arrest decision) -- confirm.
    post_search_features = sc_ohe.drop(['arrest_made', 'citation_issued'], axis=1)

    # Probability cited
    lr.fit(post_search_features, sc_ohe['citation_issued'])
    probs_cited = lr.predict_proba(post_search_features)[:, 1]

    # Probability arrested
    lr.fit(post_search_features, sc_ohe['arrest_made'])
    probs_arrested = lr.predict_proba(post_search_features)[:, 1]

    propensity_scores = list(zip(probs_searched, probs_cited, probs_arrested))
    return propensity_scores