In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (accuracy_score, 
                            confusion_matrix, 
                            classification_report,
                            f1_score,
                            plot_confusion_matrix,
                            precision_recall_curve,
                            precision_score,
                            recall_score)

In [2]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/Mass-Protests/users'

In [3]:
mass = pd.read_csv('../data/mass_mobile.csv')

In [4]:
mass.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
mass.head(2)

Unnamed: 0,id,country,ccode,region,protestnumber,protesterviolence,location,protesteridentity,sources,notes,...,social_restrictions,start_date,end_date,target,notes_clean,neg,neu,pos,compound,protest_duration
0,201990001,Canada,20,North America,1,0.0,national,unspecified,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...,...,0,1990-01-15,1990-01-15,"[0, 0, 0, 0, 1, 0, 0]",canada s railway passenger system was finally ...,0.087,0.913,0.0,-0.8176,1
1,201990002,Canada,20,North America,2,0.0,"Montreal, Quebec",unspecified,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...,...,0,1990-06-25,1990-06-25,"[0, 0, 0, 0, 1, 0, 0]",protestors were only identified as young peopl...,0.0,1.0,0.0,0.0,1


In [6]:
# sorted(list(mass.columns))

In [7]:
# mass.info(verbose=True)

In [8]:
def evaluate_model(y_true, y_preds):
    
    '''
    DOCSTRING HERE
    '''
    
    f1 = f1_score(y_true, y_preds)
    
    precision = precision_score(y_true, y_preds)
    
    recall = recall_score(y_true, y_preds)

    # add accuracy

    return f1, precision, recall

In [9]:
# def build_confusion_matrix(y_true, y_preds, title):
    
#     tn, fp, fn, tp = confusion_matrix(y_true, y_preds).ravel()
#     disp = plot_confusion_matrix(model, 
#                                  X_test_sc, 
#                                  y_test, 
# #                                  display_labels=
#                                  cmap="Blues")
    
#     disp.ax_.set_title(title)
#     print(title)
#     print(disp.confusion_matrix)
    
#     plt.show
    
#     return

In [10]:
def response_prediction_columns(model, df, features):
    
    ss = StandardScaler()
    
    X = features
    X_sc = ss.fit_transform(X)
    
    predicted_column = model.predict(X_sc)
    probability_column = model.predict_proba(X_sc)[:,1]
    
    return predicted_column, probability_column

In [11]:
def state_response_predictor(df, features, target, model):
    
    X = features 
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    model.fit(X_train_sc, y_train)
    
    y_preds_train = model.predict(X_train_sc)
    y_preds_test = model.predict(X_test_sc)
    
    f1_score_train, precision_train, recall_train = evaluate_model(y_train, y_preds_train)
    f1_score_test, precision_test, recall_test = evaluate_model(y_test, y_preds_test)
    
#     print(f'{target}- F1_score for {model} model, train set: {f1_score_train}')
#     print(f'{target}- F1_score for {model} model, test set: {f1_score_test}')
#     print(f'{target}- Precision for {model} model, train set: {precision_train}')
#     print(f'{target}- Precision for {model} model, test set: {precision_test}')
#     print(f'{target}- Recall for {model} model, train set: {recall_train}')
#     print(f'{target}- Recall for {model} model, teset set: {recall_test}')
    
    predicted_column, probability_column = response_prediction_columns(model, df, features)
    
#     print(model)

#     build_confusion_matrix(y_test, y_preds_test)

    return predicted_column, probability_column, f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test  

Creating two new columns that consolidate the three violent state responses -- `beatings, shootings, killings`.
+ `violent_response`: indicates whether or not a violdnt state response was used. (0 for no, 1 for yes)
+ `violent_count`: indicates number (0,1,2,3) of violent ressponses that were recorded. (I anticipate this will be used just to judge collinearity/dependence.)

_NB: Not yet dropping the original three columns._

In [12]:
mass['violent_response'] = 0 
# mass.loc[ (mass['beatings'] == 1 | mass['shootings'] == 1 | mass['killings'] == 1), 'violent_response' ] = 1
mass.loc[mass['beatings'] == 1, 'violent_response'] = 1
mass.loc[mass['shootings'] == 1, 'violent_response'] = 1
mass.loc[mass['killings'] == 1, 'violent_response'] = 1

In [13]:
mass['violent_count'] = mass['beatings'] + mass['shootings'] + mass['killings']

In [14]:
# checking whether the new column looks good 
mass[['beatings', 'shootings','killings', 'violent_response', 'violent_count']]

Unnamed: 0,beatings,shootings,killings,violent_response,violent_count
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
...,...,...,...,...,...
15174,0,1,0,1,1
15175,0,1,1,1,2
15176,0,0,0,0,0
15177,0,0,0,0,0


In [15]:
mass[ mass['violent_count'] > 1][['beatings', 'shootings','killings', 'violent_response']].count()

beatings            604
shootings           604
killings            604
violent_response    604
dtype: int64

_Saving and pickling `mass` DataFrame that now has `violent_response` and `violent_count` columns. 

In [None]:
targets = ['arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings', 'violent_response']  

grids = []

for target in targets:    
    
    features = mass.drop(columns=['country', 'ccode', 'region', 'location','protesteridentity', 'sources',
           'notes', 'protester_id_type', 'protest_size_category', 'start_date', 'notes_clean', 'neg', 'neu', 'pos',
           'end_date', 'target', 'arrests', 'accomodation', 'beatings',
           'crowddispersal', 'ignore', 'killings', 'shootings', 'violent_response'])
    
    X = features
    y = mass[target]
    
    logreg = LogisticRegression()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)
    
    ss = StandardScaler()
    
    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    params = {
        'penalty' : ['l1','l2'], 
        'C' : [.01, 1, 10],
        'solver' : ['liblinear', 'saga', 'lfgbs'],
        'max_iter' : [100, 1000]
    }
    
    gs = GridSearchCV(logreg, param_grid=params, cv=10, verbose=2)
    
    #print(y)
    grids.append(gs)

In [None]:
results = {
    'best_cv_scores' : [],
    'best_params' : [],
    'train_f1_scores' : [],
    'test_f1_scores' : [],
    'train_precision_scores' : [], 
    'test_precision_scores' : [],
    'train_recall_scores' : [],
    'test_recall_scores' : []
}

for grid in grids: 
    
    # fit grids
    grid.fit(X_train_sc, y_train)
    
    # append important grid results
    results['best_params'].append(grid.best_params_)
    results['best_cv_scores'].append(grid.best_score_)
    
    # make predictions 
    train_preds = grid.predict(X_train_sc)
    test_preds = grid.predict(X_test_sc)
    
    # various classification metrics
    # f1 scores
    results['train_f1_scores'].append(f1_score(y_train, train_preds))
    results['test_f1_scores'].append(f1_score(y_test, test_preds))
    
    # precision scores 
    results['train_precision_scores'].append(precision_score(y_train, train_preds))
    results['test_precision_scores'].append(precision_score(y_test, test_preds))
    
    # recall scores
    results['train_recall_scores'].append(recall_score(y_train, train_preds))
    results['test_recall_scores'].append(recall_score(y_test, test_preds))
    
results_df = pd.DataFrame(results)
results_df

In [191]:
# mass.to_csv('../data/03_mass_violent_response.csv', index=False)

In [193]:
# mass.to_pickle('../data/03_mass_violent_response.pickle')

In [195]:
features = mass.drop(columns=['country', 'ccode', 'region', 'location','protesteridentity', 'sources',
           'notes', 'protester_id_type', 'protest_size_category', 'start_date', 'notes_clean', 'neg', 'neu', 'pos',
           'end_date', 'target', 'arrests', 'accomodation', 'beatings', 'crowddispersal', 'ignore', 'killings', 'shootings', 
            'violent_response', 'violent_count'])

LogisticRegression_1 = LogisticRegression(solver='lbfgs', max_iter=1000)

In [196]:
# (mass.drop(columns=['country', 'ccode', 'region', 'location','protesteridentity', 'sources',
#            'notes', 'protester_id_type', 'protest_size_category', 'start_date', 'notes_clean', 'neg', 'neu', 'pos',
#            'end_date', 'target', 'arrests', 'accomodation', 'beatings',
#            'crowddispersal', 'ignore', 'killings', 'shootings'])).columns

**Adding `predicted` and `probability` columns for each each response to the `mass` DF using `state_response_predictor` function.** 

Responses: `'arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings'`

In [197]:
mass['arrests_predicted'], mass['arrests_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'arrests', LogisticRegression_1)

In [198]:
mass['accomodation_predicted'], mass['accomodation_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'accomodation', LogisticRegression_1)

In [199]:
# looking at where accomodation is '1' and it was also correctly predicted 

mass[ (mass['accomodation'] == mass['accomodation_predicted']) & mass['accomodation']==1 ][['accomodation', 'accomodation_predicted', 'accomodation_probability']]

Unnamed: 0,accomodation,accomodation_predicted,accomodation_probability
451,1,1,0.921425
1438,1,1,0.609061
1469,1,1,0.550764
1679,1,1,0.596815
2061,1,1,0.540708
...,...,...,...
14736,1,1,0.609921
14737,1,1,0.840545
15133,1,1,0.524657
15149,1,1,0.531713


In [200]:
mass['beatings_predicted'], mass['beatings_probability'], f1_score_train, \
f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'beatings', LogisticRegression_1)

In [201]:
mass['crowddispersal_predicted'], mass['crowddispersal_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'crowddispersal', LogisticRegression_1)

In [202]:
mass['ignore_predicted'], mass['ignore_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'ignore', LogisticRegression_1)

In [203]:
mass['killings_predicted'], mass['killings_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'killings', LogisticRegression_1)

In [204]:
mass['shootings_predicted'], mass['shootings_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'shootings', LogisticRegression_1)

In [205]:
predictions_df = mass[['id','arrests', 'arrests_predicted', 'arrests_probability', 'accomodation', 'accomodation_predicted', 'accomodation_probability', 
      'beatings', 'beatings_predicted', 'beatings_probability', 'crowddispersal', 'crowddispersal_predicted', 'crowddispersal_probability', 
      'ignore', 'ignore_predicted', 'ignore_probability', 'killings', 'killings_predicted', 'killings_probability',
      'killings', 'killings_predicted', 'killings_probability']].copy()

In [206]:
predictions_df.head()

Unnamed: 0,id,arrests,arrests_predicted,arrests_probability,accomodation,accomodation_predicted,accomodation_probability,beatings,beatings_predicted,beatings_probability,...,crowddispersal_probability,ignore,ignore_predicted,ignore_probability,killings,killings_predicted,killings_probability,killings.1,killings_predicted.1,killings_probability.1
0,201990001,0,0,0.148069,0,0,0.12599,0,0,0.000192,...,0.084188,1,1,0.706858,0,0,0.004661,0,0,0.004661
1,201990002,0,0,0.132347,0,0,0.083107,0,0,0.000267,...,0.064624,1,1,0.749024,0,0,0.002015,0,0,0.002015
2,201990003,0,0,0.082134,0,0,0.041444,0,0,0.000125,...,0.042882,1,1,0.807295,0,0,0.000728,0,0,0.000728
3,201990004,0,1,0.585644,1,0,0.295309,0,0,0.000685,...,0.526179,0,0,0.06622,0,0,0.123478,0,0,0.123478
4,201990005,1,0,0.37017,1,0,0.090058,0,0,0.000518,...,0.376911,0,0,0.166469,0,0,0.020468,0,0,0.020468


In [207]:
# predictions_df.to_csv('../data/04_predictions_only.csv', index=False)

Adding `predicted` and `probability` columns for a violent response of any type (beatings/shootings/killigs). 

In [208]:
mass['violence_predicted'], mass['violence_probability'], \
f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'violent_response', LogisticRegression_1)

In [215]:
model_performance_dict = {
    'target':[],
    'majority_class':[],
    'majority_percent':[],
#     'model':[],
    'f1_score_train':[],
    'f1_score_test':[],
    'precision_train':[],
    'precision_test':[],
    'recall_train':[],
    'recall_test':[] 
}

 
targets = ['arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings', 'violent_response']    
    
for target in targets:
    
    predicted_column, probability_column, f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = state_response_predictor(
                                                                                                                                mass, 
                                                                                                                                features, 
                                                                                                                                target, 
                                                                                                                                LogisticRegression_1)
    
    model_performance_dict['target'].append(target)
    model_performance_dict['majority_class'].append(mass[target].value_counts(normalize=True).idxmax())
    model_performance_dict['majority_percent'].append(max(mass[target].value_counts(normalize=True)))
#     model_performance_dict['model'].append(model)
    model_performance_dict['f1_score_train'].append(f1_score_train)
    model_performance_dict['f1_score_test'].append(f1_score_test)
    model_performance_dict['precision_train'].append(precision_train)
    model_performance_dict['precision_test'].append(precision_test)
    model_performance_dict['recall_train'].append(recall_train)
    model_performance_dict['recall_test'].append(recall_test)
    
print(model_performance_dict)
    
model_performance_df = pd.DataFrame(model_performance_dict)
model_performance_df.round(4)

{'target': ['arrests', 'accomodation', 'beatings', 'crowddispersal', 'ignore', 'killings', 'shootings', 'violent_response'], 'majority_class': [0, 0, 0, 0, 1, 0, 0, 0], 'majority_percent': [0.8588181039594176, 0.8997957704723631, 0.9472956057711311, 0.6865406153238026, 0.5434481849924238, 0.9457144739442651, 0.9386652612161539, 0.8753541076487252], 'f1_score_train': [0.24430264357338197, 0.09939759036144577, 0.17173051519154559, 0.6446232626188734, 0.7954094644375178, 0.24191616766467064, 0.24093816631130063, 0.39802336028751123], 'f1_score_test': [0.2722117202268431, 0.06984126984126984, 0.15568862275449102, 0.624173180998196, 0.7837535014005603, 0.1958762886597938, 0.2, 0.35976789168278533], 'precision_train': [0.5690021231422505, 0.6470588235294118, 0.65, 0.7299536116633533, 0.745848279527036, 0.6121212121212121, 0.5947368421052631, 0.6429608127721336], 'precision_test': [0.6605504587155964, 0.55, 0.5416666666666666, 0.7188365650969529, 0.7324607329842932, 0.475, 0.48936170212765956

Unnamed: 0,target,majority_class,majority_percent,f1_score_train,f1_score_test,precision_train,precision_test,recall_train,recall_test
0,arrests,0,0.8588,0.2443,0.2722,0.569,0.6606,0.1555,0.1714
1,accomodation,0,0.8998,0.0994,0.0698,0.6471,0.55,0.0538,0.0373
2,beatings,0,0.9473,0.1717,0.1557,0.65,0.5417,0.0989,0.0909
3,crowddispersal,0,0.6865,0.6446,0.6242,0.73,0.7188,0.5772,0.5515
4,ignore,1,0.5434,0.7954,0.7838,0.7458,0.7325,0.852,0.8428
5,killings,0,0.9457,0.2419,0.1959,0.6121,0.475,0.1507,0.1234
6,shootings,0,0.9387,0.2409,0.2,0.5947,0.4894,0.1511,0.1257
7,violent_response,0,0.8754,0.398,0.3598,0.643,0.5741,0.2882,0.262


Unnamed: 0,target,majority_class,majority_percent,f1_score_train,f1_score_test,precision_train,precision_test,recall_train,recall_test
0,arrests,0,0.8588,0.2443,0.2722,0.569,0.6606,0.1555,0.1714
1,accomodation,0,0.8998,0.0994,0.0698,0.6471,0.55,0.0538,0.0373
2,beatings,0,0.9473,0.1717,0.1557,0.65,0.5417,0.0989,0.0909
3,crowddispersal,0,0.6865,0.6446,0.6242,0.73,0.7188,0.5772,0.5515
4,ignore,1,0.5434,0.7954,0.7838,0.7458,0.7325,0.852,0.8428
5,killings,0,0.9457,0.2419,0.1959,0.6121,0.475,0.1507,0.1234
6,shootings,0,0.9387,0.2409,0.2,0.5947,0.4894,0.1511,0.1257
7,violent_response,0,0.8754,0.398,0.3598,0.643,0.5741,0.2882,0.262


In [211]:
mass['arrests'].value_counts(normalize=True)

0    0.858818
1    0.141182
Name: arrests, dtype: float64

In [212]:
mass['arrests'].value_counts(normalize=True).idxmax()

0

In [213]:
max(mass['arrests'].value_counts(normalize=True))

0.8588181039594176

In [214]:
# mass.info(verbose=True)

_Saving `mass` DataFrame again now that it has the viole