In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (accuracy_score, 
                            confusion_matrix, 
                            classification_report,
                            f1_score,
                            plot_confusion_matrix,
                            precision_recall_curve,
                            precision_score,
                            recall_score)

In [187]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/Mass-Protests/users'

In [190]:
mass = pd.read_csv('../data/mass_mobile.csv')

In [191]:
mass.drop(columns=['Unnamed: 0'], inplace=True)

In [192]:
mass.head(2)

Unnamed: 0,id,country,ccode,region,protestnumber,protesterviolence,location,protesteridentity,sources,notes,...,social_restrictions,start_date,end_date,target,notes_clean,neg,neu,pos,compound,protest_duration
0,201990001,Canada,20,North America,1,0.0,national,unspecified,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...,...,0,1990-01-15,1990-01-15,"[0, 0, 0, 0, 1, 0, 0]",canada s railway passenger system was finally ...,0.087,0.913,0.0,-0.8176,1
1,201990002,Canada,20,North America,2,0.0,"Montreal, Quebec",unspecified,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...,...,0,1990-06-25,1990-06-25,"[0, 0, 0, 0, 1, 0, 0]",protestors were only identified as young peopl...,0.0,1.0,0.0,0.0,1


In [193]:
sorted(list(mass.columns))

['accomodation',
 'arrests',
 'beatings',
 'ccode',
 'compound',
 'country',
 'country_Afghanistan',
 'country_Albania',
 'country_Algeria',
 'country_Angola',
 'country_Argentina',
 'country_Armenia',
 'country_Austria',
 'country_Azerbaijan',
 'country_Bahrain',
 'country_Bangladesh',
 'country_Belarus',
 'country_Belgium',
 'country_Benin',
 'country_Bolivia',
 'country_Bosnia',
 'country_Botswana',
 'country_Brazil',
 'country_Bulgaria',
 'country_Burkina Faso',
 'country_Burundi',
 'country_Cambodia',
 'country_Cameroon',
 'country_Canada',
 'country_Cape Verde',
 'country_Central African Republic',
 'country_Chad',
 'country_Chile',
 'country_China',
 'country_Colombia',
 'country_Comoros',
 'country_Congo Brazzaville',
 'country_Congo Kinshasa',
 'country_Costa Rica',
 'country_Croatia',
 'country_Cuba',
 'country_Cyprus',
 'country_Czech Republic',
 'country_Czechoslovakia',
 'country_Denmark',
 'country_Djibouti',
 'country_Dominican Republic',
 'country_Ecuador',
 'country_Eg

In [178]:
# dummifying protest_size and protester_id_type columns 
# mass = pd.get_dummies(mass, columns=['protest_size_category', 'protester_id_type'], dummy_na=True)

In [194]:
mass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15179 entries, 0 to 15178
Columns: 234 entries, id to protest_duration
dtypes: float64(10), int64(212), object(12)
memory usage: 27.1+ MB


I will create three functions: 

In [195]:
def evaluate_model(y_true, y_preds):
    
#     ```
#     DOCSTRING HERE 
        
    f1 = f1_score(y_true, y_preds)
    
    precision = precision_score(y_true, y_preds)
    
    recall = recall_score(y_true, y_preds)

    # add accuracy

    return f1, precision, recall

In [181]:
# def build_confusion_matrix(y_true, y_preds, title):
    
#     tn, fp, fn, tp = confusion_matrix(y_true, y_preds).ravel()
#     disp = plot_confusion_matrix(model, 
#                                  X_test_sc, 
#                                  y_test, 
# #                                  display_labels=
#                                  cmap="Blues")
    
#     disp.ax_.set_title(title)
#     print(title)
#     print(disp.confusion_matrix)
    
#     plt.show
    
#     return

In [196]:
def response_prediction_columns(model, df, features):
    
    ss = StandardScaler()
    
    X = features
    X_sc = ss.fit_transform(X)
    
    predicted_column = model.predict(X_sc)
    probability_column = model.predict_proba(X_sc)[:,1]
    
    return predicted_column, probability_column

In [197]:
def state_response_predictor(df, features, target, model):
    
    X = features 
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    model.fit(X_train_sc, y_train)
    
    y_preds_train = model.predict(X_train_sc)
    y_preds_test = model.predict(X_test_sc)
    
    f1_score_train, precision_train, recall_train = evaluate_model(y_train, y_preds_train)
    f1_score_test, precision_test, recall_test = evaluate_model(y_test, y_preds_test)
    
    print(f'{target}- F1_score for {model} model, train set: {f1_score_train}')
    print(f'{target}- F1_score for {model} model, test set: {f1_score_test}')
    print(f'{target}- Precision for {model} model, train set: {precision_train}')
    print(f'{target}- Precision for {model} model, test set: {precision_test}')
    print(f'{target}- Recall for {model} model, train set: {recall_train}')
    print(f'{target}- Recall for {model} model, teset set: {recall_test}')
    
    predicted_column, probability_column = response_prediction_columns(model, df, features)
    
#     print(model)

#     build_confusion_matrix(y_test, y_preds_test)

    return predicted_column, probability_column, f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test  

In [198]:
# instantiating variables I will pass into my function

features = mass.drop(columns=['country', 'ccode', 'region', 'location','protesteridentity', 'sources',
           'notes', 'protester_id_type', 'protest_size_category', 'start_date', 'notes_clean', 'neg', 'neu', 'pos',
           'end_date', 'target', 'arrests', 'accomodation', 'beatings',
           'crowddispersal', 'ignore', 'killings', 'shootings'])

LogisticRegression_1 = LogisticRegression(solver='lbfgs', max_iter=1000)

# lr_arrests_1 = state_response_predictor(mass, features_1, 'arrests', lr_1)

In [199]:
mass['arrests_predicted'], mass['arrests_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'arrests', LogisticRegression_1)

arrests- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.24430264357338197
arrests- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.2722117202268431
arrests- Precision for LogisticRegression(max_iter=1000) model, train set: 0.5690021231422505
arrests- Precision for LogisticRegression(max_iter=1000) model, test set: 0.6605504587155964
arrests- Recall for LogisticRegression(max_iter=1000) model, train set: 0.1555426581543819
arrests- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.17142857142857143


*Now that we've tested it on `arrests`, below I apply the the `state_response_predictor` function to the other six state response. This will create a column in the original `mass` DataFrame for the binary prediction and the probability for each response.*

Responses: `'arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings'`

In [201]:
mass['accomodation_predicted'], mass['accomodation_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'accomodation', LogisticRegression_1)

accomodation- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.09939759036144577
accomodation- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.06984126984126984
accomodation- Precision for LogisticRegression(max_iter=1000) model, train set: 0.6470588235294118
accomodation- Precision for LogisticRegression(max_iter=1000) model, test set: 0.55
accomodation- Recall for LogisticRegression(max_iter=1000) model, train set: 0.053833605220228384
accomodation- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.03728813559322034


In [202]:
# looking at where accomodation is '1' and it was also correctly predicted 

mass[ (mass['accomodation'] == mass['accomodation_predicted']) & mass['accomodation']==1 ][['accomodation', 'accomodation_predicted', 'accomodation_probability']]

Unnamed: 0,accomodation,accomodation_predicted,accomodation_probability
451,1,1,0.921425
1438,1,1,0.609061
1469,1,1,0.550764
1679,1,1,0.596815
2061,1,1,0.540708
...,...,...,...
14736,1,1,0.609921
14737,1,1,0.840545
15133,1,1,0.524657
15149,1,1,0.531713


In [203]:
mass['beatings_predicted'], mass['beatings_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'beatings', LogisticRegression_1)

beatings- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.17173051519154559
beatings- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.15568862275449102
beatings- Precision for LogisticRegression(max_iter=1000) model, train set: 0.65
beatings- Precision for LogisticRegression(max_iter=1000) model, test set: 0.5416666666666666
beatings- Recall for LogisticRegression(max_iter=1000) model, train set: 0.0989345509893455
beatings- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.09090909090909091


In [204]:
mass['crowddispersal_predicted'], mass['crowddispersal_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'crowddispersal', LogisticRegression_1)

crowddispersal- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.6446232626188734
crowddispersal- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.624173180998196
crowddispersal- Precision for LogisticRegression(max_iter=1000) model, train set: 0.7299536116633533
crowddispersal- Precision for LogisticRegression(max_iter=1000) model, test set: 0.7188365650969529
crowddispersal- Recall for LogisticRegression(max_iter=1000) model, train set: 0.5771548336389835
crowddispersal- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.5515409139213603


In [205]:
mass['ignore_predicted'], mass['ignore_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'ignore', LogisticRegression_1)

ignore- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.7954094644375178
ignore- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.7837535014005603
ignore- Precision for LogisticRegression(max_iter=1000) model, train set: 0.745848279527036
ignore- Precision for LogisticRegression(max_iter=1000) model, test set: 0.7324607329842932
ignore- Recall for LogisticRegression(max_iter=1000) model, train set: 0.8520261041129155
ignore- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.8427710843373494


In [206]:
mass['killings_predicted'], mass['killings_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'killings', LogisticRegression_1)

killings- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.24191616766467064
killings- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.1958762886597938
killings- Precision for LogisticRegression(max_iter=1000) model, train set: 0.6121212121212121
killings- Precision for LogisticRegression(max_iter=1000) model, test set: 0.475
killings- Recall for LogisticRegression(max_iter=1000) model, train set: 0.15074626865671642
killings- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.12337662337662338


In [207]:
mass['shootings_predicted'], mass['shootings_probability'], f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = \
state_response_predictor(mass, features, 'shootings', LogisticRegression_1)

shootings- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.24093816631130063
shootings- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.2
shootings- Precision for LogisticRegression(max_iter=1000) model, train set: 0.5947368421052631
shootings- Precision for LogisticRegression(max_iter=1000) model, test set: 0.48936170212765956
shootings- Recall for LogisticRegression(max_iter=1000) model, train set: 0.15106951871657753
shootings- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.12568306010928962


In [209]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/Mass-Protests/users'

Building dataframe that just includes prediction! 

In [210]:
predictions_df = mass[['id','arrests', 'arrests_predicted', 'arrests_probability', 'accomodation', 'accomodation_predicted', 'accomodation_probability', 
      'beatings', 'beatings_predicted', 'beatings_probability', 'crowddispersal', 'crowddispersal_predicted', 'crowddispersal_probability', 
      'ignore', 'ignore_predicted', 'ignore_probability', 'killings', 'killings_predicted', 'killings_probability',
      'killings', 'killings_predicted', 'killings_probability']].copy()

In [211]:
predictions_df.head()

Unnamed: 0,id,arrests,arrests_predicted,arrests_probability,accomodation,accomodation_predicted,accomodation_probability,beatings,beatings_predicted,beatings_probability,...,crowddispersal_probability,ignore,ignore_predicted,ignore_probability,killings,killings_predicted,killings_probability,killings.1,killings_predicted.1,killings_probability.1
0,201990001,0,0,0.148069,0,0,0.12599,0,0,0.000192,...,0.084188,1,1,0.706858,0,0,0.004661,0,0,0.004661
1,201990002,0,0,0.132347,0,0,0.083107,0,0,0.000267,...,0.064624,1,1,0.749024,0,0,0.002015,0,0,0.002015
2,201990003,0,0,0.082134,0,0,0.041444,0,0,0.000125,...,0.042882,1,1,0.807295,0,0,0.000728,0,0,0.000728
3,201990004,0,1,0.585644,1,0,0.295309,0,0,0.000685,...,0.526179,0,0,0.06622,0,0,0.123478,0,0,0.123478
4,201990005,1,0,0.37017,1,0,0.090058,0,0,0.000518,...,0.376911,0,0,0.166469,0,0,0.020468,0,0,0.020468


In [128]:
predictions_df.to_csv('../data/04_predictions_only.csv', index=False)

In [213]:
model_performance_dict = {
    'target':[],
#     'model':[],
    'f1_score_train':[],
    'f1_score_test':[],
    'precision_train':[],
    'precision_test':[],
    'recall_train':[],
    'recall_test':[] 
}
 
targets = ['arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings']    
    
for target in targets:
    
    predicted_column, probability_column, f1_score_train, f1_score_test, precision_train, precision_test, recall_train, recall_test = state_response_predictor(
                                                                                                                                mass, 
                                                                                                                                features, 
                                                                                                                                target, 
                                                                                                                                LogisticRegression_1)
    
    model_performance_dict['target'].append(target)
#     model_performance_dict['class_balance'].append(mass[classifier].value_counts(normalize=True))
#     model_performance_dict['model'].append(model)
    model_performance_dict['f1_score_train'].append(f1_score_train)
    model_performance_dict['f1_score_test'].append(f1_score_test)
    model_performance_dict['precision_train'].append(precision_train)
    model_performance_dict['precision_test'].append(precision_test)
    model_performance_dict['recall_train'].append(recall_train)
    model_performance_dict['recall_test'].append(recall_test)
    
print(model_performance_dict)
    
# model_performance_df = pd.DataFrame(model_performance_dict)

arrests- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.24430264357338197
arrests- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.2722117202268431
arrests- Precision for LogisticRegression(max_iter=1000) model, train set: 0.5690021231422505
arrests- Precision for LogisticRegression(max_iter=1000) model, test set: 0.6605504587155964
arrests- Recall for LogisticRegression(max_iter=1000) model, train set: 0.1555426581543819
arrests- Recall for LogisticRegression(max_iter=1000) model, teset set: 0.17142857142857143
accomodation- F1_score for LogisticRegression(max_iter=1000) model, train set: 0.09939759036144577
accomodation- F1_score for LogisticRegression(max_iter=1000) model, test set: 0.06984126984126984
accomodation- Precision for LogisticRegression(max_iter=1000) model, train set: 0.6470588235294118
accomodation- Precision for LogisticRegression(max_iter=1000) model, test set: 0.55
accomodation- Recall for LogisticRegression(max_iter=1000) model

In [214]:
model_performance_df = pd.DataFrame(model_performance_dict)
model_performance_df

Unnamed: 0,target,f1_score_train,f1_score_test,precision_train,precision_test,recall_train,recall_test
0,arrests,0.244303,0.272212,0.569002,0.66055,0.155543,0.171429
1,accomodation,0.099398,0.069841,0.647059,0.55,0.053834,0.037288
2,beatings,0.171731,0.155689,0.65,0.541667,0.098935,0.090909
3,crowddispersal,0.644623,0.624173,0.729954,0.718837,0.577155,0.551541
4,ignore,0.795409,0.783754,0.745848,0.732461,0.852026,0.842771
5,killings,0.241916,0.195876,0.612121,0.475,0.150746,0.123377
6,shootings,0.240938,0.2,0.594737,0.489362,0.15107,0.125683


In [215]:
mass['arrests'].value_counts(normalize=True)

0    0.858818
1    0.141182
Name: arrests, dtype: float64

In [216]:
mass['ignore'].value_counts(normalize=True)

1    0.543448
0    0.456552
Name: ignore, dtype: float64

In [217]:
mass['beatings'].value_counts(normalize=True)

0    0.947296
1    0.052704
Name: beatings, dtype: float64

In [218]:
mass['shootings'].value_counts(normalize=True)

0    0.938665
1    0.061335
Name: shootings, dtype: float64

In [219]:
mass['killings'].value_counts(normalize=True)

0    0.945714
1    0.054286
Name: killings, dtype: float64

In [220]:
mass['accomodation'].value_counts(normalize=True)

0    0.899796
1    0.100204
Name: accomodation, dtype: float64

In [221]:
mass['crowddispersal'].value_counts(normalize=True)

0    0.686541
1    0.313459
Name: crowddispersal, dtype: float64