In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (accuracy_score, 
                            confusion_matrix, 
                            classification_report,
                            f1_score,
                            plot_confusion_matrix,
                            precision_recall_curve,
                            precision_score,
                            recall_score)

In [3]:
pwd

'/Users/emilynaftalin/Data_Science/General Assembly/dsi/projects/Mass-Protests/users'

In [4]:
mass = pd.read_pickle('../data/03_dummy_df.pickle')

In [5]:
mass.head(2)

Unnamed: 0,id,country,ccode,region,protestnumber,protesterviolence,location,protesteridentity,sources,notes,...,shootings,labor_wage_dispute,land_farm_issue,police_brutality,political_behavior_process,price increases_tax_policy,removal_of_politician,social_restrictions,start_date,end_date
0,201990001,Canada,20,North America,1,0.0,national,unspecified,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...,...,0,1,0,0,1,0,0,0,1990-01-15,1990-01-15
1,201990002,Canada,20,North America,2,0.0,"Montreal, Quebec",unspecified,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...,...,0,0,0,0,1,0,0,0,1990-06-25,1990-06-25


In [6]:
list(mass.columns)

['id',
 'country',
 'ccode',
 'region',
 'protestnumber',
 'protesterviolence',
 'location',
 'protesteridentity',
 'sources',
 'notes',
 'protester_id_type',
 'partipants_number',
 'protest_size_category',
 'pop_male',
 'pop_female',
 'pop_total',
 'pop_density',
 'prosperity_2020',
 'country_Afghanistan',
 'country_Albania',
 'country_Algeria',
 'country_Angola',
 'country_Argentina',
 'country_Armenia',
 'country_Austria',
 'country_Azerbaijan',
 'country_Bahrain',
 'country_Bangladesh',
 'country_Belarus',
 'country_Belgium',
 'country_Benin',
 'country_Bolivia',
 'country_Bosnia',
 'country_Botswana',
 'country_Brazil',
 'country_Bulgaria',
 'country_Burkina Faso',
 'country_Burundi',
 'country_Cambodia',
 'country_Cameroon',
 'country_Canada',
 'country_Cape Verde',
 'country_Central African Republic',
 'country_Chad',
 'country_Chile',
 'country_China',
 'country_Colombia',
 'country_Comoros',
 'country_Congo Brazzaville',
 'country_Congo Kinshasa',
 'country_Costa Rica',
 'coun

In [7]:
# dummifying protest_size and protester_id_type columns 
mass = pd.get_dummies(mass, columns=['protest_size_category', 'protester_id_type'], dummy_na=True)

Below, attempting a Logistic Regression for each state response, starting with `arrests`.

In [8]:
X = mass.drop(columns=['country','ccode','location','region','sources','notes', 'protesteridentity','start_date','end_date', 
                              'arrests','accomodation','beatings','crowddispersal','ignore','killings','shootings'])
y = mass['arrests']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lr = LogisticRegression(solver='lbfgs', max_iter=1000)

lr.fit(X_train_sc, y_train)

LogisticRegression(max_iter=1000)

In [9]:
lr.score(X_train_sc, y_train)

0.8637931034482759

In [10]:
lr.score(X_test_sc, y_test)

0.851888341543514

Now that I have created and evaluated the Logistic Regression model using the training and test set, I will scale the entire `X` set and make predictions for each row in the DataFrame. This will allow me to add two new columns: 
+ **`arrests_predicted`**: column of 0s and 1s indicating whether the protest is predicted to have arrests as one of the state responses. 
+ **`arrests_probability`**: the calculated probability that the protest will have arrests as one of the state responses

In [11]:
ss = StandardScaler()

X_sc = ss.fit_transform(X)

In [12]:
arrests_predicted_all = lr.predict(X_sc)

arrests_probability_all = lr.predict_proba(X_sc)[:,1]

In [13]:
mass['arrests_predicted'] = arrests_predicted_all

mass['arrests_probability'] = arrests_probability_all 

In [14]:
# checking out how it looks 
mass[['arrests', 'arrests_predicted','arrests_probability']][22:33]

Unnamed: 0,arrests,arrests_predicted,arrests_probability
22,0,0,0.379786
23,1,1,0.606608
24,0,0,0.209131
25,0,1,0.594836
26,0,0,0.162792
27,0,0,0.392477
28,0,0,0.120245
29,1,1,0.53606
30,1,0,0.472129
31,0,0,0.211884


In [15]:
mass.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15225 entries, 0 to 15224
Columns: 229 entries, id to arrests_probability
dtypes: datetime64[ns](2), float64(7), int32(180), int64(12), object(6), uint8(22)
memory usage: 14.0+ MB


I will create three functions: 

In [16]:
def state_response_predictor(model, df, features, target):
    
    X = features 
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

    ss = StandardScaler()

    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    model.fit(X_train_sc, y_train)
    
    y_preds = model.predict(X_test_sc)
    
    f1_score, precision, recall = evaluate_model(y_test, y_preds) 
    
    #confusion matrix
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    plot_confusion_matrix(grid, X_test_sc, y_test, cmap="Blues")  

    return

In [17]:
def evaluate_model(y_true, y_preds):
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    
    f1_score = f1_score(y_true, y_preds)
    
    precision = precision_score(y_true, y_preds)
    
    recall = recall_score(y_true, y_preds)

    return f1_score, precision, recall

In [18]:
def response_prediction_columns(model, df, features):
    
    ss = StandardScaler()
    
    X = features
    X_sc = ss.fit_transform(X)
    
    predicted_column = model.predict(X_sc)
    probability_column = model.predict_proba(X_sc)[:,1]
    
    return predicted_column, probability_column

# ss = StandardScaler()

# X_sc = ss.fit_transform(X)

# arrests_predicted_all = lr.predict(X_sc)

# arrests_probability_all = lr.predict_proba(X_sc)[:,1]

# mass['arrests_predicted'] = arrests_predicted_all

# mass['arrests_probability'] = arrests_probability_all 