In [1]:
import ast

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob

# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime
%matplotlib inline

In [2]:
df = pd.read_csv('./shooting_text_snippets.csv')

time: 504 ms


In [None]:
df.shape

In [None]:
def process_content(df, col):
    """Tokenize a text column and derive cleaned, lemmatized adjectives per row.

    The following columns are added to ``df`` in place:

    - ``tokenized_words``: ``word_tokenize`` output for ``df[col]``
    - ``tokenized_stopped``: tokens that are not NLTK English stop words
    - ``tagged_stopped`` / ``tagged``: ``nltk.pos_tag`` output with / without
      stop-word removal
    - ``adjectives``: words from ``tagged_stopped`` whose tag starts with ``JJ``
    - ``lemmatized``: lower-cased, alphabetic-only adjective lemmas

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is mutated.
    col : str
        Name of the column containing the raw text.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns listed above added.
    """
    # A set gives O(1) membership tests; the comparison itself stays case-sensitive.
    stop_words = set(stopwords.words('english'))

    # tokenization
    df['tokenized_words'] = df[col].apply(word_tokenize)

    # removing stop words
    df['tokenized_stopped'] = df['tokenized_words'].apply(
        lambda tokens: [token for token in tokens if token not in stop_words]
    )

    # POS tagging
    df['tagged_stopped'] = df['tokenized_stopped'].apply(lambda tokens: nltk.pos_tag(tokens))
    df['tagged'] = df['tokenized_words'].apply(lambda tokens: nltk.pos_tag(tokens))

    # Selecting adjectives (tags starting with 'JJ')
    df['adjectives'] = df['tagged_stopped'].apply(
        lambda tagged: [word for (word, pos) in tagged if pos.startswith('JJ')]
    )

    # Lemmatization: lower-case first (the previous `.str.lower()` call discarded
    # its result, so nothing was ever lower-cased) and lemmatize as adjectives
    # (pos='a') instead of the default noun POS.
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df['adjectives'].apply(
        lambda words: [lemmatizer.lemmatize(word.lower(), pos='a') for word in words]
    )

    # Cleaning the result: keep purely alphabetic lemmas only
    df['lemmatized'] = df['lemmatized'].apply(lambda words: [w for w in words if w.isalpha()])

    return df

df = process_content(df, 'snippet')

In [None]:
df.to_csv('./df_new_0410.csv')

In [26]:
df = pd.read_csv('./df_new_0410.csv')

time: 2.3 s


In [None]:
df.reset_index(inplace=True)
# df was written with to_csv and read back, so each 'lemmatized' cell is now the
# *text* of a list ("['big', 'red']"). Parse it back into a real list first;
# otherwise grouping concatenates strings and later cells iterate characters.
lemmas = df['lemmatized'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
# One flat list of adjectives per show.
df_grouped = pd.DataFrame(
    lemmas.groupby(df['ia_show_id']).apply(
        lambda lists: [word for words in lists for word in words]
    )
)
df_grouped.head()

In [None]:
# nltk.FreqDist(df_grouped[''])     ### this can be used later
# most_common(15)

In [None]:
def sent_score(word):
    """Return TextBlob polarity weighted by subjectivity for ``word``.

    Parameters
    ----------
    word : str
        Text handed to ``TextBlob``.

    Returns
    -------
    float
        ``polarity * subjectivity`` of the text's sentiment.
    """
    sentiment = TextBlob(word).sentiment
    return sentiment.polarity * sentiment.subjectivity

In [None]:
# Score every adjective of a show individually, then total the per-word scores.
df_grouped['score'] = df_grouped['lemmatized'].apply(
    lambda words: list(map(sent_score, words))
)
df_grouped['sentiment'] = df_grouped['score'].apply(lambda word_scores: sum(word_scores))

In [None]:
df_grouped.reset_index(inplace=True)

In [None]:
df_new = pd.read_csv('./grouped_sentiment_0410.csv')

### textblob for whole sentences

In [None]:
# Join each show's snippets with a space. Plain concatenation (``x.sum()``)
# glues the last word of one snippet to the first word of the next, which
# corrupts word and sentence boundaries for TextBlob.
df_grouped = pd.DataFrame(df.groupby('ia_show_id')['snippet'].apply(' '.join))
df_grouped.head()

In [None]:
# Build one TextBlob per show by iterating the column's values directly.
# df_grouped is indexed by 'ia_show_id', so integer lookups like
# df_grouped['snippet'][i] depend on a deprecated positional fallback.
text_blob = [TextBlob(snippet) for snippet in df_grouped['snippet']]

In [None]:
df_grouped['text_blob'] = text_blob

In [None]:
# Sentence list of every blob, in row order. Iterating the values avoids
# integer-label lookups on the 'ia_show_id' index.
sent = [blob.sentences for blob in df_grouped['text_blob']]

In [None]:
df_grouped['sentences'] = sent

In [None]:
df_grouped['sentiment'] = df_grouped['text_blob'].apply(lambda x: x.sentiment)

In [None]:
# Polarity weighted by subjectivity for each show, in row order. Iterating the
# values avoids integer-label lookups on the 'ia_show_id' index.
score = [
    sentiment.polarity * sentiment.subjectivity
    for sentiment in df_grouped['sentiment']
]

In [None]:
df_grouped['score'] = score
df_grouped.reset_index(inplace=True)

In [None]:
df_grouped.to_csv('./grouped_sentiment_0610.csv')

###  Classification

In [None]:
df = pd.read_csv('./sentiments_with_race.csv')

In [None]:
# Put show id + sentiment score side by side with the shooter/victim attributes.
# NOTE(review): concat(axis=1) pairs rows by index label, not by 'ia_show_id';
# this presumably relies on both frames having the same row order - confirm,
# or merge on an explicit key instead.
df_new = pd.concat([df_grouped[['ia_show_id', 'score']], df[['Race', 'Gender', 'Total victims']]], axis=1)

In [None]:
# Continue with an independent copy of df_new in which 'score' is called 'sentiment'.
df = df_new.rename(columns={'score': 'sentiment'})
df.head()

## Modeling

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

### preprocessing

In [None]:
# Normalizer rescales each *row* to unit norm and needs 2-D input, so the
# sentiment column is passed as one row of shape (1, n_rows): the whole column
# is scaled to unit L2 norm. Recent scikit-learn versions raise on the bare
# 1-D Series that was passed before.
normalizer = Normalizer()
score_scaled = normalizer.fit_transform(df['sentiment'].values.reshape(1, -1))
df['score_norm'] = score_scaled[0]

df['Gender'] = df['Gender'].str.replace('Male','M')
# df['Injured'] = df['Injured'].replace({r"[a-zA-Z]", ''}, regex=True)

In [None]:
# subset X and y
# X = df[['Fatalities', 'Injured', 'Total victims']]
# one_hot = pd.get_dummies(df[['Venue', 'Race', 'Gender']])

# X = pd.concat([X, one_hot], axis=1)
X = pd.get_dummies(df[['Race', 'Gender']])

In [None]:
y = df['score_norm']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=0, test_size=0.1)

In [None]:
train_scores = []
test_scores = []

# warm_start=True keeps the trees that are already fitted, so each fit only
# adds the estimators needed to reach the new n_estimators.
# random_state pins the bootstrap / feature sampling so the curve is reproducible.
rf = RandomForestRegressor(bootstrap=True, warm_start=True, random_state=0)
estimator_range = range(1, 100, 5)
for n_estimators in estimator_range:
    rf.set_params(n_estimators=n_estimators)
    rf.fit(X_train, y_train)
    # .score() of a regressor is R^2
    train_scores.append(rf.score(X_train, y_train))
    test_scores.append(rf.score(X_test, y_test))

In [None]:
# The plotted values come from RandomForestRegressor.score, i.e. R^2 - not accuracy.
plt.plot(estimator_range, test_scores, label="test scores")
plt.plot(estimator_range, train_scores, label="train scores")
plt.ylabel("R^2")
plt.xlabel("n_estimators")
plt.legend();

In [None]:
rf.feature_importances_

In [None]:
# Horizontal bar per feature, labelled with the dummy-encoded column names.
importances = rf.feature_importances_
bar_positions = range(importances.shape[0])
plt.barh(bar_positions, importances)
plt.yticks(bar_positions, X.columns);

In [None]:
# Baseline gradient-boosting fit; random_state pins the estimator's internal
# randomness so the held-out R^2 below is repeatable between runs.
gbrt = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
gbrt.score(X_test, y_test)

In [None]:
# 10-fold grid search over the boosting learning rate.
param_grid = {'learning_rate': [.5, .2, .1, .05, .02, .01, .001]}
# return_train_score=True: current scikit-learn leaves the *_train_score
# columns out of cv_results_ unless asked, and the plot of
# 'mean_train_score' / 'std_train_score' further down needs them.
grid = GridSearchCV(
    GradientBoostingRegressor(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train);

In [None]:
scores = pd.DataFrame(grid.cv_results_)

# Train and validation curves (mean with std error bars) share the current axes.
ax = plt.gca()
for split in ('train', 'test'):
    scores.plot(
        x='param_learning_rate',
        y=f'mean_{split}_score',
        yerr=f'std_{split}_score',
        ax=ax,
    )

In [None]:
# NOTE(review): `sklearn.ensemble.partial_dependence` only exists in older
# scikit-learn releases; newer ones provide partial-dependence plotting from
# `sklearn.inspection` - confirm against the installed version.
from sklearn.ensemble.partial_dependence import plot_partial_dependence
# Plot partial dependence for the six features with the largest gbrt importances
# (argsort is ascending, so the last six indices are the top six).
fig, axs = plot_partial_dependence(gbrt, X_train, np.argsort(gbrt.feature_importances_)[-6:],
                                       feature_names=X.columns,
                                       n_jobs=3, grid_resolution=50)
plt.tight_layout()

### binary race white-non-white

In [None]:
# Collapse Race to a binary code: 'White' -> '0', every other listed group -> '1'.
# Replacements are substring-based and applied in the order given here.
race_codes = {
    'White': '0',
    'Native American': '1',
    'Latino': '1',
    'Other': '1',
    'Black': '1',
    'Asian': '1',
}
for race_label, race_code in race_codes.items():
    df['Race'] = df['Race'].str.replace(race_label, race_code)

In [None]:
X = pd.get_dummies(df[['Race', 'Gender']])

In [None]:
y = df['score_norm']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=0, test_size=0.05)

### adjective distribution

In [138]:
df = pd.read_csv('./sentiments_with_race.csv')

time: 105 ms


In [141]:
# Mean sentiment over the rows whose Race is 'White'.
# NOTE(review): df_white is only created in a later cell
# (df[df['Race'] == 'White']), so this cell fails on a fresh top-to-bottom run.
df_white.sentiment.mean()

1.5745711653373082

time: 3.55 ms


In [142]:
# 'lemmatized' is read from CSV as the text of a list (e.g. "['big', 'red']").
# Extract the alphabetic words directly from that text. Compared with stripping
# quotes/brackets/spaces and splitting on ',', an empty list "[]" now becomes []
# instead of [''], so no empty-string "adjective" leaks into the counts.
df['lemmatized'] = df['lemmatized'].str.findall(r"[^\W\d_]+")

time: 92.2 ms


In [99]:
# Flatten the per-row word lists into a single list of words.
# NOTE(review): `adj` is not defined anywhere in the visible notebook -
# presumably the column of per-row adjective lists (e.g. df['lemmatized']); confirm.
all_words = [item for sublist in adj for item in sublist]
len(all_words)  # result is discarded: only the cell's last expression is displayed
len(set(all_words))

11404

time: 42.8 ms


In [100]:
from collections import Counter
counts = Counter(all_words)

time: 44.6 ms


In [192]:
# Split rows on the raw Race label. Each subset gets a fresh 0..n-1 index and
# keeps its previous index as an 'index' column.
is_white = df['Race'] == 'White'
df_white = df[is_white].reset_index()
df_other = df[~is_white].reset_index()



time: 9.47 ms


In [189]:
def adj_fred(df):
    """Count how often each word occurs across ``df['lemmatized']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``lemmatized`` column whose cells are lists of words.
        Cells that are not lists/tuples (e.g. NaN) are skipped.

    Returns
    -------
    nltk.FreqDist
        Word -> number of occurrences over all rows.
    """
    # Iterate the column's values instead of df['lemmatized'][i] for i in
    # range(n): that lookup is label-based and fails unless the index is 0..n-1.
    words = [
        word
        for row_words in df['lemmatized']
        if isinstance(row_words, (list, tuple))
        for word in row_words
    ]
    return nltk.FreqDist(words)

time: 3.87 ms


In [190]:
white_list = adj_fred(df_white)

time: 28.3 ms


In [193]:
other_list = adj_fred(df_other)

time: 91.3 ms


In [203]:
# Tabulate the frequency distribution as two parallel columns: word and count.
white_key = [*white_list.keys()]
white_values = [*white_list.values()]

white = pd.DataFrame({'adj': white_key, 'freq': white_values})

time: 1.73 ms


In [208]:
# Tabulate the frequency distribution as two parallel columns: word and count.
other_key = [*other_list.keys()]
other_values = [*other_list.values()]

other = pd.DataFrame({'adj': other_key, 'freq': other_values})

time: 12.1 ms


In [210]:
other.to_csv('./other.csv')
white.to_csv('./white.csv')

time: 49.9 ms
