In [2]:
import ast

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob

# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime
%matplotlib inline

In [2]:
df = pd.read_csv('./shooting_text_snippets.csv')

time: 510 ms


In [None]:
df.shape

In [None]:
def process_content(df, col):
    """Tokenize ``df[col]`` and derive POS tags, adjectives and cleaned lemmas.

    The frame is modified in place (new columns are added) and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text.
    col : str
        Name of the column in ``df`` that contains the text to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added: ``tokenized_words``,
        ``tokenized_stopped``, ``tagged_stopped``, ``tagged``, ``adjectives``
        and ``lemmatized``.
    """
    # A set makes the per-token membership test constant time.
    stop_words = set(stopwords.words('english'))

    # tokenization
    df['tokenized_words'] = df[col].apply(word_tokenize)

    # removing stop words
    df['tokenized_stopped'] = df['tokenized_words'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )

    # POS tagging (with and without stop words)
    df['tagged_stopped'] = df['tokenized_stopped'].apply(nltk.pos_tag)
    df['tagged'] = df['tokenized_words'].apply(nltk.pos_tag)

    # Selecting adjectives: every tag that starts with 'JJ'
    df['adjectives'] = df['tagged_stopped'].apply(
        lambda tagged: [word for word, pos in tagged if pos.startswith('JJ')]
    )

    # Lemmatization: lower-case first, and lemmatize as adjectives (pos='a');
    # the lemmatizer's default part of speech is noun, which leaves forms such
    # as comparatives and superlatives unreduced.
    lemmatizer = WordNetLemmatizer()
    lemmas = df['adjectives'].apply(
        lambda words: [lemmatizer.lemmatize(word.lower(), pos='a') for word in words]
    )

    # Cleaning the result: keep purely alphabetic lemmas only
    df['lemmatized'] = lemmas.apply(lambda words: [w for w in words if w.isalpha()])

    return df

df = process_content(df, 'snippet')

In [None]:
df.to_csv('./df_new_0410.csv')

In [None]:
# CSV stores list columns as their string repr. Parse 'lemmatized' back into real
# lists so later per-word steps iterate over words rather than characters.
df = pd.read_csv(
    './df_new_0410.csv',
    converters={'lemmatized': lambda cell: ast.literal_eval(cell) if cell else []},
)

In [None]:
# Move the current index into a column, then combine each show's 'lemmatized'
# values with Series.sum() so there is one entry per ia_show_id.
# NOTE(review): reset_index(inplace=True) mutates df every time this cell is run,
# so the cell is not idempotent.
df.reset_index(inplace=True)
df_grouped = pd.DataFrame(df.groupby('ia_show_id')['lemmatized'].apply(lambda x: x.sum()))
df_grouped.head()

In [None]:
# nltk.FreqDist(df_grouped[''])     ### this can be used later
# most_common(15)

In [None]:
def sent_score(word):
    """Return the TextBlob polarity of ``word`` weighted by its subjectivity."""
    sentiment = TextBlob(word).sentiment
    # Adding to 0 keeps a negative-zero product from being returned as -0.0.
    return 0 + sentiment.polarity * sentiment.subjectivity

In [None]:
# Score every word of each show, then total the word scores per show.
df_grouped['score'] = df_grouped['lemmatized'].apply(
    lambda words: list(map(sent_score, words))
)
df_grouped['sentiment'] = df_grouped['score'].apply(sum)

In [None]:
df_grouped.reset_index(inplace=True)

In [None]:
df_new = pd.read_csv('./grouped_sentiment_0410.csv')

### textblob for whole sentences

In [3]:
# Join each show's snippets with a space. Plain string summation glues the last
# word of one snippet to the first word of the next, which produces fused tokens
# downstream.
df_grouped = pd.DataFrame(
    df.groupby('ia_show_id')['snippet'].apply(
        lambda snippets: ' '.join(snippets.dropna().astype(str))
    )
)
df_grouped.head()

Unnamed: 0_level_0,snippet
ia_show_id,Unnamed: 1_level_1
BLOOMBERG_20140220_150000_Market_Makers,the population here in kiev is in a state of s...
BLOOMBERG_20140220_180000_Bloomberg_West,the death toll has risen to 64. are bloomberg ...
BLOOMBERG_20140221_130000_In_the_Loop_With_Betty_Liu,"for gm, this is a huge market. really for all ..."
BLOOMBERG_20140221_200000_Street_Smart_with_Trish_Regan_and_Adam_Johnson,these elements all play into just civil unrest...
BLOOMBERG_20140403_050000_Countdown,there is a strong data. it is setting up. than...


time: 626 ms


In [4]:
# Iterate over the column values directly instead of series[i] with an integer,
# which on a non-integer index only works through pandas' deprecated positional
# fallback.
text_blob = [TextBlob(snippet) for snippet in df_grouped['snippet']]

time: 416 ms


In [5]:
df_grouped['text_blob'] = text_blob

time: 1.73 ms


In [6]:
# Sentence split of every blob, in row order; iterating the values avoids
# integer lookups on a frame whose index may not be 0..n-1.
sent = [blob.sentences for blob in df_grouped['text_blob']]

time: 11.3 s


In [7]:
df_grouped['sentences'] = sent

time: 2.22 ms


In [8]:
df_grouped['sentiment'] = df_grouped['text_blob'].apply(lambda x: x.sentiment)

time: 23.4 s


In [9]:
# Polarity weighted by subjectivity for each show, in row order; iterating the
# values avoids integer lookups on a frame whose index may not be 0..n-1.
score = [s.polarity * s.subjectivity for s in df_grouped['sentiment']]

time: 131 ms


In [10]:
df_grouped['score'] = score
df_grouped.reset_index(inplace=True)

time: 3.21 ms


In [None]:
df_grouped.to_csv('./grouped_sentiment_0610.csv')

### removing locations

In [4]:
df_grouped = pd.read_csv('./grouped_sentiment_0610.csv')

time: 594 ms


In [12]:
def process_content(df, col):
    """Tokenize ``df[col]`` and POS-tag the tokens.

    Adds the columns ``tokenized_words`` and ``tagged`` to ``df`` in place and
    returns the same frame.

    NOTE(review): this redefines ``process_content`` from earlier in the
    notebook; after this cell runs, the earlier adjective/lemma pipeline is no
    longer reachable under this name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text.
    col : str
        Name of the column in ``df`` that contains the text to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    # tokenization
    df['tokenized_words'] = df[col].apply(word_tokenize)

    # POS tagging on the full token list (stop words are kept)
    df['tagged'] = df['tokenized_words'].apply(nltk.pos_tag)

    return df

df_ = process_content(df_grouped, 'snippet')

time: 3min 12s


In [14]:
df_['namedEnt'] = df_['tagged'].apply(lambda x: nltk.ne_chunk(x, binary=False))
df_['namedEnt'].head()

0    [(the, DT), (population, NN), (here, RB), (in,...
1    [(the, DT), (death, NN), (toll, NN), (has, VBZ...
2    [(for, IN), (gm, NN), (,, ,), (this, DT), (is,...
3    [(these, DT), (elements, NNS), (all, DT), (pla...
4    [(there, EX), (is, VBZ), (a, DT), (strong, JJ)...
Name: namedEnt, dtype: object

time: 19min 22s


In [16]:
df_.head()

Unnamed: 0.1,Unnamed: 0,ia_show_id,snippet,text_blob,sentences,score,sentiment,tokenized_words,tokenized_stopped,tagged_stopped,namedEnt,adjectives,tagged
0,0,BLOOMBERG_20140220_150000_Market_Makers,the population here in kiev is in a state of s...,the population here in kiev is in a state of s...,"[Sentence(""the population here in kiev is in a...",-0.061481,"Sentiment(polarity=-0.13333333333333333, subje...","[the, population, here, in, kiev, is, in, a, s...","[population, kiev, state, shock, ., never, exp...","[(population, NN), (kiev, NN), (state, NN), (s...","[(the, DT), (population, NN), (here, RB), (in,...","[last, chaotic, population.i, coming, speak, j...","[(the, DT), (population, NN), (here, RB), (in,..."
1,1,BLOOMBERG_20140220_180000_Bloomberg_West,the death toll has risen to 64. are bloomberg ...,the death toll has risen to 64. are bloomberg ...,"[Sentence(""the death toll has risen to 64. are...",0.058265,"Sentiment(polarity=0.1202020202020202, subject...","[the, death, toll, has, risen, to, 64., are, b...","[death, toll, risen, 64., bloomberg, news, rep...","[(death, NN), (toll, NN), (risen, VBP), (64., ...","[(the, DT), (death, NN), (toll, NN), (has, VBZ...","[bloomberg, live, central, latest, nearby, squ...","[(the, DT), (death, NN), (toll, NN), (has, VBZ..."
2,2,BLOOMBERG_20140221_130000_In_the_Loop_With_Bet...,"for gm, this is a huge market. really for all ...","for gm, this is a huge market. really for all ...","[Sentence(""for gm, this is a huge market.""), S...",0.073395,"Sentiment(polarity=0.16111111111111112, subjec...","[for, gm, ,, this, is, a, huge, market, ., rea...","[gm, ,, huge, market, ., really, big, carmaker...","[(gm, NN), (,, ,), (huge, JJ), (market, NN), (...","[(for, IN), (gm, NN), (,, ,), (this, DT), (is,...","[huge, big, biggest, main]","[(for, IN), (gm, NN), (,, ,), (this, DT), (is,..."
3,3,BLOOMBERG_20140221_200000_Street_Smart_with_Tr...,these elements all play into just civil unrest...,these elements all play into just civil unrest...,"[Sentence(""these elements all play into just c...",-0.076667,"Sentiment(polarity=-0.10000000000000002, subje...","[these, elements, all, play, into, just, civil...","[elements, play, civil, unrest, taking, street...","[(elements, NNS), (play, VBP), (civil, JJ), (u...","[(these, DT), (elements, NNS), (all, DT), (pla...","[civil, best, angry, violent, national, tear, ...","[(these, DT), (elements, NNS), (all, DT), (pla..."
4,4,BLOOMBERG_20140403_050000_Countdown,there is a strong data. it is setting up. than...,there is a strong data. it is setting up. than...,"[Sentence(""there is a strong data.""), Sentence...",0.016825,"Sentiment(polarity=0.03333333333333334, subjec...","[there, is, a, strong, data, ., it, is, settin...","[strong, data, ., setting, ., thank, ., top, h...","[(strong, JJ), (data, NNS), (., .), (setting, ...","[(there, EX), (is, VBZ), (a, DT), (strong, JJ)...","[strong, top, top, national, texas]","[(there, EX), (is, VBZ), (a, DT), (strong, JJ)..."


time: 263 ms


In [None]:
# is_adj = lambda pos: pos[:2].__contains__('JJ')

In [None]:
# - remove locations, then do adjective frequency
# - classification with sentence-level sentiment


##  Classification

In [None]:
df = pd.read_csv('./sentiments_with_race.csv')

In [None]:
# Put the per-show sentiment score side by side with the demographic columns.
# NOTE(review): an axis=1 concat aligns rows on the index, so this assumes
# df_grouped and df describe the same shows under the same index labels — confirm.
df_new = pd.concat([df_grouped[['ia_show_id', 'score']], df[['Race', 'Gender', 'Total victims']]], axis=1)

In [None]:
df = df_new.copy(deep = True)
df.rename(columns={'score':'sentiment'}, inplace=True)
df.head()

## Modeling

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

### preprocessing

In [None]:
# Normalizer rescales each *row* to unit L2 norm and needs 2-D input, so the
# whole sentiment column is passed as one row of shape (1, n_rows). The result is
# sentiment / ||sentiment||_2; row 0 of the output is that rescaled column.
normalizer = Normalizer()
score_scaled = normalizer.fit_transform(df['sentiment'].values.reshape(1, -1))
df['score_norm'] = score_scaled[0]

df['Gender'] = df['Gender'].str.replace('Male','M')
# df['Injured'] = df['Injured'].replace({r"[a-zA-Z]", ''}, regex=True)

In [None]:
# subset X and y
# X = df[['Fatalities', 'Injured', 'Total victims']]
# one_hot = pd.get_dummies(df[['Venue', 'Race', 'Gender']])

# X = pd.concat([X, one_hot], axis=1)
X = pd.get_dummies(df[['Race', 'Gender']])

In [None]:
y = df['score_norm']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=0, test_size=0.1)

In [None]:
train_scores = []
test_scores = []

# warm_start=True keeps the trees that are already fitted, so each fit() call only
# grows the forest up to the new n_estimators. A fixed random_state makes the
# train/test curves reproducible between runs.
rf = RandomForestRegressor(bootstrap=True, warm_start=True, random_state=0)
estimator_range = range(1, 100, 5)
for n_estimators in estimator_range:
    rf.n_estimators = n_estimators
    rf.fit(X_train, y_train)
    train_scores.append(rf.score(X_train, y_train))
    test_scores.append(rf.score(X_test, y_test))

In [None]:
# RandomForestRegressor.score returns the coefficient of determination (R^2),
# not an accuracy, so the axis is labelled accordingly.
plt.plot(estimator_range, test_scores, label="test scores")
plt.plot(estimator_range, train_scores, label="train scores")
plt.ylabel("R^2")
plt.xlabel("n_estimators")
plt.legend()

In [None]:
rf.feature_importances_

In [None]:
plt.barh(range(rf.feature_importances_.shape[0]), rf.feature_importances_)
plt.yticks(range(rf.feature_importances_.shape[0]), X.columns);

In [None]:
gbrt = GradientBoostingRegressor().fit(X_train, y_train)
gbrt.score(X_test, y_test)

In [None]:
# return_train_score=True makes cv_results_ include mean_train_score and
# std_train_score, which the plotting cell below reads.
param_grid = {'learning_rate': [.5, .2, .1, .05, .02, .01, .001]}
grid = GridSearchCV(
    GradientBoostingRegressor(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train);

In [None]:
scores = pd.DataFrame(grid.cv_results_)

scores.plot(x='param_learning_rate', y='mean_train_score', yerr='std_train_score', ax=plt.gca())
scores.plot(x='param_learning_rate', y='mean_test_score', yerr='std_test_score', ax=plt.gca())

In [None]:
# Partial-dependence plots for the six features with the highest importance in gbrt
# (np.argsort is ascending, so the last six indices are the most important).
# NOTE(review): sklearn.ensemble.partial_dependence appears to have been removed in
# newer scikit-learn releases in favour of sklearn.inspection — verify against the
# installed version before re-running.
from sklearn.ensemble.partial_dependence import plot_partial_dependence
fig, axs = plot_partial_dependence(gbrt, X_train, np.argsort(gbrt.feature_importances_)[-6:],
                                       feature_names=X.columns,
                                       n_jobs=3, grid_resolution=50)
plt.tight_layout()

### binary race white-non-white

In [None]:
# Recode Race as a binary string: '0' for White, '1' for every other listed label.
# The replacements are applied one after another in this order.
race_codes = (
    ('White', '0'),
    ('Native American', '1'),
    ('Latino', '1'),
    ('Other', '1'),
    ('Black', '1'),
    ('Asian', '1'),
)
for label, code in race_codes:
    df['Race'] = df['Race'].str.replace(label, code)

In [None]:
X = pd.get_dummies(df[['Race', 'Gender']])

In [None]:
y = df['score_norm']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=0, test_size=0.05)

### adjective distribution

In [None]:
df = pd.read_csv('./sentiments_with_race.csv')

In [None]:
# Mean sentiment of the rows whose Race is 'White', taken straight from df so this
# cell does not depend on df_white, which is only created in a later cell.
df.loc[df['Race'] == 'White', 'sentiment'].mean()

In [None]:
# 'lemmatized' arrives as the string repr of a list. Remove quotes, brackets and
# spaces in one pass, split on commas, then drop empty tokens so that an empty
# list ("[]") becomes [] instead of [''].
df['lemmatized'] = (
    df['lemmatized']
    .str.replace(r"[\[\]' ]", "", regex=True)
    .str.split(',')
    .apply(lambda words: [w for w in words if w] if isinstance(words, list) else words)
)

In [None]:
# Flatten a collection of word lists into one list, then report the number of
# distinct words (only the last expression of a cell is displayed).
# NOTE(review): `adj` is not defined anywhere in the visible notebook — presumably
# it is the list-valued df['lemmatized'] column; confirm before re-running.
all_words = [item for sublist in adj for item in sublist]
len(all_words)
len(set(all_words))

In [None]:
from collections import Counter
counts = Counter(all_words)

In [None]:
# Split the frame into White / non-White rows; reset_index() renumbers the rows
# from 0 and keeps the previous index as a column.
df_white = df[df['Race'] == 'White'].reset_index()
df_other = df[df['Race'] != 'White'].reset_index()



In [None]:
def adj_fred(df):
    """Count how often each word occurs across ``df['lemmatized']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``lemmatized`` column whose entries are lists of words.

    Returns
    -------
    nltk.FreqDist
        Frequency distribution over all words in the column.
    """
    # Iterate over the column values rather than df['lemmatized'][i]: the latter is
    # a label lookup and fails unless the index is exactly 0..n-1. Missing entries
    # are skipped.
    w_list = [word for words in df['lemmatized'].dropna() for word in words]
    return nltk.FreqDist(w_list)

In [None]:
white_list = adj_fred(df_white)

In [None]:
other_list = adj_fred(df_other)

In [None]:
# Two-column table (adjective, frequency) built from the White-group distribution.
white_key, white_values = list(white_list.keys()), list(white_list.values())

white = pd.DataFrame({'adj': white_key, 'freq': white_values})

In [None]:
# Two-column table (adjective, frequency) built from the non-White-group distribution.
other_key, other_values = list(other_list.keys()), list(other_list.values())

other = pd.DataFrame({'adj': other_key, 'freq': other_values})

In [None]:
other.to_csv('./other.csv')
white.to_csv('./white.csv')