In [10]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob

# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 57.5 ms


In [2]:
df = pd.read_csv('./shooting_text_snippets.csv', index_col='ia_show_id')

time: 579 ms


In [3]:
df.shape

(34081, 7)

time: 15 ms


In [5]:
def process_content(df, col):
    
    stop_words = stopwords.words('english')

    # tokenization
    df['tokenized_words'] = df[col].apply(word_tokenize)
    
    # removing stop words
    df['tokenized_stopped'] = df['tokenized_words'].apply(lambda x: [item for item in x if item not in stop_words])
    
    # POS tagging
    df['tagged_stopped'] = df['tokenized_stopped'].apply(lambda x: nltk.pos_tag(x))
    df['tagged'] = df['tokenized_words'].apply(lambda x: nltk.pos_tag(x))

    # Selecting adjectives
    is_adj = lambda pos: pos[:2].__contains__('JJ')
    df['adjectives'] = df['tagged_stopped'].apply(lambda x: [word for (word, pos) in x if is_adj(pos)])
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df['adjectives'].apply(lambda x: [lemmatizer.lemmatize(item) for item in x])
    
    # Cleaning the result
    df['lemmatized'].str.lower()
    df['lemmatized'] = df['lemmatized'].apply(lambda x: [w for w in x if w.isalpha()])

    return df

df = process_content(df, 'snippet')

time: 5min 43s


In [6]:
df.head()

Unnamed: 0_level_0,preview_url,date,station,show,show_date,preview_thumb,snippet,tokenized_words,tokenized_stopped,tagged_stopped,tagged,adjectives,lemmatized
ia_show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
FOXNEWSW_20171002_190000_Shepard_Smith_Reporting,https://archive.org/details/FOXNEWSW_20171002_...,2017-10-02T19:32:03Z,FOX News,Shepard Smith Reporting,2017-10-02T19:00:00Z,https://archive.org/download/FOXNEWSW_20171002...,shepard: thank you. a live look at the place w...,"[shepard, :, thank, you, ., a, live, look, at,...","[shepard, :, thank, ., live, look, place, happ...","[(shepard, NN), (:, :), (thank, NN), (., .), (...","[(shepard, NN), (:, :), (thank, NN), (you, PRP...","[live, quiet, las, hear, much]","[live, quiet, la, hear, much]"
BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T08:45:32Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,"francine: they kill, thank you so much -- they...","[francine, :, they, kill, ,, thank, you, so, m...","[francine, :, kill, ,, thank, much, --, kill, ...","[(francine, NN), (:, :), (kill, NN), (,, ,), (...","[(francine, NN), (:, :), (they, PRP), (kill, V...","[much, latest, understand, dead, injured]","[much, latest, understand, dead, injured]"
FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:42:51Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,"...and ask about saving $1,000 on your walk-in...","[..., and, ask, about, saving, $, 1,000, on, y...","[..., ask, saving, $, 1,000, walk-in, bath, ,,...","[(..., :), (ask, NN), (saving, VBG), ($, $), (...","[(..., :), (and, CC), (ask, VB), (about, IN), ...","[walk-in, eastern, right, vegas]","[eastern, right, vega]"
BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T09:02:10Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,[no audio] the las vegas share says off-duty ...,"[[, no, audio, ], the, las, vegas, share, says...","[[, audio, ], las, vegas, share, says, off-dut...","[([, JJ), (audio, NN), (], NNP), (las, VBZ), (...","[([, JJ), (no, DT), (audio, JJ), (], VBZ), (th...","[[, off-duty, mccarran, least, injured, outdoo...","[mccarran, least, injured, outdoor, global, po..."
FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:36:41Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,still congress is more than frustrating. i am ...,"[still, congress, is, more, than, frustrating,...","[still, congress, frustrating, ., furious, ., ...","[(still, RB), (congress, JJ), (frustrating, VB...","[(still, RB), (congress, NN), (is, VBZ), (more...","[congress, furious, deadliest, u.s.]","[congress, furious, deadliest]"


time: 151 ms


In [17]:
df.to_csv('./df_new_0410.csv')

time: 10.7 s


In [8]:
df.reset_index(inplace=True)
df_grouped = pd.DataFrame(df.groupby('ia_show_id')['lemmatized'].apply(lambda x: x.sum()))
df_grouped.head()

time: 809 ms


In [9]:
# df_grouped['lemmatized'].str.lower()
# df_grouped['lemmatized'] = df_grouped['lemmatized'].apply(lambda x: [w for w in x if w.isalpha()])

# nltk.FreqDist(df_grouped[''])     ### this can be used later
# most_common(15)

Unnamed: 0_level_0,lemmatized
ia_show_id,Unnamed: 1_level_1
BLOOMBERG_20140220_150000_Market_Makers,"[last, chaotic, coming, speak, jeffrey, ukrain..."
BLOOMBERG_20140220_180000_Bloomberg_West,"[bloomberg, live, central, latest, nearby, squ..."
BLOOMBERG_20140221_130000_In_the_Loop_With_Betty_Liu,"[huge, big, biggest, main]"
BLOOMBERG_20140221_200000_Street_Smart_with_Trish_Regan_and_Adam_Johnson,"[civil, best, angry, violent, national, tear, ..."
BLOOMBERG_20140403_050000_Countdown,"[strong, top, top, national, texas]"


time: 116 ms


In [11]:
def sent_score(word):
    score = 0
    a = TextBlob(word).sentiment
    score += (a.polarity * a.subjectivity)
    return score

time: 3.23 ms


In [12]:
df_grouped['score'] = df_grouped['lemmatized'].apply(lambda x: [sent_score(item) for item in x])
df_grouped['sentiment'] = df_grouped['score'].apply(lambda x: sum(x))

time: 12.9 s


In [13]:
df_grouped.to_csv('./grouped_sentiment_0410.csv', index=False)

time: 287 ms


In [14]:
df.head()

Unnamed: 0,ia_show_id,preview_url,date,station,show,show_date,preview_thumb,snippet,tokenized_words,tokenized_stopped,tagged_stopped,tagged,adjectives,lemmatized
0,FOXNEWSW_20171002_190000_Shepard_Smith_Reporting,https://archive.org/details/FOXNEWSW_20171002_...,2017-10-02T19:32:03Z,FOX News,Shepard Smith Reporting,2017-10-02T19:00:00Z,https://archive.org/download/FOXNEWSW_20171002...,shepard: thank you. a live look at the place w...,"[shepard, :, thank, you, ., a, live, look, at,...","[shepard, :, thank, ., live, look, place, happ...","[(shepard, NN), (:, :), (thank, NN), (., .), (...","[(shepard, NN), (:, :), (thank, NN), (you, PRP...","[live, quiet, las, hear, much]","[live, quiet, la, hear, much]"
1,BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T08:45:32Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,"francine: they kill, thank you so much -- they...","[francine, :, they, kill, ,, thank, you, so, m...","[francine, :, kill, ,, thank, much, --, kill, ...","[(francine, NN), (:, :), (kill, NN), (,, ,), (...","[(francine, NN), (:, :), (they, PRP), (kill, V...","[much, latest, understand, dead, injured]","[much, latest, understand, dead, injured]"
2,FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:42:51Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,"...and ask about saving $1,000 on your walk-in...","[..., and, ask, about, saving, $, 1,000, on, y...","[..., ask, saving, $, 1,000, walk-in, bath, ,,...","[(..., :), (ask, NN), (saving, VBG), ($, $), (...","[(..., :), (and, CC), (ask, VB), (about, IN), ...","[walk-in, eastern, right, vegas]","[eastern, right, vega]"
3,BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T09:02:10Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,[no audio] the las vegas share says off-duty ...,"[[, no, audio, ], the, las, vegas, share, says...","[[, audio, ], las, vegas, share, says, off-dut...","[([, JJ), (audio, NN), (], NNP), (las, VBZ), (...","[([, JJ), (no, DT), (audio, JJ), (], VBZ), (th...","[[, off-duty, mccarran, least, injured, outdoo...","[mccarran, least, injured, outdoor, global, po..."
4,FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:36:41Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,still congress is more than frustrating. i am ...,"[still, congress, is, more, than, frustrating,...","[still, congress, frustrating, ., furious, ., ...","[(still, RB), (congress, JJ), (frustrating, VB...","[(still, RB), (congress, NN), (is, VBZ), (more...","[congress, furious, deadliest, u.s.]","[congress, furious, deadliest]"


time: 189 ms


In [15]:
df['score'] = df['lemmatized'].apply(lambda x: [sent_score(item) for item in x])
df['sentiment'] = df['score'].apply(lambda x: sum(x))

time: 15.4 s


In [16]:
df.head()

Unnamed: 0,ia_show_id,preview_url,date,station,show,show_date,preview_thumb,snippet,tokenized_words,tokenized_stopped,tagged_stopped,tagged,adjectives,lemmatized,score,sentiment
0,FOXNEWSW_20171002_190000_Shepard_Smith_Reporting,https://archive.org/details/FOXNEWSW_20171002_...,2017-10-02T19:32:03Z,FOX News,Shepard Smith Reporting,2017-10-02T19:00:00Z,https://archive.org/download/FOXNEWSW_20171002...,shepard: thank you. a live look at the place w...,"[shepard, :, thank, you, ., a, live, look, at,...","[shepard, :, thank, ., live, look, place, happ...","[(shepard, NN), (:, :), (thank, NN), (., .), (...","[(shepard, NN), (:, :), (thank, NN), (you, PRP...","[live, quiet, las, hear, much]","[live, quiet, la, hear, much]","[0.06818181818181818, 0.0, 0.0, 0.0, 0.0400000...",0.108182
1,BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T08:45:32Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,"francine: they kill, thank you so much -- they...","[francine, :, they, kill, ,, thank, you, so, m...","[francine, :, kill, ,, thank, much, --, kill, ...","[(francine, NN), (:, :), (kill, NN), (,, ,), (...","[(francine, NN), (:, :), (they, PRP), (kill, V...","[much, latest, understand, dead, injured]","[much, latest, understand, dead, injured]","[0.04000000000000001, 0.45, 0.0, -0.0800000000...",0.41
2,FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:42:51Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,"...and ask about saving $1,000 on your walk-in...","[..., and, ask, about, saving, $, 1,000, on, y...","[..., ask, saving, $, 1,000, walk-in, bath, ,,...","[(..., :), (ask, NN), (saving, VBG), ($, $), (...","[(..., :), (and, CC), (ask, VB), (about, IN), ...","[walk-in, eastern, right, vegas]","[eastern, right, vega]","[0.0, 0.1530612244897959, 0.0]",0.153061
3,BLOOMBERG_20171002_080000_Bloomberg_Surveillance,https://archive.org/details/BLOOMBERG_20171002...,2017-10-02T09:02:10Z,Bloomberg,Bloomberg Surveillance,2017-10-02T08:00:00Z,https://archive.org/download/BLOOMBERG_2017100...,[no audio] the las vegas share says off-duty ...,"[[, no, audio, ], the, las, vegas, share, says...","[[, audio, ], las, vegas, share, says, off-dut...","[([, JJ), (audio, NN), (], NNP), (las, VBZ), (...","[([, JJ), (no, DT), (audio, JJ), (], VBZ), (th...","[[, off-duty, mccarran, least, injured, outdoo...","[mccarran, least, injured, outdoor, global, po...","[0.0, -0.12, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.12
4,FBC_20171002_130000_Varney__Company,https://archive.org/details/FBC_20171002_13000...,2017-10-02T13:36:41Z,FOX Business,Varney Company,2017-10-02T13:00:00Z,https://archive.org/download/FBC_20171002_1300...,still congress is more than frustrating. i am ...,"[still, congress, is, more, than, frustrating,...","[still, congress, frustrating, ., furious, ., ...","[(still, RB), (congress, JJ), (frustrating, VB...","[(still, RB), (congress, NN), (is, VBZ), (more...","[congress, furious, deadliest, u.s.]","[congress, furious, deadliest]","[0.0, 0.0, 0.0]",0.0


time: 177 ms
