# Count tokens

In [77]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [78]:
def tokenizer(s):
    return [xx.strip('()[];-\'`.,') for xx in s.split(' ') if xx.strip() != '']

In [79]:
df = pd.read_csv('./speeches.csv')
df.dropna(inplace=True)

In [80]:
df.head()

Unnamed: 0,date,speaker,text
0,debates2010-01-05a.xml,Philip Hollobone,May I suggest to the Secretary of State that p...
1,debates2010-01-05a.xml,Philip Hollobone,"""If you commit a crime you will be deported fr..."
2,debates2010-01-05a.xml,Jack Straw,I am happy to seek to provide the hon. Gentlem...
3,debates2010-01-05a.xml,Jack Straw,What we want to see is prison made effective. ...
4,debates2010-01-05a.xml,David Miliband,We should certainly have a foreign policy that...


In [82]:
df['dateonly'] = df.date.apply(lambda x: x.lstrip('debates').rstrip('ab.xml'))
df['month'] = df['dateonly'].apply(lambda x: x[:7])

In [83]:
df.head()

Unnamed: 0,date,speaker,text,dateonly,month
0,debates2010-01-05a.xml,Philip Hollobone,May I suggest to the Secretary of State that p...,2010-01-05,2010-01
1,debates2010-01-05a.xml,Philip Hollobone,"""If you commit a crime you will be deported fr...",2010-01-05,2010-01
2,debates2010-01-05a.xml,Jack Straw,I am happy to seek to provide the hon. Gentlem...,2010-01-05,2010-01
3,debates2010-01-05a.xml,Jack Straw,What we want to see is prison made effective. ...,2010-01-05,2010-01
4,debates2010-01-05a.xml,David Miliband,We should certainly have a foreign policy that...,2010-01-05,2010-01


In [84]:
bymonth = df.groupby('month').agg(lambda x: ' '.join(x))

In [85]:
bymonth.head()

Unnamed: 0_level_0,date,speaker,text,dateonly
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01,debates2010-01-05a.xml debates2010-01-05a.xml ...,Philip Hollobone Philip Hollobone Jack Straw J...,May I suggest to the Secretary of State that p...,2010-01-05 2010-01-05 2010-01-05 2010-01-05 20...
2010-02,debates2010-02-01a.xml debates2010-02-01a.xml ...,Eric Illsley Yvette Cooper Eric Illsley Yvette...,What steps she is taking to improve telephone ...,2010-02-01 2010-02-01 2010-02-01 2010-02-01 20...
2010-03,debates2010-03-01a.xml debates2010-03-01a.xml ...,John Grogan Ben Bradshaw Ben Bradshaw John Gro...,When he expects to respond to the David Davies...,2010-03-01 2010-03-01 2010-03-01 2010-03-01 20...
2010-04,debates2010-04-06a.xml debates2010-04-06a.xml ...,Vera Baird Vera Baird Vera Baird Vera Baird Ve...,"With the leave of the House, Mr. Speaker, I sh...",2010-04-06 2010-04-06 2010-04-06 2010-04-06 20...
2010-05,debates2010-05-18a.xml debates2010-05-18a.xml ...,Malcolm Rifkind Malcolm Rifkind Malcolm Rifkin...,"I beg to move, That John Bercow do take the Ch...",2010-05-18 2010-05-18 2010-05-18 2010-05-18 20...


In [91]:
vec = CountVectorizer(ngram_range=(3,3), stop_words='english', tokenizer=tokenizer)
vec = vec.fit(bymonth.text)

In [88]:
def top_n_words(text, vec, n=5, whitelist=None):
    trans = vec.transform([text])
    sortidx = np.argsort(trans.toarray()[0])
    
    sorted_counts = trans.toarray()[0][sortidx]
    sorted_tokens = np.array(vec.get_feature_names())[sortidx]
    
    df = pd.DataFrame(data={
        'token': sorted_tokens,
        'count': sorted_counts
    })
    if whitelist is not None:
        df['interesting'] = df.token.apply(lambda x: any([wl in x for wl in whitelist]))
    else:
        df['interesting'] = 1
    return df[df.interesting & (df['count'] > 0)].sort_values('count', ascending=False)

In [93]:
whitelist = [str(x) for x in range(10)]

In [None]:
top_n_words(bymonth['text'][0], vec, whitelist=whitelist)

# --------------

In [2]:
with open('./1day.txt', 'r') as f:
    txt = f.read()

In [5]:
def tokenizer(s):
    return [xx.strip('()[];-\'`.,') for xx in s.split(' ') if xx.strip() != '']

In [6]:
import pandas as pd

In [57]:
vec = CountVectorizer(ngram_range=(3,5), stop_words='english', tokenizer=tokenizer)
vec = vec.fit([txt])


# Full CSV

In [9]:
df = pd.read_csv('./speeches.csv')
df.dropna(inplace=True)

In [73]:
len(df)

1088983

In [21]:
df['dateonly'] = df.date.apply(lambda x: x.lstrip('debates').rstrip('ab.xml'))
df['month'] = df['dateonly'].str.in

In [23]:
daytext = df.groupby('dateonly').agg(lambda x: ' '.join(x))

In [26]:
daytext.head()

Unnamed: 0_level_0,date,speaker,text
dateonly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-05,debates2010-01-05a.xml debates2010-01-05a.xml ...,Philip Hollobone Philip Hollobone Jack Straw J...,May I suggest to the Secretary of State that p...
2010-01-06,debates2010-01-06a.xml debates2010-01-06a.xml ...,Hon. Members Hon. Members Albert Owen Albert O...,Object. Second Reading deferred until A happ...
2010-01-07,debates2010-01-07a.xml debates2010-01-07a.xml ...,Frank Dobson Frank Dobson Frank Dobson Frank D...,"I think that we needed that explanation, and I..."
2010-01-07c,debates2010-01-07c.xml debates2010-01-07c.xml ...,Edward Miliband Edward Miliband David Drew Edw...,"Obviously, we will look at any proposals that ..."
2010-01-07d,debates2010-01-07d.xml debates2010-01-07d.xml ...,Hon. Members Hon. Members Hon. Members Hon. Me...,Object. Bill to be read the Third time on Ob...


In [11]:
df.head()

Unnamed: 0,date,speaker,text
0,debates2010-01-05a.xml,Philip Hollobone,May I suggest to the Secretary of State that p...
1,debates2010-01-05a.xml,Philip Hollobone,"""If you commit a crime you will be deported fr..."
2,debates2010-01-05a.xml,Jack Straw,I am happy to seek to provide the hon. Gentlem...
3,debates2010-01-05a.xml,Jack Straw,What we want to see is prison made effective. ...
4,debates2010-01-05a.xml,David Miliband,We should certainly have a foreign policy that...


In [17]:
vec = TfidfVectorizer(ngram_range=(3,3), stop_words='english', tokenizer=tokenizer)
vec = vec.fit(df.text[::10])

In [43]:
important_words = [str(x) for x in range(10)] + ['increas', '%', 'decreas']

In [67]:
top_n_words(daytext['text'][134], ['a', 'e', 'i', 'o', 'u'], vec)

Unnamed: 0,count,token,interesting
45656,51,right hon friend,True
45655,35,right hon gentleman,True
45654,8,hon friend secretary state,True
45653,8,right hon friend secretary,True
45652,8,friend secretary state,True
45651,8,hon friend secretary,True
45650,8,right hon friend secretary state,True
45648,7,previous labour government,True
45649,7,right hon hon,True
45647,5,hon friend agree,True
