# NLP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
stories = pd.read_csv("sel_hn_stories.csv", header = None, 
                      names = ["sub_time", "upvote", "url", "headlines"])

In [3]:
stories.head()

Unnamed: 0,sub_time,upvote,url,headlines
0,2014-06-24T05:50:40.000Z,1,flux7.com,8 Ways to Use Docker in the Real World
1,2010-02-17T16:57:59Z,1,blog.jonasbandi.net,Software: Sadly we did adopt from the construc...
2,2014-02-04T02:36:30Z,1,blogs.wsj.com,Google’s Stock Split Means More Control for L...
3,2011-10-26T07:11:29Z,1,threatpost.com,SSL DOS attack tool released exploiting negoti...
4,2011-04-03T15:43:44Z,67,algorithm.com.au,Immutability and Blocks Lambdas and Closures


In [4]:
stories.dropna(axis = 0, inplace = True)
stories.reset_index(inplace=True)

In [5]:
punctuation = [",", ":", ";", ".", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]

In [6]:
headlines = stories["headlines"]

In [7]:
headlines.head()

0               8 Ways to Use Docker in the Real World
1    Software: Sadly we did adopt from the construc...
2     Google’s Stock Split Means More Control for L...
3    SSL DOS attack tool released exploiting negoti...
4         Immutability and Blocks Lambdas and Closures
Name: headlines, dtype: object

In [8]:
headlines = headlines.str.lower()

In [9]:
headlines.head()

0               8 ways to use docker in the real world
1    software: sadly we did adopt from the construc...
2     google’s stock split means more control for l...
3    ssl dos attack tool released exploiting negoti...
4         immutability and blocks lambdas and closures
Name: headlines, dtype: object

In [10]:
"Muhammad Waqas".split()

['Muhammad', 'Waqas']

In [11]:
tokens = []

for i in headlines:
    tokens.append(i.split())

In [12]:
print(tokens)



In [13]:
# nested loop

lst = []
for i in tokens:
    clean = []
    for j in i:
        if j not in punctuation:
            clean.append(j)
    lst.append(clean)

print(lst)



In [14]:
import string

In [15]:
dir(string)

['Formatter',
 'Template',
 '_ChainMap',
 '_TemplateMetaclass',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_re',
 '_sentinel_dict',
 '_string',
 'ascii_letters',
 'ascii_lowercase',
 'ascii_uppercase',
 'capwords',
 'digits',
 'hexdigits',
 'octdigits',
 'printable',
 'punctuation',
 'whitespace']

In [16]:
unique = []

for i in lst:
    for j in i:
        unique.append(j)

print(unique[:10])

['8', 'ways', 'to', 'use', 'docker', 'in', 'the', 'real', 'world', 'software:']


In [17]:
for i in punctuation:
    print(i in unique)

False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [18]:
nlp = pd.DataFrame(data = 0, index = headlines.index, columns = unique)

In [19]:
nlp.head()

Unnamed: 0,8,ways,to,use,docker,in,the,real,world,software:,...,with,python,crowdsourcing,disaster,response,what,we,learn,from,japan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
for i, lst1 in enumerate(lst):
    for token in lst1:
        nlp.loc[i, token] += 1

In [21]:
nlp

Unnamed: 0,8,ways,to,use,docker,in,the,real,world,software:,...,with,python,crowdsourcing,disaster,response,what,we,learn,from,japan
0,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2799,0,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [22]:
count = nlp.sum(axis=0)
count

8          18
ways       12
to        476
use        24
docker      2
         ... 
what       62
we         27
learn       7
from       80
japan       3
Length: 20235, dtype: int64

In [23]:
boolean = (count > 5) & (count < 100)
boolean

8          True
ways       True
to        False
use        True
docker    False
          ...  
what       True
we         True
learn      True
from       True
japan     False
Length: 20235, dtype: bool

In [24]:
col = count[boolean].index
col

Index(['8', 'ways', 'use', 'real', 'world', 'we', 'from', 'more', 'control',
       'tool',
       ...
       'have', 'an', 'next', 'year', 'dropbox', 'python', 'what', 'we',
       'learn', 'from'],
      dtype='object', length=6771)

In [25]:
nlp.loc[:,col]

Unnamed: 0,8,8.1,8.2,8.3,8.4,8.5,8.6,8.7,8.8,8.9,...,from,from.1,from.2,from.3,from.4,from.5,from.6,from.7,from.8,from.9
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
nlp.columns.value_counts()

the              599
to               476
of               340
a                334
for              297
                ... 
marvel             1
determination      1
survivable         1
jarvis-like        1
tycoon             1
Length: 7610, dtype: int64

In [27]:
nlp.shape

(2801, 20235)