# -----------------------------------------------------

#             Text Processing Methods

# -----------------------------------------------------


In [68]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import warnings 

In [None]:
## --------------------------------------------------------------------------------------------------------------------



    """ How do we organize string data in the Natural Language Processing process? 

                                  I would like to share with you a few methods related to this subject. """


## --------------------------------------------------------------------------------------------------------------------



In [3]:

 
# Sample corpus: an article about Bitcoin, stored as one multi-line string.
    
article =   """With the price of a bitcoin surging to new highs in 2017, the bullish case for investors might seem so obvious it does not need stating. 
Alternatively it may seem foolish to invest in a digital asset that isn’t backed by any commodity or government and whose price rise has prompted some to compare it to the tulip mania or the dot-com bubble. 
Neither is true; the bullish case for Bitcoin is compelling but far from obvious. There are significant risks to investing in Bitcoin, but, as I will argue, there is still an immense opportunity.
Never in the history of the world had it been possible to transfer value between distant peoples without relying on a trusted intermediary, such as a bank or government. 
In 2008 Satoshi Nakamoto, whose identity is still unknown, published a 9 page solution to a long-standing problem of computer science known as the Byzantine General’s Problem. 
Nakamoto’s solution and the system he built from it — Bitcoin — allowed, for the first time ever, value to be quickly transferred, at great distance, in a completely trustless way. 
The ramifications of the creation of Bitcoin are so profound for both economics and computer science that Nakamoto should rightly be the first person to qualify for both a Nobel prize in Economics and the Turing award.
For an investor the salient fact of the invention of Bitcoin is the creation of a new scarce digital good — bitcoins. 
Bitcoins are transferable digital tokens that are created on the Bitcoin network in a process known as “mining”. 
Bitcoin mining is roughly analogous to gold mining except that production follows a designed, predictable schedule. 
By design, only 21 million bitcoins will ever be mined and most of these already have been — approximately 16.8 million bitcoins have been mined at the time of writing. 
Every four years the number of bitcoins produced by mining halves and the production of new bitcoins will end completely by the year 2140."""




In [8]:
# Break the article into a list of lines by splitting at every newline character.
article.split("\n")

['With the price of a bitcoin surging to new highs in 2017, the bullish case for investors might seem so obvious it does not need stating. ',
 'Alternatively it may seem foolish to invest in a digital asset that isn’t backed by any commodity or government and whose price rise has prompted some to compare it to the tulip mania or the dot-com bubble. ',
 'Neither is true; the bullish case for Bitcoin is compelling but far from obvious. There are significant risks to investing in Bitcoin, but, as I will argue, there is still an immense opportunity.',
 'Never in the history of the world had it been possible to transfer value between distant peoples without relying on a trusted intermediary, such as a bank or government. ',
 'In 2008 Satoshi Nakamoto, whose identity is still unknown, published a 9 page solution to a long-standing problem of computer science known as the Byzantine General’s Problem. ',
 'Nakamoto’s solution and the system he built from it — Bitcoin — allowed, for the first t

In [10]:
# Wrap the list of article lines in a pandas Series: one line of text per row.
string_series = pd.Series(data=article.split(sep="\n"))
string_series

0     With the price of a bitcoin surging to new hig...
1     Alternatively it may seem foolish to invest in...
2     Neither is true; the bullish case for Bitcoin ...
3     Never in the history of the world had it been ...
4     In 2008 Satoshi Nakamoto, whose identity is st...
5     Nakamoto’s solution and the system he built fr...
6     The ramifications of the creation of Bitcoin a...
7     For an investor the salient fact of the invent...
8     Bitcoins are transferable digital tokens that ...
9     Bitcoin mining is roughly analogous to gold mi...
10    By design, only 21 million bitcoins will ever ...
11    Every four years the number of bitcoins produc...
dtype: object

In [11]:
# Promote the Series to a single-column DataFrame whose column is "Bitcoin".
article_df = string_series.to_frame(name="Bitcoin")
article_df

Unnamed: 0,Bitcoin
0,With the price of a bitcoin surging to new hig...
1,Alternatively it may seem foolish to invest in...
2,Neither is true; the bullish case for Bitcoin ...
3,Never in the history of the world had it been ...
4,"In 2008 Satoshi Nakamoto, whose identity is st..."
5,Nakamoto’s solution and the system he built fr...
6,The ramifications of the creation of Bitcoin a...
7,For an investor the salient fact of the invent...
8,Bitcoins are transferable digital tokens that ...
9,Bitcoin mining is roughly analogous to gold mi...


## -----------------------------------------------------------
 
## String-Lower Process

## -----------------------------------------------------------


In [22]:
# Lowercase every line of the article.
# For each row, ".split()" breaks the line into whitespace-separated words,
# ".lower()" converts every letter of each word to lowercase (not only the
# first one), and " ".join() glues the words back together with single spaces.
# As a side effect, runs of whitespace and leading/trailing spaces are collapsed.
lower_process = article_df["Bitcoin"].apply(
    lambda line: " ".join(map(str.lower, line.split()))
)

lower_process

0     with the price of a bitcoin surging to new hig...
1     alternatively it may seem foolish to invest in...
2     neither is true; the bullish case for bitcoin ...
3     never in the history of the world had it been ...
4     in 2008 satoshi nakamoto, whose identity is st...
5     nakamoto’s solution and the system he built fr...
6     the ramifications of the creation of bitcoin a...
7     for an investor the salient fact of the invent...
8     bitcoins are transferable digital tokens that ...
9     bitcoin mining is roughly analogous to gold mi...
10    by design, only 21 million bitcoins will ever ...
11    every four years the number of bitcoins produc...
Name: Bitcoin, dtype: object

In [19]:
# Store the lowercased lines back as a single-column DataFrame.
article_df = lower_process.to_frame(name="Bitcoin")
article_df

Unnamed: 0,Bitcoin
0,with the price of a bitcoin surging to new hig...
1,alternatively it may seem foolish to invest in...
2,neither is true; the bullish case for bitcoin ...
3,never in the history of the world had it been ...
4,"in 2008 satoshi nakamoto, whose identity is st..."
5,nakamoto’s solution and the system he built fr...
6,the ramifications of the creation of bitcoin a...
7,for an investor the salient fact of the invent...
8,bitcoins are transferable digital tokens that ...
9,bitcoin mining is roughly analogous to gold mi...


## -----------------------------------------------------------

## Deleting Characters Such As Commas and Hyphens

## -----------------------------------------------------------


In [24]:
# Delete every character that is neither a word character (\w) nor whitespace
# (\s): commas, periods, semicolons, dashes, quotes, apostrophes, ...
# The match is removed outright (replaced by the empty string, not by a space),
# so "nakamoto’s" becomes "nakamotos".
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The pattern is a raw string so "\w" and "\s" are not read as (invalid)
# Python string escapes.
commas_process = article_df["Bitcoin"].str.replace(r"[^\w\s]", "", regex=True)
commas_process

0     with the price of a bitcoin surging to new hig...
1     alternatively it may seem foolish to invest in...
2     neither is true the bullish case for bitcoin i...
3     never in the history of the world had it been ...
4     in 2008 satoshi nakamoto whose identity is sti...
5     nakamotos solution and the system he built fro...
6     the ramifications of the creation of bitcoin a...
7     for an investor the salient fact of the invent...
8     bitcoins are transferable digital tokens that ...
9     bitcoin mining is roughly analogous to gold mi...
10    by design only 21 million bitcoins will ever b...
11    every four years the number of bitcoins produc...
Name: Bitcoin, dtype: object

In [25]:
# Store the punctuation-free lines back as a single-column DataFrame.
article_df = pd.DataFrame({"Bitcoin": commas_process})
article_df

Unnamed: 0,Bitcoin
0,with the price of a bitcoin surging to new hig...
1,alternatively it may seem foolish to invest in...
2,neither is true the bullish case for bitcoin i...
3,never in the history of the world had it been ...
4,in 2008 satoshi nakamoto whose identity is sti...
5,nakamotos solution and the system he built fro...
6,the ramifications of the creation of bitcoin a...
7,for an investor the salient fact of the invent...
8,bitcoins are transferable digital tokens that ...
9,bitcoin mining is roughly analogous to gold mi...


## -----------------------------------------------------------


## Deleting Numbers

## -----------------------------------------------------------


In [27]:

# Delete every digit from each row.
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, so "\d" would never match a digit.
# The pattern is a raw string so "\d" is not read as an (invalid) Python
# string escape.
# Only the digits are removed; the spaces around a deleted number stay, so
# "in 2008 satoshi" becomes "in  satoshi" (two spaces).
deleting_process = article_df["Bitcoin"].str.replace(r"\d", "", regex=True)
deleting_process

0     with the price of a bitcoin surging to new hig...
1     alternatively it may seem foolish to invest in...
2     neither is true the bullish case for bitcoin i...
3     never in the history of the world had it been ...
4     in  satoshi nakamoto whose identity is still u...
5     nakamotos solution and the system he built fro...
6     the ramifications of the creation of bitcoin a...
7     for an investor the salient fact of the invent...
8     bitcoins are transferable digital tokens that ...
9     bitcoin mining is roughly analogous to gold mi...
10    by design only  million bitcoins will ever be ...
11    every four years the number of bitcoins produc...
Name: Bitcoin, dtype: object

In [32]:
# Store the digit-free lines back as a single-column DataFrame.
article_df = deleting_process.to_frame(name="Bitcoin")
article_df

Unnamed: 0,Bitcoin
0,with the price of a bitcoin surging to new hig...
1,alternatively it may seem foolish to invest in...
2,neither is true the bullish case for bitcoin i...
3,never in the history of the world had it been ...
4,in satoshi nakamoto whose identity is still u...
5,nakamotos solution and the system he built fro...
6,the ramifications of the creation of bitcoin a...
7,for an investor the salient fact of the invent...
8,bitcoins are transferable digital tokens that ...
9,bitcoin mining is roughly analogous to gold mi...


## -----------------------------------------------------------


## Deleting Stop Words

## -----------------------------------------------------------


In [31]:


# Load NLTK's English stop-word list.
# NOTE: this needs the NLTK "stopwords" corpus to be installed locally
# (typically via nltk.download("stopwords")).
words = stopwords.words("english")
words[:10]  # preview the first ten stop words



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [38]:
# Remove English stop words from every line.
# The stop-word list is converted to a set once, so each membership test is a
# hash lookup instead of a scan over the whole list for every token.
# Matching is exact, so this step expects the text to be lowercased already.
stopword_set = set(words)
stopwords_process = article_df["Bitcoin"].apply(
    lambda line: " ".join(
        token for token in line.split() if token not in stopword_set
    )
)
stopwords_process

0     price bitcoin surging new highs bullish case i...
1     alternatively may seem foolish invest digital ...
2     neither true bullish case bitcoin compelling f...
3     never history world possible transfer value di...
4     satoshi nakamoto whose identity still unknown ...
5     nakamotos solution system built bitcoin allowe...
6     ramifications creation bitcoin profound econom...
7     investor salient fact invention bitcoin creati...
8     bitcoins transferable digital tokens created b...
9     bitcoin mining roughly analogous gold mining e...
10    design million bitcoins ever mined already app...
11    every four years number bitcoins produced mini...
Name: Bitcoin, dtype: object

In [39]:
# Store the stop-word-free lines back as a single-column DataFrame.
article_df = pd.DataFrame({"Bitcoin": stopwords_process})
article_df

Unnamed: 0,Bitcoin
0,price bitcoin surging new highs bullish case i...
1,alternatively may seem foolish invest digital ...
2,neither true bullish case bitcoin compelling f...
3,never history world possible transfer value di...
4,satoshi nakamoto whose identity still unknown ...
5,nakamotos solution system built bitcoin allowe...
6,ramifications creation bitcoin profound econom...
7,investor salient fact invention bitcoin creati...
8,bitcoins transferable digital tokens created b...
9,bitcoin mining roughly analogous gold mining e...


## -----------------------------------------------------------


## Deleting Underused Words

## -----------------------------------------------------------


In [54]:
# Count how often each word occurs across the whole article.
# All rows are joined into one string, split into words, and counted;
# value_counts() sorts by descending frequency, so the most common words come
# first and the rarest ones sit at the tail of the result.
underused_words = (
    pd.Series(" ".join(article_df["Bitcoin"].tolist()).split())
    .value_counts()
)
underused_words

bitcoin        216
bitcoins       148
digital         77
mining          74
economics       74
              ... 
analogous       16
designed        16
predictable     16
roughly         16
follows         16
Length: 130, dtype: int64

In [58]:
# Show the 30 least frequently used words (the tail of the frequency table).
underused_words.tail(30)

surging         26
highs           26
investors       26
four            24
year            24
number          24
produced        24
years           24
every           24
end             24
halves          24
invention       22
good            22
salient         22
investor        22
scarce          22
fact            22
network         18
tokens          18
transferable    18
process         18
created         18
gold            16
except          16
schedule        16
analogous       16
designed        16
predictable     16
roughly         16
follows         16
dtype: int64

In [66]:
# Drop the 30 rarest words from every line.
# The last 30 labels of underused_words (the tail of the frequency table) are
# collected into a set once, rather than re-slicing the index for every single
# token; each row then keeps only the words that are not in that set.
rare_words = set(underused_words.index[-30:])
deleting_process = article_df["Bitcoin"].apply(
    lambda line: " ".join(
        token for token in line.split() if token not in rare_words
    )
)

deleting_process

0     price bitcoin new bullish case might seem obvi...
1     alternatively may seem foolish invest digital ...
2     neither true bullish case bitcoin compelling f...
3     never history world possible transfer value di...
4     satoshi nakamoto whose identity still unknown ...
5     nakamotos solution system built bitcoin allowe...
6     ramifications creation bitcoin profound econom...
7     bitcoin creation new digital bitcoins bitcoin ...
8     bitcoins digital bitcoin known mining bitcoins...
9     bitcoin mining mining production bitcoin minin...
10    design million bitcoins ever mined already app...
11    bitcoins mining production new bitcoins comple...
Name: Bitcoin, dtype: object

In [67]:
# Store the lines, now without the rarest words, back as a single-column DataFrame.
article_df = deleting_process.to_frame(name="Bitcoin")
article_df

Unnamed: 0,Bitcoin
0,price bitcoin new bullish case might seem obvi...
1,alternatively may seem foolish invest digital ...
2,neither true bullish case bitcoin compelling f...
3,never history world possible transfer value di...
4,satoshi nakamoto whose identity still unknown ...
5,nakamotos solution system built bitcoin allowe...
6,ramifications creation bitcoin profound econom...
7,bitcoin creation new digital bitcoins bitcoin ...
8,bitcoins digital bitcoin known mining bitcoins...
9,bitcoin mining mining production bitcoin minin...
