# Natural Language Processing Workflow(1)-Get and Clean Data

## 1. Getting the Data

In [None]:
# Web scraping
import requests
from bs4 import BeautifulSoup

# e.g. scrape transcript data from a website
def url_to_transcript(url, timeout=30):
    """Download a page and return the text of its transcript paragraphs.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        A list of strings, one per ``<p>`` element found inside the element
        whose class is ``post-content``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no element with class ``post-content``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were transcripts.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find(class_='post-content')
    if content is None:
        # soup.find returns None when nothing matches; report that clearly.
        raise ValueError("no element with class 'post-content' found at %s" % url)

    return [p.text for p in content.find_all('p')]
# Note that the function returns a list: each paragraph of the page is a separate element.

In [None]:
# e.g. given a list of URLs, fetch the transcript for each one in a single pass
urls = ['...', '...', ...]
transcripts = list(map(url_to_transcript, urls))

# e.g. a second list that stores the names of the actors
actors = ['...', '...', ...]

In [None]:
# pickle
import os
import pickle

# open() does not create missing folders, so make sure the target exists first.
os.makedirs('transcripts', exist_ok=True)

# Save each transcript under its actor's name. The two lists are paired by
# position (assumed to be the same length), so zip() walks them together
# without manual indexing.
for c, transcript in zip(actors, transcripts):
    with open('transcripts/' + c + '.txt', 'wb') as file:
        pickle.dump(transcript, file)

# Load pickled files
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code stored in the file.
data = {}
for c in actors:
    with open('transcripts/' + c + '.txt', 'rb') as file:
        data[c] = pickle.load(file)

# Double check to make sure data has been loaded properly
data.keys()

## 2. Cleaning the data

## Data Cleaning
1. Getting the data - How to scrape data from a website
2. Cleaning the data - text preprocessing 
3. Organizing the data - two standard text formats:     
    a) **Corpus**: A collection of text;     
    b) **Document Term Matrix**: word counts in matrix format

#### Common data cleaning steps on all text:
- Make all lower case
- Remove punctuation
- Remove numerical values
- Remove common nonsensical text
- Tokenization
- Remove stop words
...

Currently, the data is a dictionary whose keys are actor names and whose values are lists of text paragraphs. Now let's combine each list into a single string.

In [None]:
# change the text to be string format
def combine_text(list_of_text):
    """Join a list of text fragments into one string, separated by single spaces."""
    return ' '.join(list_of_text)

# Each value becomes a one-element list holding the joined transcript string
data_combined = {
    actor: [combine_text(paragraphs)]
    for actor, paragraphs in data.items()
}

In [None]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# from_dict puts the dictionary keys in the columns; transposing moves them to the row index
data_df = pd.DataFrame.from_dict(data_combined).T
data_df.columns = ['transcript']

In [10]:
# *This is a simple example to show what the data looks like after conversion to DataFrame format*
import pandas as pd
a = {
    'jim': ["he is a good guy"],
    'mary': ["she is a good girl"],
}
pd.DataFrame.from_dict(a)

Unnamed: 0,jim,mary
0,he is a good guy,she is a good girl


In [None]:
# Apply the first round of data cleaning
import re
import string

def clean_text_round1(text):
    """Apply the first round of text cleaning to one transcript.

    The steps run in this order:
      1. convert to lowercase,
      2. drop anything enclosed in square brackets (non-greedy; the match
         does not span line breaks),
      3. drop every ASCII punctuation character,
      4. drop every word that contains a digit.

    Whitespace is left untouched, so removed pieces may leave consecutive
    spaces behind.

    Args:
        text: The raw transcript string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw strings keep the regex backslashes literal; '\[' or '\w' inside a
    # normal string is an invalid escape sequence that newer Pythons warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run the round-one cleaner over every transcript and keep the result as a DataFrame
data_clean = pd.DataFrame(data_df['transcript'].apply(clean_text_round1))
data_clean

In [None]:
# Pickle the cleaned data so later steps can reload it from 'data_clean.pkl'
data_clean.to_pickle('data_clean.pkl')

**Note:** If the cleaning results need to be further improved, we can define a second or third round of cleaning functions. This process is iterative and may need to be repeated several times until the results are satisfactory.

## 3. Organizing the Data

A corpus is a collection of texts; here they are put together neatly in a pandas DataFrame.

In [None]:
# Pickle the corpus (the transcript DataFrame before cleaning) for later use
data_df.to_pickle('corpus.pkl')

The most common tokenization technique is to break text down into words. We can create a document-term matrix (DTM) using CountVectorizer.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per document, one column per word,
# each cell holding that word's count. English stop words are excluded.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on versions that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
# Reuse the index of the cleaned data so rows stay labelled the same way
data_dtm.index = data_clean.index
data_dtm

In [None]:
# Pickle the document-term matrix (DTM) for later use
data_dtm.to_pickle('dtm.pkl')