DataTable Handling

In [None]:
'''
    + Tabular data: Data that is organized in a table with rows and columns. We can think
of it as a 2D array, list of lists...
    + Rows (records, samples): Observation (cases)
    + Columns (fields, features): Attributes of observations
- How to interact tabular data in Python ?
    + Pandas library: A fast, powerful and easy to use open source data analysis and
manipulation tool
'''
import pandas as pd

# Read .csv file (read_csv() returns a DataFrame)
url = "https://raw.githubusercontent.com/justmarkham/scikit-learn-videos/master/data/Advertising.csv"
advertising_df = pd.read_csv(url, index_col=0)


# Convert DataFrame to Python List of Lists
# Currently, we don't often use DataFrame -> Convert to List of Lists
advertising_list = advertising_df.values.tolist()


# Action with list
tv_sum = sum([lst[0] for lst in advertising_list])
radio_sum = sum([lst[1] for lst in advertising_list])
newspaper_sum = sum([lst[2] for lst in advertising_list])
sales_sum = sum([lst[3] for lst in advertising_list])

# Find sum of columns from row 2 to row 11
start_idx = 1
end_idx = 11
sliced_list = advertising_list[start_idx:end_idx]
tv_sum = sum([lst[0] for lst in sliced_list])
radio_sum = sum([lst[1] for lst in sliced_list])
newspaper_sum = sum([lst[2] for lst in sliced_list])
sales_sum = sum([lst[3] for lst in sliced_list])

# Find median
def median(lst):
    sorted_lst = sorted(lst)
    n = len(lst)
    mid = n // 2
    if n % 2 == 0:
        median = (sorted_lst[mid] + sorted_lst[mid - 1]) / 2
    else:
        median = sorted_lst[mid] 
    return median

tv_median = median([lst[0] for lst in advertising_list])
radio_median = median([lst[1] for lst in advertising_list])
newspaper_median = median([lst[2] for lst in advertising_list])
sales_median = median([lst[3] for lst in advertising_list])

# Find contribution percentage
tv_lst = [lst[0] for lst in advertising_list]
radio_lst = [lst[1] for lst in advertising_list]
newspaper_lst = [lst[2] for lst in advertising_list]

total_budget = sum(tv_lst) + sum(radio_lst) + sum(newspaper_lst)

tv_percentage = (sum(tv_lst) / total_budget) * 100
radio_percentage = (sum(radio_lst) / total_budget) * 100
newspaper_percentage = (sum(newspaper_lst) / total_budget) * 100

print(f'TV Percentage: {tv_percentage:.3f}%')
print(f'Radio Percentage: {radio_percentage:.3f}%')
print(f'Newspaper Percentage: {newspaper_percentage:.3f}%')


TV Percentage: 73.206%
Radio Percentage: 11.582%
Newspaper Percentage: 15.212%


Text Representation

In [2]:
'''
- In general, we don't often use raw text when computing in some tasks -> Need a better
representation for text
- Index-based Encoding:
    + Use a dictionary with key = word and value = index to transform text -> list
    + Step 1: Corpus (List of paragraphs) -> Text Normalization -> Create Dictionary
    + Step 2: A string -> Text Normalization -> Vectorize (+ Dictionary) -> New representation
'''

'''
Text Normalization Introduction
- Problem: 
    + Documents contain unnecessary string (information)
    + Not well-present natural language
'''

# Text Normalization: Lowercasing
import string
# remove_characters sẽ chứa một chuỗi dài gồm các kỳ tự Tab, dấu ngoặc kép và tất cả dấu câu
remove_characters = '\t""' + string.punctuation
def text_normalization(text):
    text = text.lower()
    text = text.strip()
    text = text.replace('\n',' ')
    for char in remove_characters:
        text = text.replace(char, '')
    return text

# Create Dictionary
# Given a list of paragraphs -> Get a list of unique words
def create_dictionary(corpus):
    dictionary = []
    for paragraph in corpus:
        paragraph = text_normalization(paragraph)
        tokens = paragraph.split()
        for token in tokens:
            if token not in dictionary:
                dictionary.append(token)
    return dictionary

# Create New Text Representation
def vectorize(text, dictionary, unknown_token_id):
    text = text_normalization(text)
    tokens = text.split()
    vector = [
        dictionary.index(token) 
            if token in dictionary else unknown_token_id 
                for token in tokens
    ]
    return vector


In [3]:
corpus = [
    "Artificial Intelligence is FASCINATING!!!",
    "Python\tis\ta\tgreat\tlanguage.", # Chứa tab
    "Data Science: The \n'Sexiest' Job of the 21st Century.",
    "Hello, world! Are you ready for AI?",
    "Natural Language Processing (NLP) includes: tokenization, stemming, & lemmatization.",
    "   Machine Learning... requires    math!   "
]
cleaned_corpus = [text_normalization(text) for text in corpus]
for i, sentence in enumerate(cleaned_corpus):
    print(f"Original: {repr(corpus[i])}")
    print(f"Cleaned : {sentence}")
    print("-" * 30)

a_dict = create_dictionary(corpus)

unknown_id = -1
print(a_dict)
text_1 = "Python is fascinating"
vector_1 = vectorize(text_1, a_dict, unknown_id)
print(f"Text: {text_1}")
print(f"Vector: {vector_1}")

Original: 'Artificial Intelligence is FASCINATING!!!'
Cleaned : artificial intelligence is fascinating
------------------------------
Original: 'Python\tis\ta\tgreat\tlanguage.'
Cleaned : pythonisagreatlanguage
------------------------------
Original: "Data Science: The \n'Sexiest' Job of the 21st Century."
Cleaned : data science the  sexiest job of the 21st century
------------------------------
Original: 'Hello, world! Are you ready for AI?'
Cleaned : hello world are you ready for ai
------------------------------
Original: 'Natural Language Processing (NLP) includes: tokenization, stemming, & lemmatization.'
Cleaned : natural language processing nlp includes tokenization stemming  lemmatization
------------------------------
Original: '   Machine Learning... requires    math!   '
Cleaned : machine learning requires    math
------------------------------
['artificial', 'intelligence', 'is', 'fascinating', 'pythonisagreatlanguage', 'data', 'science', 'the', 'sexiest', 'job', 'of', '21

Data Crawling

In [None]:
'''
- Introduction: 
    + We need dataset
=> Problem:
    + Not enough data
    + New class that doesn't have dataset
    + Improve performance
=> Solution:
    + We collect more data
=> Webpages -> Data Crawling / Web Scraping (Tools/Programs that collecting data from webpages) 
'''

'''
Motivation:
- A webpage's content is represent is something called HTML
=> General steps to crawling
URL -> Web Crawler App -> Log file

- Web Crawler App:
    + Request -> Server -> HTML -> Link Extraction App -> URL List

- HTML: The standard markup language for documents designed to be displayed in a web browser
'''

'''
- Selenium Package: Used to automate web browser interaction from Python
- Things to extract: Article, Author, Abstract, Body
- Get an article URL -> Crawler -> Extraction data
'''

In [9]:
# Selenium Example
# Step 1: Initialize a browser and access to the website
import random
import pandas as pd
import os
import time
import re
import requests

from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service = Service(ChromeDriverManager().install()),
    options = chrome_options
)

url = 'https://www.python.org/'
driver.get(url)
# print(driver.page_source)



In [None]:
'''
Program pipeline:
1. Create a browser
2. Request HTML from table website
3. Get list of articles
4. Request HTML from article
5. Extract content
6. Save content and move to next article
'''
