## HTML Extraction From Viki TV Website
**Author: Christopher Elliott**

**Team: KDramaQueen**

**March 24th 2019**

In [1]:
import bs4
from bs4 import BeautifulStoneSoup
import pandas as pd
import pickle
import wikipedia
import numpy as np
import requests
import json
from requests import get
from time import sleep
from itertools import chain

In [2]:
def extract_data(html):
    """Extract show ratings from one saved Viki TV listing page.

    The page is read from ``html + ".html"`` (Latin-1 encoded) and scanned for
    title links, rating spans and review-count spans, which are paired up by
    position.

    PARAMETERS:
        html (str): file name of the saved page, without the ".html" suffix.

    RETURNS: dict mapping each show title (str) to a tuple
        ``(rating: float, number_of_reviews: int)``.

    RAISES: AssertionError if the page does not yield exactly one rating per
        title.
    """
    # BeautifulSoup reads the whole handle inside its constructor, so the file
    # can be closed immediately; `with` also closes it when parsing fails.
    with open(html + ".html", encoding="Latin-1") as page:
        soup = bs4.BeautifulSoup(page, 'html.parser')

    titles = soup.find_all('a', "thumb-title strong")
    reviews = soup.find_all('span', "review-rating")
    num_reviews = soup.find_all("span", "caption info")

    # Page "32" is special-cased: its 11th title match is removed before the
    # length check below (presumably an entry without a rating -- confirm
    # against the saved page).
    if html == '32':
        del titles[10]

    assert len(reviews) == len(titles)

    title_close = "</span>"
    rating_open = '<span class="review-rating">'
    title_names = {}

    for i, title_tag in enumerate(titles):
        # Title: the text after the first "</span>" up to the next newline.
        name = str(title_tag)
        name = name[name.find(title_close) + len(title_close):]
        name = name[:name.find("\n")]

        # Rating: the first three characters inside the rating span.
        rating = str(reviews[i])
        rating = rating[rating.find(rating_open) + len(rating_open):]
        rating = float(rating[:3])

        # Review count: skip the opening tag plus one more character, then
        # keep everything up to the first space.
        count = str(num_reviews[i])
        count = count[count.find(">") + 2:]
        count = int(count[:count.find(" ")])

        title_names[name] = rating, count

    return title_names

In [3]:
def build_dataframe(n_pages=38):
    """Build a pandas DataFrame from the saved Viki HTML pages and pickle it.

    Reads the files "0.html" ... "<n_pages - 1>.html" from the working
    directory through ``extract_data`` and merges the per-page dictionaries
    (a title that appears on several pages keeps the last value seen).

    PARAMETERS:
        n_pages (int): number of saved pages to read; defaults to 38.

    RETURNS: DataFrame with columns ['Show', 'Viki Rating', 'Viki n_reviews']
    NOTE: Will create the file "Viki_HTML_data.pkl" on the local drive.
    """
    print("Building Dataframe Part 1...")
    print("initializaing...")
    data = {}
    for page in range(n_pages):
        if page % 10 == 0:
            print('building ...')
        data.update(extract_data(str(page)))

    df = pd.DataFrame.from_dict(data, orient='index')
    df = df.reset_index()
    df.columns = ['Show', 'Viki Rating', 'Viki n_reviews']

    print("Saving Data Structure to Drive...")
    file = "Viki_HTML_data.pkl"
    # Context manager so the pickle is flushed and the handle closed even if
    # dumping raises.
    with open(file, 'wb') as handle:
        pickle.dump(df, handle)
    print("Complete!")
    return df

In [4]:
def build_korean_drama_data():
    """Scrape title, genre and broadcast network from koreandrama.org.

    Walks the listing pages https://www.koreandrama.org/page/1 ... /185,
    extracts the fields of every ``div.post`` by plain string search, repairs
    a handful of known-bad titles and pickles the result.

    RETURNS: DataFrame with columns ['Title', 'Genre', 'Network']. Genre is a
        list of strings; a missing network is stored as NaN.
    NOTE: Performs one HTTP request per listing page and creates the file
        "KDrama_HTML_data.pkl" on the local drive.
    """
    data = {}
    for page in range(1, 186):
        if page % 25 == 0:
            print(int((page/186)*100), "Percent Complete")
        web = "https://www.koreandrama.org/page/" + str(page)
        webresponse = get(web)
        soup = bs4.BeautifulSoup(webresponse.text, 'html.parser')
        posts = soup.find_all('div', "post")

        # A separate loop variable keeps the page counter intact.
        for post in posts:
            string = str(post)

            # Title: jump past "Title: ", then past the next "/", and take the
            # text up to the following "<" (dropping its first character).
            title_index = string.find("Title: ")
            string = string[title_index+len("Title: "):]
            slash_index = string.find("/")
            string = string[slash_index+1:]
            title_end = string.find('<')
            title = string[1:title_end].strip()

            # Genre: comma separated list after "Genre:", spaces removed.
            genre_index = string.find("Genre:")
            if genre_index > -1:
                string = string[genre_index+len("Genre:")+1:]
                genre_end = string.find("<")
                genre = string[:genre_end].replace(" ", "").split(",")
            else:
                # NOTE(review): this sentinel is a list, so the scalar "NaN"
                # replacement at the end probably leaves it untouched -- confirm.
                genre = ["NaN"]

            # Network: text after "Broadcast network: " up to the next tag.
            net_index = string.find("Broadcast network: ")
            if net_index > -1:
                string = string[net_index+len("Broadcast network: "):]
                net_end = string.find("<")
                network = string[:net_end]
            else:
                network = "NaN"

            data[title] = genre, network

    df = pd.DataFrame.from_dict(data, orient='index')
    df = df.reset_index()
    df.columns = ['Title', 'Genre', 'Network']

    # Posts where the string search captured link markup instead of the title.
    title_fixes = [
        ('www.koreandrama.org/me-ri-dae-gus-attack-and-defense-battle/" rel="bookmark" title="Permanent Link to Me Ri, Dae Gu’s Attack and Defense Battle"> Me Ri, Dae Gu’s Attack and Defense Battle',
         "Me Ri, Dae Gu’s Attack and Defense Battle"),
        ('www.koreandrama.org/how-much-you-like-it/" rel="bookmark" title="Permanent Link to How Much You Like It"> How Much You Like It',
         "How Much You Like It"),
        ('www.koreandrama.org/golden-apple/" rel="bookmark" title="Permanent Link to Golden Apple"> Golden Apple',
         "Golden Apple"),
        ('프렌즈 / Friends', "Friends"),
    ]
    for scraped, clean in title_fixes:
        df = df.replace(scraped, clean)

    # String comparison: keeps only rows whose title sorts before this URL
    # prefix (presumably to drop remaining rows where a link was captured as
    # the title -- confirm).
    df = df.loc[(df['Title'] < 'www.koreandrama.org/2018-sbs-drama-awards-winn')]

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = df.replace("NaN", np.nan)

    print("Saving Data Structure to Drive...")
    file = "KDrama_HTML_data.pkl"
    # Context manager so the handle is closed even if dumping raises.
    with open(file, 'wb') as handle:
        pickle.dump(df, handle)
    print("Complete!")
    return df
     

In [5]:
def get_synopsis(data):
    """Download the synopsis text of every drama listed in ``data``.

    For each title the page https://www.koreandrama.org/<title-with-dashes>/
    is fetched and the markup between the "Synopsis" heading and the "Cast"
    heading inside ``div.entrytext`` is kept.

    PARAMETERS:
        data: DataFrame (or mapping) whose 'Title' entry iterates over the
            drama titles.

    RETURNS: DataFrame with columns ['Title', 'Summary']; Summary is NaN when
        either heading is missing from the page.
    NOTE: Performs one HTTP request per title (retried once on a request
        error) and creates the file "KDData.pkl" on the local drive.
    """
    start_marker = "Synopsis</strong></p><p>"
    end_marker = "</p><p><strong>Cast</strong>"
    total = len(data['Title'])

    dat = {}
    for count, title in enumerate(data["Title"]):
        if count % 100 == 0:
            print(int((count/total)*100), "Percent Complete")

        web = "https://www.koreandrama.org/" + title.replace(" ", "-") + "/"
        try:
            webresponse = get(web)
        except requests.exceptions.RequestException:
            # Single retry on a transport error; other exceptions (including
            # KeyboardInterrupt) are no longer swallowed.
            webresponse = get(web)

        soup = bs4.BeautifulSoup(webresponse.text, 'html.parser')
        string = str(soup.find_all('div', "entrytext"))

        start = string.find(start_marker)
        end = string.find(end_marker)
        if start == -1 or end == -1:
            # str.find returns -1 for a missing marker, which would otherwise
            # slice an arbitrary piece of the page; use the sentinel instead.
            dat[title] = 'NaN'
        else:
            dat[title] = string[start + len(start_marker):end]

    df = pd.DataFrame.from_dict(dat, orient='index')
    df = df.reset_index()
    # replace() returns a new frame, so the result has to be kept.
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = df.replace('NaN', np.nan)
    df.columns = ['Title', 'Summary']

    print("Saving Data Structure to Drive...")
    file = "KDData.pkl"
    # Context manager so the handle is closed even if dumping raises.
    with open(file, 'wb') as handle:
        pickle.dump(df, handle)
    return df

In [16]:
def build_wiki_data(data):
    """Look up a Wikipedia summary and category list for every title.

    Each title is searched as "<title> South Korea" through the ``wikipedia``
    package. A lookup that raises is recorded as missing rather than aborting
    the run.

    PARAMETERS:
        data: DataFrame (or mapping) whose 'Title' entry iterates over the
            drama titles.

    RETURNS: DataFrame with columns ['Titles', 'Wiki Summary', 'Wiki Tags'];
        both Wiki columns are NaN for titles whose lookup failed.
    NOTE: Performs one Wikipedia lookup per title and creates the file
        "Wiki_data.pkl" on the local drive.
    """
    print("Building DataFrame Pt2")
    print("Starting...")
    t = {}
    d = len(data['Title'])
    for count, title in enumerate(data['Title']):
        if count % 100 == 0:
            print(str(count) + " done out of " + str(d))
        try:
            wiki = wikipedia.page(title + " South Korea")
            t[title] = wiki.summary, wiki.categories
        except Exception:
            # Best-effort: any lookup failure becomes the "NaN" sentinel.
            # `Exception` (not a bare except) lets Ctrl-C stop the long run.
            t[title] = "NaN", "NaN"

    df = pd.DataFrame.from_dict(t, orient='index')
    df = df.reset_index()
    # replace() returns a new frame, so the result has to be kept; otherwise
    # the sentinels stay as the literal string "NaN".
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = df.replace('NaN', np.nan)
    df.columns = ['Titles', 'Wiki Summary', 'Wiki Tags']

    print("Saving Data Structure to Drive...")
    file = "Wiki_data.pkl"
    # Context manager so the handle is closed even if dumping raises.
    with open(file, 'wb') as handle:
        pickle.dump(df, handle)
    return df

In [7]:
# Scrape the koreandrama.org listing pages into a Title/Genre/Network frame.
# Slow: one HTTP request per listing page; also rewrites KDrama_HTML_data.pkl.
data = build_korean_drama_data()

13 Percent Complete
26 Percent Complete
40 Percent Complete
53 Percent Complete
67 Percent Complete
80 Percent Complete
94 Percent Complete
Saving Data Structure to Drive...
Complete!


In [8]:
# Fetch the synopsis for every scraped title (one HTTP request per title);
# the result is also pickled to KDData.pkl.
synopsis = get_synopsis(data)

0 Percent Complete
6 Percent Complete
13 Percent Complete
20 Percent Complete
27 Percent Complete
34 Percent Complete
41 Percent Complete
48 Percent Complete
54 Percent Complete
61 Percent Complete
68 Percent Complete
75 Percent Complete
82 Percent Complete
89 Percent Complete
96 Percent Complete
Saving Data Structure to Drive...


In [30]:
# Join genre/network with the scraped synopsis on the shared 'Title' column.
# (Only the last expression of a cell is rendered, so the frame is shown once.)
df1 = pd.merge(data, synopsis, on='Title')
df1

Unnamed: 0,Title,Genre,Network,Summary
0,My First First Love,"[Youth, Romance]",Netflix,"Due to various personal reasons, a group of Yu..."
1,Her Private Life,"[Romance, Comedy]",tvN,This drama is based on novel <em>“누나팬닷컴 / Nuna...
2,Special Labor Inspector Jo,"[Action, Comedy]",MBC,It follows the story of a civil servant who is...
3,Beautiful World,"[Melodrama, Family]",jTBC,This drama tells the story of a boy who become...
4,My Fellow Citizens,"[Crime, Comedy]",KBS2,"This drama tells the story of a con man, who g..."
5,The Banker,[Drama],MBC,This drama is based on the Japanese manga <em>...
6,Eulachacha Waikiki (Season 2),[Comedy],jTBC,"This drama depicts the friendships, love and d..."
7,Mother of Mine,[Family],KBS2,This drama is about a story of a mother and th...
8,Haechi,"[Historical, Action]",SBS,"Set during the Joseon Dynasty period, the seri..."
9,Item,"[Fantasy, Mystery, Crime]",MBC,This drama is based on the popular webtoon of ...


In [17]:
# Look up Wikipedia data for the titles of the merged frame built above.
# `df1` is passed because no `df` exists yet at this point on a fresh kernel.
wiki = build_wiki_data(df1)

Building DataFrame Pt2
Starting...
0 done out of 1457
100 done out of 1457
200 done out of 1457
300 done out of 1457
400 done out of 1457
500 done out of 1457
600 done out of 1457
700 done out of 1457
800 done out of 1457
900 done out of 1457
1000 done out of 1457
1100 done out of 1457
1200 done out of 1457
1300 done out of 1457
1400 done out of 1457
Saving Data Structure to Drive...


In [31]:
# Align the key column name with df1, then join the Wikipedia data onto it.
wiki = wiki.rename(columns={'Titles': 'Title'})
df = wiki.merge(df1, on='Title')

In [36]:
# Written relative to the working directory (the same location the notebook
# later reads "Data_Set_New.csv" from) instead of a user-specific absolute path.
df.to_csv("Data_Set_New.csv")

In [50]:
# DataFrame.from_csv was deprecated and later removed from pandas; this is the
# documented equivalent (first column as index, dates parsed).
df = pd.read_csv("Data_Set_New.csv", index_col=0, parse_dates=True)

  """Entry point for launching an IPython kernel.
