In [1]:
import json
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Get all movie links
- Get the links to the Wikipedia list pages covering movies from 2000–2009 and 2010–2019
- Get the link to each movie's page from those lists

In [2]:
class Links:
    """Collect the hrefs of individual movie pages, starting from an index page.

    The index page is downloaded and parsed on construction. `get_main_links`
    picks the per-decade list pages out of it, and `get_movie_links` follows
    each of those pages to gather the movie hrefs.
    """

    # Host against which the (site-relative) hrefs found on the pages are resolved.
    BASE_URL = "https://en.wikipedia.org/"
    # An href is treated as a decade list page when it contains one of these.
    DECADE_MARKERS = ('2000', '2010')

    def __init__(self, link, timeout=30):
        """Download and parse the index page.

        Args:
            link: URL of the index page.
            timeout: Seconds `requests` waits on each HTTP request before
                raising, so a stalled connection cannot hang the notebook.
        """
        # Initial web-scraping setup. Note that `self.link` holds the HTTP
        # response object, not the URL string.
        self.timeout = timeout
        self.link = requests.get(link, timeout=timeout)
        self.soup = bs(self.link.content)

    def get_main_links(self):
        """Return the hrefs of the decade list pages found on the index page.

        Only anchors inside the second <ul> of the page are inspected, and an
        href is kept (once) when it contains any of `DECADE_MARKERS`.

        Returns:
            list[str]: Matching hrefs, in document order.

        Raises:
            IndexError: If the page has fewer than two <ul> elements.
        """
        ul = self.soup.find_all('ul')[1]

        links = list()
        for item in ul.find_all('a'):
            href = item.get('href')
            # Anchors without an href yield None; testing `year in None`
            # would raise TypeError, so skip them.
            if href and any(year in href for year in self.DECADE_MARKERS):
                links.append(href)

        return links

    def get_movie_links(self):
        """Return the href of every movie linked from the decade list pages.

        For each page returned by `get_main_links`, the first <tbody> is
        scanned for <i> elements, and the href of the first <a> inside each
        one is collected.

        Returns:
            list[str]: Movie hrefs, in the order they are encountered.
        """
        movies_links = list()

        for link in self.get_main_links():
            # urljoin avoids the double slash that plain concatenation gives
            # for hrefs that already start with "/".
            req = requests.get(urljoin(self.BASE_URL, link), timeout=self.timeout)
            content = bs(req.content)

            table = content.find('tbody')
            if table is None:
                # No table on this page: nothing to collect from it.
                continue

            for item in table.find_all('i'):
                a = item.find('a')
                if a is None:
                    continue
                href = a.get('href')
                # An anchor without an href cannot be fetched later on.
                if href:
                    movies_links.append(href)

        return movies_links

In [3]:
# Start the crawl from the Wikipedia index of Lionsgate film lists and gather
# the href of every movie page reachable from it.
INDEX_URL = "https://en.wikipedia.org/wiki/Lists_of_Lionsgate_films"
links = Links(link=INDEX_URL)
all_links = links.get_movie_links()

### Get movie info
- Get each movie's information and store it in a dictionary
- Remove tags

In [460]:
class Data:
    """Scrape the infobox of each movie page into a dictionary."""

    def __init__(self, urls, timeout=30):
        """Store the pages to scrape.

        Args:
            urls: Iterable of hrefs, resolved against `base_path` when fetched.
            timeout: Seconds `requests` waits on each HTTP request before
                raising, so a stalled connection cannot hang the notebook.
        """
        self.urls = urls
        self.base_path = "https://en.wikipedia.org/"
        self.timeout = timeout

    def remove_tags(self, content):
        """Remove every reference <sup> and every <span> from `content` in place.

        The two sets are collected independently. Pairing them with `zip`
        would stop at the shorter one and leave the surplus tags of the
        other kind in the document.

        Args:
            content: Parsed document (or tag) to clean; it is mutated.
        """
        references = content.find_all('sup', attrs={'class': 'reference'})
        spans = content.find_all('span')
        for tag in [*references, *spans]:
            tag.decompose()

    def get_text(self, val):
        """Extract the text of an infobox value cell.

        Non-breaking spaces are replaced by plain spaces in every branch so
        that downstream parsing can rely on ordinary whitespace.

        Args:
            val: The tag holding the value.

        Returns:
            A list of strings when the cell contains <li> items (one per item)
            or a <br> (one per text fragment); otherwise a single string.
        """
        li = val.find_all('li')
        if li:
            return [i.get_text(' ', strip=True).replace('\xa0', ' ') for i in li]
        if val.find('br'):
            return [text.replace('\xa0', ' ') for text in val.stripped_strings]
        return val.get_text(' ', strip=True).replace('\xa0', ' ')

    def get_data(self):
        """Fetch every URL and collect its infobox fields.

        For each page, the <th> cells of `table.infobox.vevent` are walked:
        the first one is stored under 'Title', and every later one becomes a
        key whose value is the text of the <td> that follows it in the same
        row. Header cells without such a <td> are skipped.

        Returns:
            dict: `{'movie_info': [ {field: value, ...}, ... ]}` with one
            (possibly empty) dictionary per URL, in input order.
        """
        movie_list = {'movie_info': []}

        for url in self.urls:
            # urljoin avoids the double slash that plain concatenation gives
            # for hrefs that already start with "/".
            req = requests.get(urljoin(self.base_path, url), timeout=self.timeout)
            content = bs(req.content)

            self.remove_tags(content)

            data = dict()
            keys = content.select('table.infobox.vevent tbody tr th')
            for index, key in enumerate(keys):
                if index == 0:
                    data['Title'] = key.get_text()
                    continue
                # Look for the value cell explicitly: a bare `next_sibling`
                # may be a whitespace text node, which has no `find_all`.
                val = key.find_next_sibling('td')
                if val is not None:
                    data[key.get_text(" ", strip=True)] = self.get_text(val)

            movie_list['movie_info'].append(data)
        return movie_list

In [463]:
# Scrape the infobox of every collected movie page; the result is a dict with
# a single 'movie_info' list holding one dictionary per page.
data = Data(urls=all_links)
movie_data = data.get_data()

### Clean data

- Clear references [0], [2]
- Convert running time into an integer
- Convert dates into Python datetime objects
- Split up long strings into a list
- Convert money into numbers

In [425]:
class Clean:
    """In-place clean-up of the scraped movie dictionaries."""

    def __init__(self, movie_info):
        """Keep a reference to the scraped data.

        Args:
            movie_info: Dict with a 'movie_info' key holding a list of
                per-movie dictionaries; the dictionaries are mutated.
        """
        self.movie_info = movie_info

    @staticmethod
    def _leading_minutes(text):
        """Return the whole number that starts `text`, or None if there is none."""
        if not isinstance(text, str):
            return None
        # `\b` rejects tokens such as "90min"; any whitespace (including a
        # non-breaking space) after the number is accepted.
        match = re.match(r'\s*(\d+)\b', text)
        return int(match.group(1)) if match else None

    def convert_duration(self):
        """Convert each movie's 'Running time' to an integer, in place.

        A string such as "90 minutes" becomes 90. A list of such strings
        becomes the truncated mean of the entries that start with a number;
        entries that do not are ignored. Movies with no 'Running time', with
        a value that is neither a string nor a list, or with no parsable
        entry at all are left untouched, so calling this twice is harmless.
        """
        for item in self.movie_info['movie_info']:
            raw = item.get('Running time')
            if isinstance(raw, str):
                candidates = [raw]
            elif isinstance(raw, (list, tuple)):
                candidates = raw
            else:
                # Missing field, or already converted by an earlier run.
                continue

            minutes = [m for m in map(self._leading_minutes, candidates) if m is not None]
            if minutes:
                item['Running time'] = int(np.mean(minutes))

    def convert_datetime(self):
        """Placeholder: date conversion is not implemented yet; returns None."""
        return

    def convert_money(self):
        """Placeholder: money conversion is not implemented yet; returns None."""
        return

In [426]:
# Normalise running times; the dictionaries inside `movie_data` are modified
# in place, so no new variable is produced here.
clean = Clean(movie_info=movie_data)
clean.convert_duration()

### Write json file

In [473]:
class Write:
    """Persist the scraped movie data as a JSON file."""

    def __init__(self, movie_info):
        """Keep a reference to the data to be written.

        Args:
            movie_info: JSON-serialisable object to dump.
        """
        self.movie_info = movie_info

    def write_json(self, path='lionsgate-movie.json'):
        """Write the data to `path` as UTF-8, indented JSON.

        Args:
            path: Destination file; defaults to 'lionsgate-movie.json' in the
                current working directory. An existing file is overwritten.
        """
        with open(path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps accented titles readable in the file.
            json.dump(self.movie_info, f, ensure_ascii=False, indent=4)

In [474]:
# Dump the scraped data to 'lionsgate-movie.json' in the working directory.
write = Write(movie_info=movie_data)
write.write_json()

### Load json file

In [256]:
 class Load:
    """Read a JSON document back from disk."""

    def __init__(self, file):
        # Path of the JSON file that load_data() will open.
        self.file = file

    def load_data(self):
        """Open `self.file` as UTF-8 text and return the decoded JSON content."""
        with open(self.file, encoding='utf-8') as handle:
            parsed = json.load(handle)
        return parsed

In [257]:
# Read the saved JSON file back into memory.
load = Load(file='lionsgate-movie.json')
movie_info = load.load_data()