In [577]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime

### Get all movie links
- Get web link for movies from 2000 - 2009, and 2010 - 2019
- Get each movie link

In [2]:
class Links:
    def __init__(self, link):
#       Web scrapping initial setups
        self.link = requests.get(link)
        self.soup = bs(self.link.content)
        
    def get_main_links(self):
        ul = self.soup.find_all('ul')[1]
        a = ul.find_all('a')

        links = list()
        years = ['2000', '2010']

        for item in a:
            for year in years:
                if year in item.get('href'):
                    links.append(item.get('href'))
        
        return links
                    
          
    def get_movie_links(self):
        main_links = self.get_main_links()
        movies_links = list()

        for link in main_links:
            req = requests.get(f"https://en.wikipedia.org/{link}")
            content = bs(req.content)

            table = content.find('tbody')
            i = table.find_all('i')
            
            for item in i:
                a = item.find('a')
                if a != None:
                    movies_links.append(a.get('href'))
                else:
                    continue
            
        return movies_links

In [3]:
links = Links("https://en.wikipedia.org/wiki/Lists_of_Lionsgate_films")
all_links = links.get_movie_links()

### Get movie info
- Get each movie information and store it into a dictionary
- Remove tags

In [376]:
class Data:
    def __init__(self, urls):
        self.urls = urls
        self.base_path = "https://en.wikipedia.org/"
        
    def remove_tags(self, content):
        for tag1, tag2 in zip(content.find_all('sup', attrs={'class':'reference'}), content.find_all('span')):
            tag1.decompose()
            tag2.decompose()
        
    def get_text(self, val):
        li = val.find_all('li')
        if li:
            val_context = [i.get_text(' ', strip=True).replace("\xa0", ' ') for i in li]
            return val_context
        elif val.find('br'):
            val_context = [text for text in val.stripped_strings]
            return val_context
        else:
            val_context = val.get_text(' ', strip=True).replace('\xa0', ' ')
            return val_context
        
    def get_data(self): 
        movie_list = {'movie_info':[]}
        
        for url in self.urls:
            req = requests.get(self.base_path + url)
            content = bs(req.content)
            
            self.remove_tags(content)
            
            data = dict()
            keys = content.select('table.infobox.vevent tbody tr th')
            for index, key in enumerate(keys):
                if index == 0:
                    data['Title'] = key.get_text()
                else:
                    val = key.next_sibling
                    if val != None:
                        val_context = self.get_text(val)
                        data[key.get_text(" ", strip=True)] = val_context
                        
            movie_list['movie_info'].append(data)
        return movie_list

In [751]:
data = Data(all_links)
movie_data = data.get_data()

### Clean data

- Clear references [0], [2]
- Convert running time into an integer
- Convert datetime into python datetime object
- Split up long strings into a list
- Convert money into numbers

In [752]:
class Clean:
    def __init__(self, movie_info):
        self.movie_info = movie_info
        
    def convert_duration(self):
        duration = Duration(self.movie_info)
        return duration.get_duration()
                    
                    
    def convert_money(self):
        money = Money(self.movie_info, ['Budget', 'Box office'])
        return money.get_money()   
                    
    
    def convert_datetime(self):
        date = Date(self.movie_info)
        return date.get_date()   

In [753]:
class Duration(Clean):
    def __init__(self, movie_info):
        super().__init__(movie_info)
        
    def get_duration(self):
        for index, item in enumerate(self.movie_info['movie_info']):
            try:
                if isinstance(item['Running time'], str):
                    item['Running time'] = int(item['Running time'].split(' ')[0])

                elif isinstance(item['Running time'], list):
                    avg_duration = int(np.mean([int(dur.split(' ')[0]) for dur in item['Running time']]))
                    item['Running time'] = avg_duration
                else:
                    continue
            except:
                item['Running time'] = None

In [754]:
class Money(Clean):    
    million = r'million\Z'
    currency = r'^\$'
    num = r'[-+]?\d*\.\d+|\d+'
    
    def __init__(self, movie_info, keys):
        super().__init__(movie_info)
        self.keys = keys
        
        
    def get_val(self, money):
        try:
            condition1 = re.search(Money.million, money)
            condition2 = re.search(Money.currency, money)

            if condition1:
                num = re.search(Money.num, money).group()
                result = round(int(num) * 1000000)
                return result

            elif condition2:
                num = re.search(Money.num, money.replace(',','')).group()
                result = round(int(num))
                return result

            elif (not condition1) & (not condition2) & (not condition3):          
                return None
        except:
            pass

    
    def get_money(self):
        for key in self.keys:      
            for index, movie in enumerate(self.movie_info['movie_info']):
                try:
                    if (type(movie[key]) == int) | (movie[key] == None):
                        continue
                    else:
                        final = self.get_val(movie[key])
                        movie[key] = final
                except:
                    movie[key] = None


In [755]:
class Date(Clean):
    date_formats = ['%B %d, %Y', '%d %B %Y']
    
    def __init__(self, movie_info):
        super().__init__(movie_info)
        
    def date_clean(self, date):
        try:
            fixed_date = date.split('(')[0].strip()
            return fixed_date
        except:
            pass
        
    def get_string(self, date):
        if isinstance(date, list):
            date_str = self.date_clean(date[0])
            return date_str

        else:
            date_str = self.date_clean(date)
            return date_str
    
    
    def convert_date(self, date):
        try:
            converted_date = datetime.strptime(self.get_string(date), Date.date_formats[0])
            return converted_date
        except:
            try:
                converted_date = datetime.strptime(self.get_string(date), Date.date_formats[1])
                return converted_date
            except:
                return None
            
                
    def get_date(self):
        for movie in self.movie_info['movie_info']:
            try:
                if type(movie['Release date']) != datetime:
                    final_date = self.convert_date(movie.get('Release date'))
                    movie['Release date'] = final_date
                else:
                    pass
            except:
                movie['Release date'] = None

In [763]:
clean = Clean(movie_data)
clean.convert_duration()
clean.convert_money()
clean.convert_datetime()

### Write and load json file

In [765]:
class Json:

    @staticmethod
    def write_json(movie_info):        
        with open('lionsgate-movie.json', 'w', encoding='utf-8') as f:
            json.dump(movie_info, f, ensure_ascii=False, indent=4)
            
    @staticmethod
    def load_data(file):
        with open(file, encoding='utf-8') as f:
            return json.load(f)

In [766]:
Json.write_json(movie_data)

TypeError: Object of type datetime is not JSON serializable

In [None]:
movie_json = Json.load_data('lionsgate-movie.json')

In [669]:
date = 'February 01, 2003'
fmt = '%B %d, %Y'

In [747]:
print(type(datetime.strptime(date, fmt)) == datetime)

True
