In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime

### Get all movie links
- Get the links to the film list pages for 2000–2009 and 2010–2019
- Collect the link to each movie's article from those list pages

In [2]:
class Links:
    """Collect Wikipedia article links for the films listed on an index page.

    The index page is downloaded and parsed once at construction time;
    ``get_movie_links`` then follows the per-decade list pages it references.
    """

    def __init__(self, link):
        """Download and parse the index page.

        Args:
            link: Absolute URL of the index page.
        """
#       Web scraping initial setup
        self.link = requests.get(link)
        self.soup = bs(self.link.content)

    def get_main_links(self):
        """Return the hrefs of the list pages whose URL mentions 2000 or 2010.

        Only anchors inside the second ``<ul>`` of the page are considered.

        Returns:
            list[str]: Matching hrefs in document order, each at most once.
        """
        ul = self.soup.find_all('ul')[1]
        years = ('2000', '2010')

        links = list()
        for anchor in ul.find_all('a'):
            href = anchor.get('href')
            # Anchors without an href return None; skip them instead of
            # failing on `year in None`. `any` also stops an href that
            # mentions both years from being added twice.
            if href and any(year in href for year in years):
                links.append(href)

        return links

    def get_movie_links(self):
        """Return the href of every italicised link in each list page's first table body.

        Returns:
            list[str]: Movie hrefs, in the order they appear on the pages.
        """
        movies_links = list()

        for link in self.get_main_links():
            # Strip the leading slash so the URL does not contain "//wiki/...".
            req = requests.get(f"https://en.wikipedia.org/{link.lstrip('/')}")
            content = bs(req.content)

            table = content.find('tbody')
            if table is None:
                # Page has no table body (layout change or error page).
                continue

            for item in table.find_all('i'):
                a = item.find('a')
                href = a.get('href') if a is not None else None
                # Skip italic text without a link, and anchors without href.
                if href:
                    movies_links.append(href)

        return movies_links

In [3]:
# Parse the Lionsgate index page, then gather the movie article links
# from the list pages it points to.
links = Links("https://en.wikipedia.org/wiki/Lists_of_Lionsgate_films")
all_links = links.get_movie_links()

### Get movie info
- Get each movie information and store it into a dictionary
- Remove tags

In [26]:
class Data:
    """Scrape the infobox of each movie article into a dictionary."""

    def __init__(self, urls):
        """Store the article paths to scrape.

        Args:
            urls: Iterable of article paths that are appended to ``base_path``.
        """
        self.urls = urls
        self.base_path = "https://en.wikipedia.org/"

    def remove_tags(self, content):
        """Strip citation markers (and some spans) from a parsed page, in place.

        Every ``<sup class="reference">`` is removed. The original loop paired
        references with spans through ``zip``, so it stopped removing
        references once the spans ran out; references are now always removed.

        Args:
            content: Parsed document exposing ``find_all``.
        """
        references = content.find_all('sup', attrs={'class': 'reference'})
        spans = content.find_all('span')

        for tag in references:
            tag.decompose()

        # NOTE(review): as before, only as many spans as there are references
        # are removed (the first ones in document order). Kept to avoid
        # changing which infobox text survives -- confirm the intent.
        for tag in spans[:len(references)]:
            tag.decompose()

    def get_text(self, val):
        """Extract the text of an infobox value cell.

        Args:
            val: The element holding the value.

        Returns:
            list[str] when the cell contains ``<li>`` items or ``<br>`` breaks,
            otherwise a single str. Non-breaking spaces become plain spaces in
            every branch (the ``<br>`` branch previously kept them).
        """
        items = val.find_all('li')
        if items:
            return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in items]
        if val.find('br'):
            return [text.replace('\xa0', ' ') for text in val.stripped_strings]
        return val.get_text(' ', strip=True).replace('\xa0', ' ')

    def get_data(self):
        """Download every article and collect its infobox rows.

        Returns:
            dict: ``{'movie_info': [record, ...]}`` with one record per URL.
            A record maps ``'Title'`` plus each infobox header to its value;
            it is empty when the page has no matching infobox.
        """
        movie_list = {'movie_info': []}

        for url in self.urls:
            # Join with exactly one slash between host and path.
            req = requests.get(self.base_path.rstrip('/') + '/' + url.lstrip('/'))
            content = bs(req.content)

            self.remove_tags(content)

            data = dict()
            keys = content.select('table.infobox.vevent tbody tr th')
            for index, key in enumerate(keys):
                if index == 0:
                    # The first header cell of the infobox is the film title.
                    data['Title'] = key.get_text(' ', strip=True)
                    continue

                # Ask for the <td> explicitly: `next_sibling` can be a
                # whitespace text node, which has no tag-search methods.
                val = key.find_next_sibling('td')
                if val is not None:
                    data[key.get_text(" ", strip=True)] = self.get_text(val)

            movie_list['movie_info'].append(data)
        return movie_list

In [36]:
# Scrape the infobox of every collected movie link (one request per link).
data = Data(all_links)
movie_data = data.get_data()

### Clean data

- Clear references [0], [2]
- Convert running time into an integer
- Convert datetime into python datetime object
- Split up long strings into a list
- Convert money into numbers

In [37]:
class Clean:
    """Entry point for the cleaning steps applied to the scraped movie data.

    Each ``convert_*`` method hands the shared ``movie_info`` dictionary to a
    dedicated helper class and returns whatever that helper returns.
    """

    def __init__(self, movie_info):
        # Dictionary produced by the scraper; the helpers operate on it.
        self.movie_info = movie_info

    def convert_duration(self):
        """Run the running-time conversion."""
        return Duration(self.movie_info).get_duration()

    def convert_money(self):
        """Run the money conversion for the budget and box-office fields."""
        money_keys = ['Budget', 'Box office']
        return Money(self.movie_info, money_keys).get_money()

    def convert_datetime(self):
        """Run the release-date conversion."""
        return Date(self.movie_info).get_date()

In [38]:
class Duration(Clean):
    """Convert the 'Running time' field of every movie into whole minutes."""

    def __init__(self, movie_info):
        super().__init__(movie_info)

    def get_duration(self):
        """Replace each 'Running time' with an int, in place.

        A string such as ``'102 minutes'`` becomes ``102``; a list of such
        strings becomes the truncated mean of its values. Values of any other
        type are left untouched. Missing or unparsable values become ``None``.
        """
        for item in self.movie_info['movie_info']:
            try:
                running_time = item['Running time']
                if isinstance(running_time, str):
                    item['Running time'] = int(running_time.split(' ')[0])
                elif isinstance(running_time, list):
                    minutes = [int(dur.split(' ')[0]) for dur in running_time]
                    item['Running time'] = int(np.mean(minutes))
            # Only the failures this parsing can produce are treated as
            # "no usable value"; a bare `except:` would also swallow
            # KeyboardInterrupt and SystemExit.
            except (KeyError, ValueError, AttributeError, TypeError, IndexError):
                item['Running time'] = None

In [40]:
class Date(Clean):
    """Convert the 'Release date' field of every movie into a ``datetime``."""

    # Formats tried in order, e.g. 'January 21, 2000' and '21 January 2000'.
    date_formats = ['%B %d, %Y', '%d %B %Y']

    def __init__(self, movie_info):
        super().__init__(movie_info)

    def date_clean(self, date):
        """Return the text before the first '(' with whitespace normalised.

        Args:
            date: Raw date text.

        Returns:
            The cleaned string, or ``None`` when ``date`` is not a string.
        """
        if not isinstance(date, str):
            return None
        # Non-breaking spaces would otherwise make strptime fail.
        return date.split('(')[0].replace('\xa0', ' ').strip()

    def get_string(self, date):
        """Return the cleaned date text; for a list only its first entry is used."""
        if isinstance(date, list):
            return self.date_clean(date[0]) if date else None
        return self.date_clean(date)

    def convert_date(self, date):
        """Parse ``date`` with the known formats.

        Returns:
            datetime, or ``None`` when no format matches or there is no text.
        """
        date_str = self.get_string(date)
        if not date_str:
            return None

        for fmt in Date.date_formats:
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                # Wrong format -- try the next one.
                continue
        return None

    def get_date(self):
        """Replace each 'Release date' with a datetime (or ``None``), in place.

        Values that are already datetimes are kept, so the step can be re-run.
        Movies without the field get the key set to ``None``.
        """
        for movie in self.movie_info['movie_info']:
            release = movie.get('Release date')
            if isinstance(release, datetime):
                continue
            movie['Release date'] = self.convert_date(release)
                
                

In [358]:
class Money(Clean):
    """Convert money fields such as '$7 million' into plain numbers."""

    # First number in the text: optional sign, with or without decimals.
    num = r'[-+]?\d*\.\d+|\d+'
    # Multipliers for the scale word that follows the number.
    scales = {'million': 1000000, 'billion': 1000000000}
    # Fixed GBP -> USD rate (the previously hard-coded 1.32).
    gbp_to_usd = 1.32

    def __init__(self, movie_info, keys):
        """
        Args:
            movie_info: Dictionary with a 'movie_info' list of movie records.
            keys: Names of the record fields that hold money values.
        """
        super().__init__(movie_info)
        self.keys = keys

    def get_val(self, money):
        """Parse one money value.

        Args:
            money: Text such as '$7 million', '$3,728,888' or '£2 million'.
                A list of text fragments is joined with spaces first.

        Returns:
            float: The amount, with pound values multiplied by ``gbp_to_usd``;
            ``None`` when there is no '$'/'£' sign or no number.
        """
        if isinstance(money, (list, tuple)):
            # Multi-line cells arrive as lists; previously they always
            # produced None because `'$' in list` tests for an exact element.
            money = ' '.join(str(part) for part in money)
        if not isinstance(money, str):
            return None

        if '$' in money:
            rate = 1.0
        elif '£' in money:
            rate = Money.gbp_to_usd
        else:
            return None

        # Drop thousands separators in every branch, not only for plain amounts.
        cleaned = money.replace(',', '')
        number = re.search(Money.num, cleaned)
        if number is None:
            return None

        # Use the first scale word AFTER the number, so a range such as
        # '$7-10 million' still scales and '$1.2 billion' is no longer 1.2.
        scale_word = re.search(r'million|billion', cleaned[number.end():],
                               flags=re.IGNORECASE)
        scale = Money.scales[scale_word.group().lower()] if scale_word else 1

        return float(number.group()) * scale * rate

    def get_money(self):
        """Replace every configured money field with its numeric value, in place.

        Fields that are already numeric are kept, so the step can be re-run.
        Missing fields and values that cannot be parsed become ``None``.
        """
        for key in self.keys:
            for movie in self.movie_info['movie_info']:
                current = movie.get(key)
                if current is None:
                    # Also creates the key when it was absent, as before.
                    movie[key] = None
                elif isinstance(current, (int, float)):
                    continue
                else:
                    try:
                        movie[key] = self.get_val(current)
                    except (AttributeError, TypeError, ValueError):
                        movie[key] = None


In [366]:
# Run the three cleaning steps on the scraped data:
# running time, money fields (Budget / Box office), and release date.
clean = Clean(movie_data)
clean.convert_duration()
clean.convert_money()
clean.convert_datetime()

### Copy data and convert datetime into string
Reason: Python `datetime` objects are not JSON-serializable.

In [367]:
class Copy:
    """Build a copy of the movie data whose release dates are strings."""

    def __init__(self, data):
        """
        Args:
            data: Dictionary with a 'movie_info' list of movie records.
        """
        self.data = data
        self.data_copy = {'movie_info': []}

    def get_copy(self):
        """Fill ``data_copy`` with a shallow copy of every movie record.

        The list is rebuilt on each call, so running this twice no longer
        duplicates every record. Records are copied one level deep: nested
        lists are still shared with the source data.
        """
        self.data_copy['movie_info'] = [
            movie.copy() for movie in self.data['movie_info']
        ]

    def initial_date(self):
        """Format each copied 'Release date' as text such as 'January 21, 2000'.

        Only values that provide ``strftime`` are converted; ``None``, missing
        and already formatted values are left as they are, so the method is
        safe to call more than once.
        """
        fmt = '%B %d, %Y'
        for movie in self.data_copy['movie_info']:
            date = movie.get('Release date')
            if date is not None and hasattr(date, 'strftime'):
                movie['Release date'] = date.strftime(fmt)

In [368]:
# Make a copy of the movie records and turn its release dates into strings;
# movie_data itself keeps its datetime values.
copy = Copy(movie_data)
copy.get_copy()
copy.initial_date()
movie_data_copy = copy.data_copy

### Write and load json file

In [371]:
class Json:
    """Helpers to write the movie data to a JSON file and read it back."""

    @staticmethod
    def write_json(movie_info, file='lionsgate-movie.json'):
        """Write ``movie_info`` as indented UTF-8 JSON.

        Args:
            movie_info: JSON-serializable data.
            file: Destination path; defaults to the previously hard-coded
                'lionsgate-movie.json'.
        """
        with open(file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(movie_info, f, ensure_ascii=False, indent=4)

    @staticmethod
    def load_data(file):
        """Return the parsed content of the UTF-8 JSON file at ``file``."""
        with open(file, encoding='utf-8') as f:
            return json.load(f)

In [372]:
# Save the string-dated copy to 'lionsgate-movie.json'.
Json.write_json(movie_data_copy)

In [373]:
# Read the saved file back to check that it loads.
movie_json = Json.load_data('lionsgate-movie.json')

### Check the data using pandas

In [386]:
# One row per movie record; this uses movie_data (datetime release dates),
# not the string-dated copy.
df = pd.DataFrame(movie_data['movie_info'])

In [387]:
# Let pandas display up to 500 rows and 500 columns before truncating.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [388]:
# Preview the first rows of the full table.
df.head()

Unnamed: 0,Title,Directed by,Screenplay by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production companies,Distributed by,Release date,Running time,Countries,Language,Budget,Box office,Country,Written by,Production company,Languages,Story by,Narrated by,Released,Genre,Label,Producer,Length,Recorded,French,Italian,Traditional,Simplified,Mandarin,Cantonese,Adaptation by,Spanish,Literally,Chinese,Studio
0,American Psycho,Mary Harron,"[Mary Harron, Guinevere Turner]","[American Psycho, by, Bret Easton Ellis]","[Edward R. Pressman, Chris Hanley, Christian H...","[Christian Bale, Willem Dafoe, Jared Leto, Jos...",Andrzej Sekuła,Andrew Marcus,John Cale,"[Edward R. Pressman Productions, Muse Producti...","[Lions Gate Films, (United States), Columbia T...",2000-01-21,102.0,"[United States, Canada]",English,7000000.0,34300000.0,,,,,,,,,,,,,,,,,,,,,,,
1,The Big Kahuna,John Swanbeck,Roger Rueff,"[Hospitality Suite, by Roger Rueff]","[Elie Samaha, Kevin Spacey, Andrew Stevens]","[Kevin Spacey, Danny DeVito, Peter Facinelli]",Anastas N. Michos,Peggy Davis,Christopher Young,"[Franchise Pictures, Trigger Street Productions]",Lions Gate Films,1999-09-16,90.0,,English,7000000.0,3728888.0,United States,,,,,,,,,,,,,,,,,,,,,,
2,Women,Peter Greenaway,,,Kees Kasander,"[John Standing, Matthew Delamere, Vivian Wu, S...","[Reinier van Brummelen, Sacha Vierny]",Elmer Leupen,"[Frank Loesser, Giuseppe Verdi]",,Lions Gate Films,1999-05-22,118.0,"[United Kingdom, Netherlands, Luxembourg, Germ...",,,424123.0,,Peter Greenaway,Movie Masters,"[English, Italian, Japanese, Latin]",,,,,,,,,,,,,,,,,,,
3,Jesus' Son,Alison Maclean,,"[Jesus' Son, by, Denis Johnson]","[Elizabeth Cuthrell, Lydia Dean Pilcher, David...","[Billy Crudup, Samantha Morton, Denis Leary, H...",Adam Kimmel,"[Stuart Levy, Geraldine Peroni]",Joe Henry,"[Evenstar Films, Alliance Atlantis]",Lions Gate Films,1999-09-05,107.0,"[Canada, United States]",English,2500000.0,1302067.0,,"[Elizabeth Cuthrell, David Urrutia, Oren Mover...",,,,,,,,,,,,,,,,,,,,,
4,But I'm a Cheerleader,Jamie Babbit,Brian Wayne Peterson,,"[Leanna Creel, Andrea Sperling]","[Natasha Lyonne, Clea DuVall, Cathy Moriarty, ...",Jules Labarthe,Cecily Rhett,Pat Irwin,"[Ignite Entertainment, The Kushner-Locke Company]",Lions Gate Films,1999-09-12,88.0,,English,1000000.0,2600000.0,United States,,,,Jamie Babbit,,,,,,,,,,,,,,,,,,


### Delete unnecessary columns

In [423]:
def drop_col(dataframe, max_na_ratio=0.5):
    """Return a copy of ``dataframe`` without its mostly-empty columns.

    A column is dropped when its number of missing values is greater than
    ``len(dataframe) * max_na_ratio``. The threshold is computed from the
    frame that was passed in (the original read the global ``df``), and the
    input frame is no longer modified.

    Args:
        dataframe: Frame to filter.
        max_na_ratio: Largest share of missing values a column may have and
            still be kept. The default keeps the original "more than half
            missing" rule.

    Returns:
        pandas.DataFrame: A new frame holding only the retained columns.
    """
    na_counts = dataframe.isna().sum()
    keep = (na_counts <= len(dataframe) * max_na_ratio).to_numpy()
    # Positional boolean mask, so duplicate column names cannot confuse it.
    return dataframe.loc[:, keep].copy()

In [429]:
# Remove the columns that are missing for more than half of the movies.
cleaned_df = drop_col(df)

In [430]:
# Preview the first rows after dropping the sparse columns.
cleaned_df.head()

Unnamed: 0,Title,Directed by,Produced by,Starring,Cinematography,Edited by,Music by,Production companies,Distributed by,Release date,Running time,Language,Budget,Box office,Country,Written by
0,American Psycho,Mary Harron,"[Edward R. Pressman, Chris Hanley, Christian H...","[Christian Bale, Willem Dafoe, Jared Leto, Jos...",Andrzej Sekuła,Andrew Marcus,John Cale,"[Edward R. Pressman Productions, Muse Producti...","[Lions Gate Films, (United States), Columbia T...",2000-01-21,102.0,English,7000000.0,34300000.0,,
1,The Big Kahuna,John Swanbeck,"[Elie Samaha, Kevin Spacey, Andrew Stevens]","[Kevin Spacey, Danny DeVito, Peter Facinelli]",Anastas N. Michos,Peggy Davis,Christopher Young,"[Franchise Pictures, Trigger Street Productions]",Lions Gate Films,1999-09-16,90.0,English,7000000.0,3728888.0,United States,
2,Women,Peter Greenaway,Kees Kasander,"[John Standing, Matthew Delamere, Vivian Wu, S...","[Reinier van Brummelen, Sacha Vierny]",Elmer Leupen,"[Frank Loesser, Giuseppe Verdi]",,Lions Gate Films,1999-05-22,118.0,,,424123.0,,Peter Greenaway
3,Jesus' Son,Alison Maclean,"[Elizabeth Cuthrell, Lydia Dean Pilcher, David...","[Billy Crudup, Samantha Morton, Denis Leary, H...",Adam Kimmel,"[Stuart Levy, Geraldine Peroni]",Joe Henry,"[Evenstar Films, Alliance Atlantis]",Lions Gate Films,1999-09-05,107.0,English,2500000.0,1302067.0,,"[Elizabeth Cuthrell, David Urrutia, Oren Mover..."
4,But I'm a Cheerleader,Jamie Babbit,"[Leanna Creel, Andrea Sperling]","[Natasha Lyonne, Clea DuVall, Cathy Moriarty, ...",Jules Labarthe,Cecily Rhett,Pat Irwin,"[Ignite Entertainment, The Kushner-Locke Company]",Lions Gate Films,1999-09-12,88.0,English,1000000.0,2600000.0,United States,


### Save dataframe to a csv file

In [432]:
# Write the table to CSV; with pandas' defaults the row index is saved
# as the first column.
cleaned_df.to_csv('lionsgate-movies-data.csv')