In [47]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from tqdm.notebook import tqdm_notebook as tqdm

In [48]:
# Load the movie-plot CSV from the current working directory and show its (rows, columns).
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.shape

(34886, 8)

# Preprocessing

In [49]:
# Keep only the films whose genre label is not the placeholder 'unknown'.
df = df.loc[df['Genre'].ne('unknown')]
df.shape

(28803, 8)

In [50]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...


In [51]:
# Number of distinct raw genre labels.
df['Genre'].nunique()

2264

In [52]:
# 1.1. Dictionary-based conversion derived from Wikipedia's Film genre page.
# Each key is a target genre; its list holds the raw labels that
# genres_preprocessing() rewrites to that key. An empty list means the genre
# has no aliases and is only matched by its own name.

conversion_dict = {
    "action": ["disaster", "martial arts", "spy", "superhero", "wuxia","action","masala","espionage","arts"],
    "adventure": ["pirate", "swashbuckler", "samurai"],
    "animation": ["cgi", "cutout", "live-action animated film", "stop motion", "animated", "computer-animated", "anime"],
    "comedy": ["buddy", "mockumentary", "parody", "slapstick"],
    "drama": ["docudrama", "melodrama", "biodrama", "bio-drama"],
    "historical": ["history", "historic", "alternate history", "period", "period piece", "biopic", "bio-pic", "biographical"],
    "horror": ["ghost", "monster", "vampire", "werewolf", "slash", "splatter", "zombie", "j-horror","supernatural"],
    "science fiction": ["dystopian", "dystopia", "post-apocalyptic", "steampunk", "tech noir", "utopian", "science-fiction", "scifi", "sci-fi", "space", "tokusatsu","fiction"],
    "thriller": ["mystery", "detective", "crime","suspense"],
    "musical": ["operetta"],
    "romance": ["love","romantic"],
    "western": ["cowboy"],
    "documentary": ["pseudo-documentary"],
    "fantasy":[],
    "sport":["sports","races","dance","biker"],
    # NOTE(review): 'ii' / 'i' presumably catch the last word of labels such as "world war ii" — confirm.
    "war":['ii','i'],
    "erotic":['ero','adult','erotic','sexploitation'],
    "social":['socio','costume']
}

In [53]:
import re

#preprocessing function helper for genres reduction
def genres_preprocessing(genres_dict, genre):
    """Collapse a raw genre label into a single coarse genre name.

    The label goes through these steps, in order:

    1. the filler substrings 'film' and 'short' are removed;
    2. only the text before the first ',' or '/' is kept;
    3. if any dash-separated piece is a key of ``genres_dict``, only the
       first piece is kept;
    4. only the last space-separated word is kept;
    5. for each entry of ``genres_dict`` whose alias list contains the
       current label, the label is replaced by that entry's key.

    Parameters
    ----------
    genres_dict : dict
        Maps a target genre to the list of raw labels it absorbs.
    genre : str
        Raw genre label.

    Returns
    -------
    str
        The reduced label with trailing whitespace removed (may be empty).
    """
    # Filler words are removed as plain substrings, one after the other.
    for filler in ('film', 'short'):
        if filler in genre:
            genre = genre.replace(filler, '').strip()

    # Multi-genre labels ("a/b", "a, b"): only the first genre survives.
    genre = re.split("[,/]", genre)[0]

    # Hyphenated labels: keep the leading piece when a known genre is among the pieces.
    dash_parts = re.split("[-—–]", genre)
    if not genres_dict.keys().isdisjoint(dash_parts):
        genre = dash_parts[0]

    # Multi-word labels: the last word is taken as the genre.
    genre = genre.rstrip().split(" ")[-1]

    # Alias lookup; the scan deliberately continues after a match.
    for target, aliases in genres_dict.items():
        if genre.rstrip() in aliases:
            genre = target

    return genre.rstrip()

In [54]:
# Add the reduced genre as a new 'Genre2' column. assign() builds a new frame
# instead of writing into `df`, which at this point is the result of a boolean
# filter (writing into it in place triggers pandas' SettingWithCopyWarning).
df = df.assign(
    Genre2=df['Genre'].apply(lambda x: genres_preprocessing(conversion_dict, x))
)

In [55]:
# Thresholds used by the filters below.
MIN_FILMS_PER_GENRE = 50   # genres with fewer films than this are discarded
MIN_PLOT_LENGTH = 25       # plots shorter than this many characters are discarded

# 1) Remove genres that are too rare.
counts = df['Genre2'].value_counts()
to_remove = counts[counts < MIN_FILMS_PER_GENRE].index
df = df[~df.Genre2.isin(to_remove)]

# 2) Remove films whose plot is too short. The rows are dropped by building a
#    new frame rather than with inplace=True: `df` is itself a filtered frame,
#    and mutating it in place raises SettingWithCopyWarning.
drop_id = df.index[df['Plot'].apply(len) < MIN_PLOT_LENGTH]
df = df.drop(index=drop_id)

# 3) Remove films whose reduced genre ended up empty.
df = df[df['Genre2'] != '']

In [10]:
#g = df.groupby("Genre2")
#new_df = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

# Scraping

In [11]:
# Genres with fewer than 1000 films left in the dataset; the scraping loop
# below iterates over this list to collect additional films for them.
vc = df['Genre2'].value_counts()
genres_to_fetch = vc[vc < 1000].index.tolist()
genres_to_fetch

['western',
 'animation',
 'science fiction',
 'adventure',
 'musical',
 'war',
 'noir',
 'family',
 'fantasy',
 'historical',
 'biography',
 'social',
 'documentary',
 'serial',
 'sport']

In [12]:
#LINKS GETTER FUNCTIONS
import requests
from bs4 import BeautifulSoup

def get_links(src, stw="/", genre="", listof=False):
    """Collect hrefs from an HTML document.

    Parameters
    ----------
    src : str
        HTML markup to parse.
    stw : str, default "/"
        Prefix an href must start with to be kept.
    genre : str, default ""
        Genre name; spaces are replaced by underscores and the result must
        occur in the lower-cased href. The empty default matches every href.
    listof : bool, default False
        If False, every ``<a>`` element is considered. If True, only the first
        ``<a>`` found inside each ``<i>`` element is considered.

    Returns
    -------
    list of str
        Matching hrefs in document order; duplicates are kept.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(src, "html.parser")

    # The href is lower-cased before matching, so the slug must be as well.
    slug = genre.replace(" ", "_").lower()

    if listof:
        anchors = (italic.find('a') for italic in soup.find_all('i'))
    else:
        anchors = soup.find_all('a')

    links = []
    for anchor in anchors:
        if anchor is None:  # <i> element without a link inside
            continue
        href = anchor.get('href')
        if href and href.startswith(stw) and slug in href.lower():
            links.append(href)

    return links

def request_links(base, in_links, listof=False, timeout=30):
    """Download one or several pages and return every site-relative link in them.

    Parameters
    ----------
    base : str
        URL prefix prepended to each path, e.g. "https://en.wikipedia.org".
    in_links : str or iterable of str
        A single path, or any iterable of paths (list, tuple, set, ...).
        Duplicated paths in an iterable are requested only once.
    listof : bool, default False
        Forwarded to ``get_links``.
    timeout : float, default 30
        Seconds to wait for each HTTP response; without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        Links gathered from all downloaded pages (``get_links`` defaults:
        hrefs starting with "/", no genre filter).

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or times out.
    """
    if isinstance(in_links, str):
        paths = [in_links]
    else:
        # Deduplicate while keeping first-seen order.
        paths = list(dict.fromkeys(in_links))

    out_links = []
    for path in paths:
        html = requests.get(base + path, timeout=timeout).text
        out_links.extend(get_links(html, listof=listof))

    return out_links

In [13]:
#FETCHING FUNCTIONS
def fetching_title_plot(soup):
    """Extract the page title and the text of the "Plot" section.

    The plot is the concatenation of every ``<p>`` sibling that follows the
    parent of ``<span id="Plot">``, up to the next ``<h2>`` sibling or the end
    of the sibling list, whichever comes first.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article page.

    Returns
    -------
    tuple
        ``(title, plot)`` as strings, or ``(None, None)`` when the page has no
        Plot span or no ``<h1 id="firstHeading">`` element.
    """
    plot_span = soup.find("span", {"id": "Plot"})
    heading = soup.find("h1", {"id": "firstHeading"})
    if plot_span is None or heading is None:
        return None, None

    paragraphs = []
    node = plot_span.find_parent().next_sibling
    # Stop at the next section heading, or when the siblings run out
    # (the Plot section can be the last one on the page).
    while node is not None and node.name != 'h2':
        if node.name == 'p':
            paragraphs.append(node.text.strip('\n'))
        node = node.next_sibling

    return heading.text, "".join(paragraphs)
        

def fetching_cast(soup):
    """Return the titles of the links in the infobox "Starring" cell.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article page.

    Returns
    -------
    list of str
        ``title`` attribute of every ``<a>`` in the ``<td>`` that follows the
        ``<th>`` whose text is exactly "Starring". Links without a title are
        skipped. Empty list when the header or its cell is missing.
    """
    header = soup.find('th', string='Starring')
    if header is None:
        return []

    # find_next_sibling skips whitespace text nodes that may sit between the
    # <th> and its <td>; the immediate sibling is not guaranteed to be a tag.
    cell = header.find_next_sibling('td')
    if cell is None:
        return []

    titles = (anchor.get('title') for anchor in cell.find_all('a'))
    return [title for title in titles if title]

def _infobox_cell_text(soup, label):
    """Return the text of the <td> following the <th> whose text is `label`, or None."""
    header = soup.find("th", string=label)
    if header is None:
        return None
    # find_next_sibling skips any text node between the header and its cell.
    cell = header.find_next_sibling("td")
    return None if cell is None else cell.text


def fetching_director_date(soup):
    """Extract the director and the release year from the infobox.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article page.

    Returns
    -------
    tuple
        ``(director, date)``. ``director`` is the raw text of the
        "Directed by" cell. ``date`` is the first 4-digit number found in the
        "Release date" cell, as a string. Either value is None when the row is
        missing; ``date`` is also None when the cell has no 4-digit number.
    """
    director = _infobox_cell_text(soup, 'Directed by')

    release = _infobox_cell_text(soup, 'Release date')
    year = re.search(r"\d{4}", release) if release else None
    date = year.group(0) if year else None

    return director, date


#FINAL FETCHING FUNCTION
def fetching_film_info(link, timeout=30):
    """Download a film page and gather its title, plot, director, year and cast.

    Parameters
    ----------
    link : str
        Absolute URL of the film page.
    timeout : float, default 30
        Seconds to wait for the HTTP response, so a stalled connection cannot
        block a worker forever.

    Returns
    -------
    dict or None
        ``{'Release Year', 'Title', 'Director', 'Cast', 'Plot'}`` for pages
        with a plot; None when no plot could be extracted.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    html = requests.get(link, timeout=timeout).text
    # Explicit parser: results should not depend on which parsers are installed.
    soup = BeautifulSoup(html, "html.parser")

    title, plot = fetching_title_plot(soup)
    if plot is None:
        # Pages without a plot are discarded; skip the infobox parsing.
        return None

    director, date = fetching_director_date(soup)
    cast = fetching_cast(soup)
    return {'Release Year': date, 'Title': title, 'Director': director, 'Cast': cast, 'Plot': plot}


In [14]:
#SCRAPING FUNCTIONS
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from functools import partial

#scraping function
def plot_scraper(film_list, genre, link):
    """Scrape one film page and, if it has a plot, append its record to `film_list`.

    `link` is a site-relative path; it is appended to the English Wikipedia
    host. The record is tagged with `genre` under the 'Genre' key. Nothing is
    appended when the page yields no film info.
    """
    record = fetching_film_info("https://en.wikipedia.org" + link)
    if record is None:
        return
    record['Genre'] = genre
    film_list.append(record)
    
#multithreaded scraping with a tqdm progress bar
def set_up_threads(links, film_list, genre, max_workers=8):
    """Scrape every link concurrently, showing a progress bar.

    Parameters
    ----------
    links : sized iterable of str
        Site-relative film page paths (list or set).
    film_list : list
        Shared list that ``plot_scraper`` appends the scraped records to.
    genre : str
        Genre label attached to every scraped record.
    max_workers : int, default 8
        Number of worker threads.

    Returns
    -------
    None
        Results are delivered through `film_list`. A page that raises in its
        worker does not stop the run; the number of failed pages is printed at
        the end so failures are not lost silently.
    """
    total = len(links)
    failed = []
    with tqdm(total=total) as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(plot_scraper, film_list, genre, link): link
                for link in links
            }
            for future in as_completed(futures):
                # An exception raised inside a worker is stored on its future;
                # inspect it, otherwise the failure goes unnoticed.
                if future.exception() is not None:
                    failed.append(futures[future])
                pbar.update(1)

    if failed:
        print(len(failed), "of", total, "pages could not be scraped for", genre.upper())

#old multithreaded scraping function
#def set_up_threads(links, film_list, genre):
#    with ThreadPoolExecutor(max_workers=6) as executor:
#        return executor.map( partial(plot_scraper, film_list, genre),links,timeout=30 )    
            
    

#test cell for film scraping with a single genre ('western')
# This cell performs live HTTP requests; only the first 50 collected links are scraped.
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
main_res = requests.get(genres_link).text
# hrefs of the sidebar page that start with "/wiki/List" and contain "western"
genre_links = get_links(main_res, "/wiki/List", 'western')
# every site-relative link found on those list pages (no genre filter is applied here)
genre_sublinks = request_links(base, genre_links)

films = []
set_up_threads( genre_sublinks[:50], films, 'western' )
films

In [15]:
from tqdm.notebook import tqdm_notebook as tqdm
#main scraping cycle
# For every genre in genres_to_fetch: locate its "/wiki/List..." pages in the
# genre sidebar, gather the links they contain and scrape each linked page.
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
main_res = requests.get(genres_link).text

#final films list to populate with new scraped films
films_list = []

for genre in genres_to_fetch:
    print(genre.upper())

    # Name pattern of the sub-list pages, e.g. "List_of_Western_films";
    # only 'western' is capitalised, other genres just get underscores.
    genre_to_match = "List_of_{}_films".format(
        genre.title() if genre == 'western' else genre.replace(" ", "_")
    )

    genre_links = get_links(main_res, "/wiki/List", genre)
    for link in tqdm(genre_links):
        print("Fetching films -->")
        genre_sublinks = set(request_links(base, link))

        if "Lists" not in link:
            # Plain list page: every site-relative link on it is a scraping candidate.
            targets = genre_sublinks
        else:
            # Index of lists: follow each matching sub-list and keep the
            # italicised links found there.
            print("Fetching films from sublists for ", genre.upper())
            final_genre_sublinks = []
            for fl in tqdm(genre_sublinks):
                if genre_to_match in fl:
                    final_genre_sublinks.extend(request_links(base, fl, listof=True))
            targets = set(final_genre_sublinks)

        print("Scraping -->\n")
        set_up_threads(targets, films_list, genre)
            

WESTERN


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Fetching films from sublists for  WESTERN


HBox(children=(FloatProgress(value=0.0, max=174.0), HTML(value='')))


Scraping -->



HBox(children=(FloatProgress(value=0.0, max=3423.0), HTML(value='')))



ANIMATION


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


SCIENCE FICTION


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Fetching films -->
Fetching films from sublists for  SCIENCE FICTION


HBox(children=(FloatProgress(value=0.0, max=419.0), HTML(value='')))


Scraping -->



HBox(children=(FloatProgress(value=0.0, max=2180.0), HTML(value='')))


Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=1291.0), HTML(value='')))



ADVENTURE


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Fetching films from sublists for  ADVENTURE


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))


Scraping -->



HBox(children=(FloatProgress(value=0.0, max=3974.0), HTML(value='')))



MUSICAL


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=2132.0), HTML(value='')))



WAR


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=174.0), HTML(value='')))


Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=856.0), HTML(value='')))



NOIR


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=1103.0), HTML(value='')))


Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=1429.0), HTML(value='')))



FAMILY


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


FANTASY


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Fetching films from sublists for  FANTASY


HBox(children=(FloatProgress(value=0.0, max=370.0), HTML(value='')))


Scraping -->



HBox(children=(FloatProgress(value=0.0, max=1069.0), HTML(value='')))



HISTORICAL


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


BIOGRAPHY


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


SOCIAL


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


DOCUMENTARY


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=4175.0), HTML(value='')))



SERIAL


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


SPORT


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fetching films -->
Scraping -->



HBox(children=(FloatProgress(value=0.0, max=1419.0), HTML(value='')))





In [56]:
# Turn the scraped records (one dict per film) into a frame and save it
# on its own, without the index column, before merging.
df_scraped = pd.DataFrame(films_list)
df_scraped.to_csv('scraped_movies.csv',index=False)

In [57]:
# Drop the raw genre and the two columns not carried into the merged frame,
# then promote the reduced 'Genre2' label to 'Genre'.
df = (
    df.drop(columns=['Genre', 'Wiki Page', 'Origin/Ethnicity'])
      .rename(columns={'Genre2': 'Genre'})
)

# Stack the original and the scraped films under a fresh 0..n-1 index.
df = pd.concat([df, df_scraped], ignore_index=True)

# NOTE(review): unlike scraped_movies.csv, this file is written with the index
# column — confirm that is intended.
df.to_csv('movie_dataset_classification.csv')