In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.shape

(34886, 8)

# Preprocessing

In [3]:
df = df[df['Genre'] != 'unknown']
df.shape

(28803, 8)

In [4]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...


In [5]:
df['Genre'].nunique()

2264

In [6]:
# 1.1. dictionary-based conversion readacting Wikipedia's Film genre page:

conversion_dict = {
    "action": ["disaster", "martial arts", "spy", "superhero", "wuxia","action","masala","espionage","arts"],
    "adventure": ["pirate", "swashbuckler", "samurai"],
    "animation": ["cgi", "cutout", "live-action animated film", "stop motion", "animated", "computer-animated", "anime"],
    "comedy": ["buddy", "mockumentary", "parody", "slapstick"],
    "drama": ["docudrama", "melodrama", "biodrama", "bio-drama"],
    "historical": ["history", "historic", "alternate history", "period", "period piece", "biopic", "bio-pic", "biographical"],
    "horror": ["ghost", "monster", "vampire", "werewolf", "slash", "splatter", "zombie", "j-horror","supernatural"],
    "science fiction": ["dystopian", "dystopia", "post-apocalyptic", "steampunk", "tech noir", "utopian", "science-fiction", "scifi", "sci-fi", "space", "tokusatsu","fiction"],
    "thriller": ["mystery", "detective", "crime","suspense"],
    "musical": ["operetta"],
    "romance": ["love","romantic"],
    "western": ["cowboy"],
    "documentary": ["pseudo-documentary"],
    "fantasy":[],
    "sport":["sports","races","dance","biker"],
    "war":['ii','i'],
    "erotic":['ero','adult','erotic','sexploitation'],
    "social":['socio','costume']
}

In [7]:
import re

#preprocessing function helper for genres reduction
def genres_preprocessing(genres_dict, genre):
    stop = ['film','short']
    for w in stop:
        if w in genre:
            genre = genre.replace(w,'').strip()
        
    
    splitted = re.split("[,/]", genre)
    if len(splitted) != 1:
        genre = splitted[0]
        
    splitted = re.split("[-—–]", genre)
    if any(item in genres_dict.keys() for item in splitted):
    #if splitted[0] in genres_dict.keys():
        genre = splitted[0]
    
    splitted = re.split(" ", genre.rstrip())
    if len(splitted) != 1:
        genre = splitted[-1]
        
    for key in genres_dict.keys():
        if genre.rstrip() in genres_dict[key]:
            genre = key
        
    return genre.rstrip()

In [8]:
df['Genre2'] = df['Genre']
df['Genre2'] = df['Genre2'].apply(lambda x: genres_preprocessing(conversion_dict, x))

In [9]:
counts = df['Genre2'].value_counts()
to_remove = counts[counts < 50].index

df = df[~df.Genre2.isin(to_remove)]

drop_id = df[df.Plot.apply(lambda x: len(x) < 25)].index
df.drop(drop_id, inplace=True)

df = df[df['Genre2'] != '']

In [10]:
#g = df.groupby("Genre2")
#new_df = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

# Scraping

In [11]:
vc = df['Genre2'].value_counts()
genres_to_fetch = vc[vc < 1000].index.tolist()
genres_to_fetch

['western',
 'animation',
 'science fiction',
 'adventure',
 'musical',
 'war',
 'noir',
 'family',
 'fantasy',
 'historical',
 'biography',
 'social',
 'documentary',
 'serial',
 'sport']

In [12]:
#LINKS GETTER FUNCTIONS
import requests
from bs4 import BeautifulSoup

def get_links(src, stw="/", genre="", listof=False):
    links = []
    soup = BeautifulSoup(src)
    if not listof:
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and href.startswith(stw) and genre.replace(" ","_") in href.lower():
                links.append(href)
                
    else:
        for link in soup.find_all('b'):
            if link.find('a'):
                href = link.find('a').get('href')
                if href and href.startswith(stw) and genre.replace(" ","_") in href.lower():
                    links.append(href)
        
            
    return links

def request_links(base, in_links, listof=False):
    out_links = []
    if type(in_links) == list:
        response = [ requests.get(base+link).text for link in set(in_links) ]
        for res in response:
            out_links.extend(get_links(res,listof=listof))

    else:
        res = requests.get(base+in_links).text
        out_links.extend(get_links(res,listof=listof))
        
    return out_links

In [13]:
#FETCHING FUNCTIONS
def fetching_title_plot(soup):
    try:
        if soup.find("span", {"id":"Plot"}):
            title = soup.find("h1", {"id":"firstHeading"}).text
            print(title)
            plot_span = soup.find("span", {"id":"Plot"})
            obj = plot_span.find_parent()
            plot = ""
            while True:
                obj = obj.next_sibling
                if obj.name not in ['p','h2']:
                    continue
                elif obj.name == 'p':
                    plot += obj.text.strip('\n')
                elif obj.name == 'h2':
                    break

            return title, plot

        else:
            return None, None
        
    except:
        return None, None

def fetching_cast(soup):
    try:
        starring = soup.find('th', string='Starring')
        if starring:
            starring_sib = starring.nextSibling
            cast = [star.get('title') for star in starring_sib.find_all('a')]
            
        else:
            cast = []
            
        return cast
    except:
        return []

def fetching_director_date(soup):
    director = soup.find("th", string='Directed by')
    date = soup.find("th", string='Release date')
    if director:
        director = director.nextSibling.text
        
    if date:
        date = date.nextSibling.text
        date = re.findall(r"\d{4}",date_sib)[0]

        
        
    return director, date


#FINAL FETCHING FUNCTION
def fetching_film_info(link):
    html = requests.get(link).text
    soup = BeautifulSoup(html)
    title, plot = fetching_title_plot(soup)
    director, date = fetching_director_date(soup)
    cast = fetching_cast(soup)
    film_info = {'Release Year': date, 'Title':title, 'Director':director, 'Cast': cast, 'Plot':plot} if plot is not None else None
    return film_info


In [14]:
#SCRAPING FUNCTIONS
import time
from concurrent.futures import ThreadPoolExecutor
import threading
from functools import partial

#scraping function
def plot_scraper(film_list, genre, link):
    f_link = "https://en.wikipedia.org"+link
    film_info = fetching_film_info(f_link)
    if film_info is not None:
        film_info.update({'Genre':genre})
        film_list.append(film_info)
    
#parallel scraping
def set_up_threads(links, film_list, genre):
    with ThreadPoolExecutor(max_workers=6) as executor:
        return executor.map( partial(plot_scraper, film_list, genre),
                             links,
                             timeout=30 )    
            

In [30]:
#test cell for film scraping with single genre
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
main_res = requests.get(genres_link).text
genre_links = get_links(main_res, "/wiki/List", 'western')
genre_sublinks = request_links(base, genre_links)

films = []
set_up_threads( genre_sublinks[:100], films, 'western' )
films

[]

In [None]:
#main scraping cycle
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
main_res = requests.get(genres_link).text

#final films list to populate with new scraped films
films_list = []

for genre in genres_to_fetch:
    genre_links = get_links(main_res, "/wiki/List", genre)
    for link in genre_links:
        genre_sublinks = request_links(base, link)
        if "Lists" in link:
            final_genre_sublinks = []
            print(genre.upper())
            for fl in set(genre_sublinks):
                genre_to_match = genre.title() if genre == 'western' else genre.replace(" ", "_")
                genre_to_match = "List_of_"+genre_to_match+"_films"
                if genre_to_match in fl:
                    real_fl = request_links(base, fl, listof=True)
                    final_genre_sublinks.extend(real_fl)
            set_up_threads(final_genre_sublinks, films_list, genre)
        else:
            print(genre.upper())
            set_up_threads(genre_sublinks, films_list, genre)
            

WESTERN
Bullet Proof (1920 film)
Just Pals
Lone Hand Wilson
The Last of the Mohicans (1920 American film)
The Mark of Zorro (1920 film)
Overland Red
Robbery Under Arms (1920 film)
Ruth of the Rockies
Sundown Slim
Thunderbolt Jack
The Toll Gate
Vanishing Trails
The Big Punch
Desperate Trails (1921 film)
Finders Keepers (1921 film)
The Fire Eater
The Fox (1921 film)
Hearts Up
The Freeze-Out
Red Courage
Tangled Trails
Sure Fire
Three Word Brand
The Wallop
The Bearcat
Headin' West
The Loaded Door
The Lone Hand (1922 film)
Man to Man (1922 film)
Moonshine Valley
The Paleface (1922 film)
Step on It! (film)
Sky High (1922 film)
Canyon of the Fools
The Call of the Canyon
The Covered Wagon
The Ghost City
Kindled Courage
The Kickback (film)
Ridin' Wild (1922 film)
Ruggles of Red Gap (1923 film)
When the Kellys Were Out
Wild Bill Hickok (film)
Ace of Cactus Range
Hook and Ladder (1924 film)
The Iron Horse (film)
The Mine with the Iron Door (1924 film)
A Pair of Hellions
North of 36
The Trouble Sh

West of Tombstone
Western Mail (film)
Wild Bill Hickok Rides
Bad Men of Thunder Gap
Black Market Rustlers
Bar 20
Blazing Guns
Boss of Rawhide
Bullets and Saddles
Border Patrol (film)
Buckskin Frontier
Cheyenne Roundup
Cattle Stampede
Colt Comrades
Cowboy Commandos
Cowboy in the Clouds
The Desperadoes
Death Rides the Plains
Devil Riders
Fighting Frontier
Fighting Valley
Frontier Law
The Ghost Rider (1943 film)
Hail to the Rangers
Hoppy Serves a Writ
Girl Crazy (1943 film)
Idaho (1943 film)
The Kansan (film)
In Old Oklahoma
King of the Cowboys
Land of Hunted Men
A Lady Takes a Chance
Law of the Saddle
The Lone Star Trail
My Friend Flicka (film)
Outlaws of Stampede Pass
The Outlaw
Raiders of San Joaquin
Raiders of Red Gap
Raiders of Sunset Pass
The Ox-Bow Incident
Riders of the Deadline
Riders of the Northwest Mounted
Robin Hood of the Range
Silver City Raiders
Six Gun Gospel
Song of Texas
The Stranger from Pecos
West of Texas
Western Cyclone
Trail of Terror
Wild Horse Stampede
Wolves of 

Red Stallion in the Rockies
Renegades of the Sage
The Red Pony (1949 film)
Riders of the Dusk
Riders of the Range (1949 film)
Riders of the Whistling Pines
Rimfire (film)
Roughshod (1949 film)
Rustlers (1949 film)
San Antone Ambush
Satan's Cradle
Shadows of the West
She Wore a Yellow Ribbon
Sheriff of Wichita
Son of a Bad Man
Son of Billy the Kid
Sons of New Mexico
South of Rio
South of St. Louis
Stallion Canyon
Stampede (1949 film)
Streets of Laredo (film)
Susanna Pass
The Walking Hills
Trail of the Yukon
Western Renegades
West of El Dorado
The Wyoming Bandit
Apache Ambush
The Americano (1955 film)
Apache Woman
At Gunpoint
Canyon Crossroads
Bad Day at Black Rock
Chief Crazy Horse (film)
Count Three and Pray (film)
The Coyote (1955 film)
The Far Country
Fort Yuma (film)
Foxfire (1955 film)
Five Guns West
Kentucky Rifle (film)
The Kentuckian (1955 film)
The Gun That Won the West
The Indian Fighter
The Last Frontier (1955 film)
A Lawless Street
Last of the Desperados
The Lonesome Trail (

Carry On Cowboy
Degueyo
Django (1966 film)
A Big Hand for the Little Lady
Massacre Time
Duel at Diablo
El Dorado (1966 film)
The Good, the Bad and the Ugly
Sharp-Shooting Twin Sisters
Sharp-Shooting Twin Sisters
An Eye for an Eye (1966 film)
Johnny Reno
Kid Rodelo
Navajo Joe
Nevada Smith
The Night of the Grizzly
Gunpoint (film)
Incident at Phantom Hill
Return of the Seven
The Professionals (1966 film)
The Rare Breed
Seven Dollars on the Red
Ride in the Whirlwind
The Shooting
The Sons of Great Bear
Jesse James Meets Frankenstein's Daughter
Ride Beyond Vengeance
Stagecoach (1966 film)
The Plainsman (1966 film)
The Texican
Two Sons of Ringo
3 colpi di Winchester per Ringo
Texas, Adios
Per il gusto di uccidere
Waco (1966 film)
The Ugly Ones
Texas Across the River
The Trap (1966 film)
Yankee (film)
The Bandits (film)
Bang Bang Kid
The Ballad of Josie
The Adventures of Bullwhip Griffin
A Bullet for the General
Cjamango
Chuka (film)
Custer of the West
Day of Anger
Death Rides a Horse
Django K

The Milagro Beanfield War
War Party (1988 film)
The Tracker (1988 film)
Stranger on My Land
Ghost Town (1988 film)
Young Guns (film)
Sunset (1988 film)
Old Gringo
Lonesome Dove (miniseries)
Back to the Future Part III
Dances with Wolves
El Diablo (1990 film)
Montana (1990 film)
Quigley Down Under
Black Robe (film)
City Slickers
Young Guns II
Conagher
An American Tail: Fievel Goes West
Into the Badlands (film)
The Gambler (film series)
My Heroes Have Always Been Cowboys (film)
Four Eyes and Six Guns
Son of the Morning Star (film)
The Last of the Mohicans (1992 film)
Keep the Change (film)
Mad at the Moon
El Mariachi
Thousand Pieces of Gold (film)
Pure Country
Bonanza: The Return
Thunderheart
Unforgiven
Gunsmoke: The Long Ride
Geronimo: An American Legend
The Last Outlaw (1993 film)
The Ballad of Little Jo
Far and Away
Return to Lonesome Dove
Sommersby
Samurai Cowboy
Tombstone (film)
Jonathan of the Bears
Posse (1993 film)
Bad Girls (1994 film)
8 Seconds
Blind Justice (1994 film)
F.T.W. 

Westward Ho (1935 film)
The Desert Trail
Empty Saddles
Ghost Patrol
Hopalong Cassidy Returns
Der Kaiser von Kalifornien
The Last of the Mohicans (1936 film)
The Lawless Nineties
The Lonely Trail
Red River Valley (1936 film)
Phantom Patrol
The Plainsman
Rhythm on the Range
Stampede (1936 film)
The Texas Rangers (1936 film)
Wildcat Trooper
Three Godfathers (1936 film)
The Three Mesquiteers (film)
Winds of the Wasteland
Black Aces (film)
Born to the West
Headin' East
Hittin' the Trail
Hit the Saddle
The Painted Stallion
Riders of the Whistling Skull
Sing, Cowboy, Sing
Tex Rides with the Boy Scouts
Texas Trail (1937 film)
Wells Fargo (film)
Wild and Woolly (1937 film)
Way Out West (1937 film)
Billy the Kid Returns
California Frontier
Zorro Rides Again
Cassidy of Bar 20
Cattle Raiders
Come On, Rangers
Cowboy from Brooklyn
Heart of the North
The Cowboy and the Lady (1938 film)
Law of the Texan
The Mexicali Kid
The Mysterious Rider (1938 film)
The Overland Express
Rawhide (1938 film)
Rollin' 

Lightning Raiders (film)
Marked for Murder
Marshal of Laredo
The Man from Oklahoma
Northwest Trail
Oregon Trail (film)
Outlaws of the Rockies
Phantom of the Plains
The Return of the Durango Kid
Rhythm Round-Up
Prairie Rustlers
Rough Riders of Cheyenne
Rockin' in the Rockies
Rough Ridin' Justice
Santa Fe Saddlemates
Salome, Where She Danced
San Antonio (film)
Sheriff of Cimarron
Song of the Prairie
Stagecoach Outlaws
Texas Panhandle (film)
Three in the Saddle
The Topeka Terror
Trail of Kit Carson
Wagon Wheels Westward
West of the Pecos (1945 film)
Wanderer of the Wasteland (1945 film)
Abilene Town
Alias Billy the Kid
Bad Bascomb (film)
Beauty and the Bandit
Badman's Territory
The Caravan Trail
Conquest of Cheyenne
Cowboy Blues
Days of Buffalo Bill
Canyon Passage
The Desert Horseman
The Devil's Playground (1946 film)
The El Paso Kid
The Fighting Frontiersman
Frontier Gunlaw
The Gay Cavalier (film)
Duel in the Sun (film)
The Gentleman from Texas
Gentlemen with Guns
God's Country (1946 fil

Perils of the Wilderness
The Proud Ones
Pillars of the Sky
Raw Edge
Quincannon, Frontier Scout
Red Sundown
Reprisal!
Secret of Treasure Mountain
Star in the Dust
Tension at Table Rock
Seven Men from Now
Tribute to a Bad Man
Walk the Proud Land
The Searchers
Westward Ho the Wagons!
Apache Warrior
The Badge of Marshal Brennan
The Big Land
Black Patch (film)
The Buckskin Lady
Campbell's Kingdom
Domino Kid
The Dalton Girls
Dragoon Wells MassacreDecision at Sundown

Duel at Apache Wells
Drango
Escape from Red Rock
Forty Guns
Gun Battle at Monterey
Fury at Showdown
Gun for a Coward
Gun Duel in Durango
Gun Glory
Gunfire at Indian Gap
Gunsight Ridge
The Hard Man
The Halliday Brand
Gunfight at the O.K. Corral (film)
The Guns of Fort Petticoat
Hell Canyon Outlaws
Hell's Crossroads
The Last Stagecoach West
The Lonely Man
Naked in the Sun
The Hired Gun (1957 film)
Joe Dakota (1957 film)
Last of the Badmen
The Iron Sheriff
Night Passage (film)
Oregon Passage
The Oklahoman (film)
Old Yeller (film)
M

Sam Whiskey
Smith!
The Specialists (film)
Support Your Local Sheriff!
Sundance and the Kid
A Time for Dying
Tell Them Willie Boy Is Here
True Grit (1969 film)
The Undefeated (1969 film)
The Valley of Gwangi
Young Billy Young
The Avenger, Zorro
The Wild Bunch
Adiós, Sabata
And God Said to Cain
Sartana in the Valley of Death
The Ballad of Cable Hogue
Barquero
Cannon for Cordoba
The Unholy Four (1970 film)
Chisum
Compañeros (film)
El Condor (film)
Deadlock (1970 film)
Dirty Dingus Magee
Django and Sartana Are Coming... It's the End
The Cheyenne Social Club
Have a Good Funeral, My Friend... Sartana Will Pay
Four Rode Out
Hey Amigo! A Toast to Your Death
Sartana's Here… Trade Your Pistol for a Coffin
Kill Django... Kill First
The Intruders (1970 film)
Land Raiders (film)
Light the Fuse... Sartana Is Coming
A Man Called Sledge
The McMasters
A Man Called Horse (film)
Monte Walsh (1970 film)
Little Big Man (film)
The Phantom Gunslinger
Ned Kelly (1970 film)
The Over-the-Hill Gang Rides Again
R

Cowboy Up
Crossfire Trail
Dust (2001 film)
Der Schuh des Manitu
Texas Rangers (film)
800 Bullets
King of Texas
Legend of the Phantom Rider
Gang of Roses
And Starring Pancho Villa as Himself
Spirit: Stallion of the Cimarron
Monte Walsh (2003 film)
The Missing (2003 film)
Nate and the Colonel
The Last Samurai
Open Range (2003 film)
Once Upon a Time in Mexico
The Alamo (2004 film)
Blueberry (film)
Dead Birds (2004 film)
Les Dalton (film)
Tremors 4: The Legend Begins
Home on the Range (2004 film)
Don't Come Knocking
The Proposition (2005 film)
Brokeback Mountain
Serenity (2005 film)
The Three Burials of Melquiades Estrada
The Legend of Zorro
After Sundown (2006 film)
Bandidas
Broken Trail
Desolation Canyon (film)
The Quick and the Undead
The Far Side of Jericho
Dynamite Warrior
A Shot in the West
Summer Love (2006 film)
Seraphim Falls
All Hat
3:10 to Yuma (2007 film)
Avenging Angel (2007 film)
BloodRayne 2: Deliverance
Bury My Heart at Wounded Knee (film)
The Assassination of Jesse James b

Leviathan (1989 film)
Let Sleeping Corpses Lie (film)
Life (2017 film)
Ladrón de Cadáveres
Lifeforce (film)
The Mad Monster
Lily C.A.T.
Man-Made Monster
The Man Who Changed His Mind
The Man They Could Not Hang
Life Blood (film)
Lady Frankenstein
Meatball Machine
Maximum Overdrive
Mimic 2
Mimic (film)
The Manster
Mimic 3: Sentinel
Mindwarp (film)
Monster (1980 film)
The Mist (film)
The Mad Ghoul
Monsters (2010 film)
Monsters: Dark Continent
The Monolith Monsters
Monster from Green Hell
Monster from the Ocean Floor
Monstrosity (film)
Moon (2009 film)
Mutant Chronicles (film)
Mesa of Lost Women
Monster a Go-Go
The Neanderthal Man
Nightbeast
Moon Child (2003 film)
Night of the Big Heat (film)
Night of the Blood Beast
Mutants (2009 film)
Narcopolis (film)
Not of This Earth (1988 film)
The Mysterians
The Omega Man
Outland (film)
Outpost (2008 film)
Outpost: Black Sun
Overlord (2018 film)
Pandorum
Night of the Creeps
Parasite Eve (film)
Not of This Earth (1957 film)
Night of the Lepus
Parasyt

Captain Blood (1935 film)
China Seas (film)
Clive of India (film)
The Crusades (film)
La Bandera (film)
The Call of the Savage
Call of the Wild (1935 film)
The Lost City (1935 serial)
The Lives of a Bengal Lancer (film)
Mutiny on the Bounty (1935 film)
The Last Outpost (1935 film)
The Live Wire (1935 film)
... nur ein Komödiant
The New Adventures of Tarzan
Oil for the Lamps of China (film)
Professional Soldier
La Route impériale
Queen of the Jungle
Sanders of the River
She (1935 film)
The Three Musketeers (1935 film)
Under the Pampas Moon
Savage Fury
Storm Over the Andes
Captain Calamity (film)
The Bold Caballero
Dangerous Waters (1936 film)
Isle of Fury
Down to the Sea
The Jungle Princess
Darkest Africa
The Last of the Mohicans (1936 film)
The Charge of the Light Brigade (1936 film)
A Legionnaire
The Leathernecks Have Landed
Revolt of the Zombies
Rhodes of Africa
Song of Freedom
Under Two Flags (1936 film)
Undersea Kingdom
Sea Spoilers
White Hunter (film)
Robinson Crusoe of Clipper Is

Savage Drums
Soldiers Three (film)
Sirocco (film)
Smuggler's Gold (film)
Superman and the Mole Men
The Sea Hornet
Smuggler's Island
Ten Tall Men
Tales of Robin Hood
Where No Vultures Fly
Tarzan's Peril
The Sword of Monte Cristo
Abbott and Costello Meet Captain Kidd
Against All Flags
Arctic Flight
Aladdin and His Lamp
At Sword's Point
The Big Sky (film)
The Blazing Forest
Bomba and the Jungle Girl
Blackbeard the Pirate
The Brigand (film)
African Treasure
Captain Pirate
California Conquest
Caribbean Gold
Desperate Search
The Crimson Pirate
The Fighter (1952 film)
The Golden Coach
Hong Kong (film)
La figlia del diavolo
Bwana Devil
Ivanhoe (1952 film)
Voodoo Tiger
King of the Congo
Lady in the Iron Mask
Lost in Alaska
Hurricane Smith (1952 film)
The Golden Hawk
Lure of the Wilderness
Last Train from Bombay
Lydia Bailey
Manina, the Girl in the Bikini
Macao (film)
Kaadu (1952 film)
The Pathfinder (1952 film)
Mutiny (1952 film)
The Prisoner of Zenda (1952 film)
The Planter's Wife (1952 film)


The Last Days of Pompeii (1959 film)
Knight Without a Country
The Little Savage
Marie of the Isles
North by Northwest
North West Frontier (film)
John Paul Jones (film)
Pier 5, Havana
Prisoner of the Volga
The Pirate and the Slave Girl
The Scavengers (1959 film)
The Stranglers of Bombay
Tarzan, the Ape Man (1959 film)
Tarzan's Greatest Adventure
The Tiger of Eschnapur (1959 film)
Timbuktu (1959 film)
Tunis Top Secret
The Wreck of the Mary Deare (film)
The 3 Worlds of Gulliver
12 to the Moon
L'Avventura
The Dam on the Yellow River
The Cossacks (1960 film)
The Boy and the Pirates
Dinosaurus!
The Huns (film)
Horrors of Spider Island
Knight of 100 Faces
The Last Voyage
Letter Never Sent (film)
Kidnapped (1960 film)
David and Goliath (1960 film)
Mistress of the World
Moment of Danger
Morgan, the Pirate
The Lost World (1960 film)
North to Alaska
Queen of the Pirates
Revak the Rebel
Macumba Love
Robin Hood and the Pirates
Passport to China
The Secret of the Purple Reef
Seven in the Sun
Septemb

The Syndicate (1968 film)
Ragan (film)
A Twist of Sand
Thunderbird 6
Villa Rides
Where Eagles Dare
Will Our Heroes Be Able to Find Their Friend Who Has Mysteriously Disappeared in Africa?
Tarzan and the Jungle Boy
The Two Crusaders
Witchfinder General (film)
100 Rifles
Age of Consent (film)
Alfred the Great (film)
2000 Years Later
Burn! (1969 film)
Yellow Submarine (film)
Butch Cassidy and the Sundance Kid
The Assassination Bureau
Easy Rider
The Extraordinary Seaman
The Bushbaby
Captain Nemo and the Underwater City
Che!
The Five Man Army
Franco, Ciccio e il pirata Barbanera
Hannibal Brooks
Justine (1969 film)
Impasse (film)
Kenner (film)
Krakatoa, East of Java
Latitude Zero (film)
Mackenna's Gold
Monte Carlo or Bust!
Lost in the Desert
On Her Majesty's Secret Service (film)
The Red Tent (film)
Play Dirty
Sam Whiskey
The Seven Red Berets
Samurai Banners
Ms. Stiletto
The Royal Hunt of the Sun (film)
Shark!
Sinful Davey
Target: Harry
Tarzana, the Wild Girl
Tintin and the Temple of the Sun

The Shark Hunter
The Blue Lagoon (1980 film)
The Awakening (1980 film)
Caboblanco
The Blues Brothers (film)
Cannibal Holocaust
The Courage of Kavik the Wolf Dog
The Curse of King Tut's Tomb (1980 film)
The Dogs of War (film)
Cloud Dancer
The Earthling
Cuba Crossing
Escape from Hell (1980 film)
Battle Beyond the Stars
Flatfoot in Egypt
The Gods Must Be Crazy
Flash Gordon (film)
Herbie Goes Bananas
The Empire Strikes Back
The Island (1980 film)
I'm Getting a Yacht
Kagemusha
Hawk the Slayer
A Tale of Africa
The Mountain Men
Popeye (film)
Pirates of the 20th Century
Raise the Titanic (film)
The Sea Wolves
Skyward (film)
Superman II
The Last Flight of Noah's Ark
Death Hunt
Clash of the Titans (1981 film)
Mondo Cannibale
Escape from New York
Excalibur (film)
For Your Eyes Only (film)
Goliath Awaits
Gallipoli (1981 film)
The Fox and the Hound
La Chèvre
Great White (1981 film)
Green Ice
Heavy Metal (film)
High Risk (1981 film)
Lion of the Desert
Masada (miniseries)
Dragonslayer (1981 film)
Rac

Pocahontas (1995 film)
Sahara (1995 film)
The City of Lost Children
Waterworld
101 Dalmatians (1996 film)
Alaska (1996 film)
Aladdin and the King of Thieves
Toy Story
Napoleon (1995 film)
Tom and Huck
Dragonheart
The English Patient (film)
Flipper (1996 film)
Executive Decision
Fly Away Home
The Ghost and the Darkness
Independence Day (1996 film)
Le Jaguar
The Island of Dr. Moreau (1996 film)
Matilda (1996 film)
Mission: Impossible (film)
The NeverEnding Story III
Muppet Treasure Island
The Phantom (1996 film)
Homeward Bound II: Lost in San Francisco
The Quest (film)
White Squall (film)
20,000 Leagues Under the Sea (1997 miniseries)
Space Jam
Amistad (film)
Star Trek: First Contact
Anaconda (film)The Call of the Wild: Dog of the Yukon

Austin Powers: International Man of Mystery
Eye of the Eagle (1997 film)
The Edge (1997 film)
The Fifth Element
Into Thin Air: Death on Everest
Dante's Peak
Joey (1997 film)
The Lost World: Jurassic Park
Kull the Conqueror
Passion in the Desert
Men in Bl

Allan Quatermain and the Temple of Skulls
Asterix at the Olympic Games (film)
The Chronicles of Narnia: Prince Caspian
Female Agents
City of Ember
The Flyboys (film)
Dachimawa Lee
The Forbidden Kingdom
The Good, the Bad, the Weird
Inkheart (film)
Indiana Jones and the Kingdom of the Crystal Skull
Fool's Gold (2008 film)
Journey to the Center of the Earth (2008 theatrical film)
Jumper (2008 film)
The Lost Treasure of the Grand Canyon
The Letter for the King (film)
The Librarian: Curse of the Judas Chalice
Merlin and the War of the Dragons
The Mummy: Tomb of the Dragon Emperor
The Red Baron (2008 film)
Outlander (film)
Rambo (2008 film)
Quantum of Solace
The Scorpion King 2: Rise of a Warrior
Roadside Romeo
Red Cliff (film)
Star Wars: The Clone Wars (film)
Strange Wilderness
The Tale of Despereaux (film)
Nim's Island
North Face (film)
9 (2009 animated film)
2012 (film)
Barbarossa (film)
Agora (film)
WALL-E
Dragonball Evolution
Avatar (2009 film)
Ice Age: Dawn of the Dinosaurs
The Imagina

Star Wars: The Rise of Skywalker
Red Notice (film)
The Last Duel (2021 film)
Jungle Cruise (film)
L'Odissea (1911 film)
Le avventure straordinarissime di Saturnino Farandola
McVeagh of the South Seas
The Explorer (film)
Filibus
Peer Gynt (1915 film)
Under the Crescent
20,000 Leagues Under the Sea (1916 film)
Under Two Flags (1916 film)
The Man Without a Country (1917 film)
Tom Sawyer (1917 film)
When a Man Sees Red
Bound in Morocco
Eye for Eye (film)
The Romance of Tarzan
Around the World in Eighty Days (1919 film)
The White Man's Law
The Elusive Pimpernel (1919 film)
Tarzan of the Apes (1918 film)
The Mistress of the World
Soldiers of Fortune (1919 film)
The Spiders (film)
Victory (1919 film)
Huckleberry Finn (1920 film)
The Last of the Mohicans (1920 American film)
The Mark of Zorro (1920 film)
The Sea Wolf (1920 film)
The Revenge of Tarzan
The Spiders (film)
The Son of Tarzan (film)
Terror Island
Treasure Island (1920 film)
L'Atlantide (1921 film)
The Adventures of Tarzan
The Devil 

Jungle Queen (serial)
The Naughty Nineties
Jungle Raiders (serial)
To Have and Have Not (film)
Objective, Burma!
The Spanish Main
A Thousand and One Nights (1945 film)
The Wicked Lady
The Bandit of Sherwood Forest
Caravan (1946 film)
The Fighting Guardsman
Lost City of the Jungle
Night Boat to Dublin
Monsieur Beaucaire (1946 film)
Road to Utopia
The Overlanders (film)
A Scandal in Paris
Strange Voyage
Son of the Guardsman
Swamp Fire
Tangier (1946 film)
Tarzan and the Leopard Woman
The Wife of Monte Cristo
Blonde Savage
Adventure Island (film)
Bullet for Stefano
Brick Bradford (serial)
Calcutta (1947 film)
Bush Christmas (1947 film)
Captain from Castile
The Exile (1947 film)
Mandrin (1947 film)
The Man Within (film)
Jungle Flight
The Macomber Affair
Forever Amber (film)
Queen of the Amazons
The Royalists
Road to Rio
Sinbad the Sailor (1947 film)
Singapore (1947 film)
Slave Girl (film)
Song of Scheherazade
Tarzan and the Huntress
Unconquered (1947 film)
16 Fathoms Deep
Adventures of Don 

In [121]:
test_link = "https://en.wikipedia.org/wiki/Million_Dollar_Baby"
trial_film = pd.DataFrame(fetching_film_info(test_link), index=[df.index[-1]+1])
trial_film

Million Dollar Baby


Unnamed: 0,Release Year,Title,Director,Cast,Plot
34886,2004,Million Dollar Baby,Clint Eastwood,"Clint Eastwood, Hilary Swank, Morgan Freeman, ...","Margaret ""Maggie"" Fitzgerald (Hilary Swank), a..."


# Classification

### SVM

In [10]:
from sklearn.model_selection import train_test_split

X = df['Plot'].values
y = df['Genre2'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [11]:
len(X_train),len(y_train),len(X_test),len(y_test)

(21029, 21029, 7010, 7010)

In [12]:
import nltk

nlp = spacy.load('en_core_web_sm')

doc_counter = 0
def reset_counter():
  global doc_counter
  doc_counter = 0

def increase_counter():
  global doc_counter
  doc_counter += 1
  if doc_counter % 100 == 0:
    print(doc_counter)

def spacy_nlp_tokenizer(text):
    increase_counter()
        
    # we use spacy for main nlp tasks
    doc = nlp(text)
    # lemmatized tokens, skipping stopwords
    lemmas = ['LEMMA_'+token.lemma_ for token in doc if not token.is_stop]
    # entity_types
    entity_types = ['NER_'+token.ent_type_ for token in doc if token.ent_type_]

    # in case an entity linker is available, we can use it do put actual entities as
    # features, e.g. Queen Elizabeth, Elizabeth II, Her Majesty -> KB2912
    # see https://spacy.io/usage/training#entity-linker
    # entities = ['ENT_'+token.ent_kb_id_ for token in doc if token.ent_kb_id_]

    # we use a simple nltk function to create ngrams
    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas,2)]
    lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas,3)]

    all_tokens = list()
    all_tokens.extend(lemmas)
    all_tokens.extend(lemma_bigrams)
    all_tokens.extend(lemma_trigrams)
    all_tokens.extend(entity_types)
    return all_tokens

In [13]:
vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)

X_train_tok = vect.fit_transform(X_train)

X_test_tok = vect.transform(X_test)

100
200
300
400
500
600
700
800
900


KeyboardInterrupt: 

In [None]:
X_train_tok.shape, X_test_tok.shape

In [None]:
import pickle

with open('x_train_tok.pkl','wb') as outfile:
    pickle.dump(X_train_tok,outfile)
with open('x_test_tok.pkl','wb') as outfile:
    pickle.dump(X_test_tok,outfile)

In [12]:
import pickle

infile = open('x_train_tok.pkl','rb')
X_train_tok = pickle.load(infile)
infile.close()

infile = open('x_test_tok.pkl','rb')
X_test_tok = pickle.load(infile)
infile.close()

In [13]:
X_train_tok.shape, X_test_tok.shape

((21029, 6806178), (7010, 6806178))

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'sel__k': [1000, 2000, 5000, 10000], 'learner__C': [0.01, 0.1, 1, 10]},
 ]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])


n_jobs = 2

opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=3).fit(X_train_tok,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [15]:
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

correct = 0
for prediction,true_label in zip(opt_predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(opt_predictions))

0.2905848787446505


In [16]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

         action       0.00      0.00      0.00       516
      adventure       0.00      0.00      0.00       166
      animation       0.00      0.00      0.00       205
           arts       0.00      0.00      0.00        18
      biography       0.00      0.00      0.00        55
         comedy       0.24      0.02      0.04      1636
    documentary       0.00      0.00      0.00        23
          drama       0.29      0.98      0.45      2041
         family       0.00      0.00      0.00        81
        fantasy       0.00      0.00      0.00        78
        fiction       0.00      0.00      0.00       117
     historical       0.00      0.00      0.00        63
         horror       0.00      0.00      0.00       342
        musical       0.00      0.00      0.00       139
           noir       0.00      0.00      0.00        86
        romance       0.00      0.00      0.00       335
science

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
