In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [2]:
# Load the movie-plots CSV (expected in the current working directory) and
# show its (rows, columns) shape.
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.shape

(34886, 8)

# Preprocessing

In [3]:
# Discard rows whose genre is the literal placeholder 'unknown': they carry
# no label to learn from.
df = df[df['Genre'] != 'unknown']
df.shape

(28803, 8)

In [4]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...


In [5]:
# Number of distinct raw genre strings; this is what the reduction below collapses.
df['Genre'].nunique()

2264

In [6]:
# 1.1. Dictionary-based conversion adapted from Wikipedia's "Film genre" page.
# Each key is a canonical genre; its list holds the raw sub-genre tokens that
# genres_preprocessing() folds into that key.
# Some entries look odd in isolation ("arts", "fiction", "ii", "i") because
# genres_preprocessing() keeps only the LAST space-separated word of a genre,
# so e.g. "martial arts" arrives here as "arts" and "science fiction" as "fiction".

conversion_dict = {
    "action": ["disaster", "martial arts", "spy", "superhero", "wuxia","action","masala","espionage","arts"],
    "adventure": ["pirate", "swashbuckler", "samurai"],
    "animation": ["cgi", "cutout", "live-action animated film", "stop motion", "animated", "computer-animated", "anime"],
    "comedy": ["buddy", "mockumentary", "parody", "slapstick"],
    "drama": ["docudrama", "melodrama", "biodrama", "bio-drama"],
    "historical": ["history", "historic", "alternate history", "period", "period piece", "biopic", "bio-pic", "biographical"],
    "horror": ["ghost", "monster", "vampire", "werewolf", "slash", "splatter", "zombie", "j-horror","supernatural"],
    "science fiction": ["dystopian", "dystopia", "post-apocalyptic", "steampunk", "tech noir", "utopian", "science-fiction", "scifi", "sci-fi", "space", "tokusatsu","fiction"],
    "thriller": ["mystery", "detective", "crime","suspense"],
    "musical": ["operetta"],
    "romance": ["love","romantic"],
    "western": ["cowboy"],
    "documentary": ["pseudo-documentary"],
    "fantasy":[],
    "sport":["sports","races","dance","biker"],
    "war":['ii','i'],
    "erotic":['ero','adult','erotic','sexploitation'],
    "social":['socio','costume']
}

In [7]:
import re

#preprocessing function helper for genres reduction
def genres_preprocessing(genres_dict, genre):
    """Reduce a raw genre string to a single (ideally canonical) genre label.

    Steps, in order:
      1. remove the filler substrings 'film' and 'short';
      2. keep only the first entry of a comma- or slash-separated list;
      3. for a dash-joined compound, keep its first part when any part is a
         canonical genre (a key of ``genres_dict``);
      4. keep only the last space-separated word;
      5. replace the result by the canonical genre whose alias list contains it.

    Parameters
    ----------
    genres_dict : dict
        Maps canonical genre -> list of alias tokens.
    genre : str
        Raw genre string.

    Returns
    -------
    str
        The reduced label, right-stripped. It may be an empty string, or a
        token that is not a key of ``genres_dict`` when no alias matched.
    """
    # 1. filler words are removed as substrings, not as whole words
    for filler in ('film', 'short'):
        if filler in genre:
            genre = genre.replace(filler, '').strip()

    # 2. first element of a ","/"/" separated list (the whole string if none)
    genre = re.split("[,/]", genre)[0]

    # 3. hyphen / em-dash / en-dash compounds
    dash_parts = re.split("[-—–]", genre)
    if any(part in genres_dict for part in dash_parts):
        genre = dash_parts[0]

    # 4. last word of a multi-word genre
    genre = genre.rstrip().split(" ")[-1]

    # 5. alias -> canonical genre; every key is visited, so a later match
    #    overrides an earlier one
    for canonical, aliases in genres_dict.items():
        if genre.rstrip() in aliases:
            genre = canonical

    return genre.rstrip()

In [8]:
# Reduced genre label: every raw 'Genre' string is run through
# genres_preprocessing() with the alias table defined above.
df['Genre2'] = df['Genre'].map(lambda raw_genre: genres_preprocessing(conversion_dict, raw_genre))

In [9]:
# Keep only rows that are usable for classification:
#   * the reduced genre occurs at least 50 times (rarer labels cannot be learned),
#   * the plot has at least 25 characters,
#   * the reduced genre is not empty (e.g. "short film" reduces to '').
# A single boolean mask plus .copy() replaces the former in-place drop on a
# filtered slice, which raised SettingWithCopyWarning. The cell is also safe to
# re-run, and a missing (NaN) plot is dropped instead of crashing on len().
counts = df['Genre2'].value_counts()
to_remove = counts[counts < 50].index

keep = (
    ~df['Genre2'].isin(to_remove)
    & (df['Plot'].str.len() >= 25)
    & (df['Genre2'] != '')
)
df = df[keep].copy()

In [10]:
#g = df.groupby("Genre2")
#new_df = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

# Scraping

In [11]:
# Under-represented genres (fewer than 1000 plots each): these are the ones
# the scraping section tries to enrich with additional films.
vc = df['Genre2'].value_counts()
genres_to_fetch = vc[vc < 1000].index.tolist()
genres_to_fetch

['western',
 'animation',
 'science fiction',
 'adventure',
 'musical',
 'war',
 'noir',
 'family',
 'fantasy',
 'historical',
 'biography',
 'social',
 'documentary',
 'serial',
 'sport']

In [12]:
#LINKS GETTER FUNCTIONS
import requests
from bs4 import BeautifulSoup

def get_links(src, stw="/", genre="", parser="html.parser"):
    """Collect the ``href`` of every ``<a>`` in ``src`` that matches the filters.

    Parameters
    ----------
    src : str
        HTML markup to scan.
    stw : str
        Required prefix of the href (default ``"/"``, i.e. site-relative links).
    genre : str
        Genre name; spaces are turned into underscores and the result must
        occur in the lower-cased href. The empty default matches every link.
    parser : str
        Parser handed to BeautifulSoup. Naming it explicitly makes the result
        independent of which optional parsers happen to be installed and
        silences bs4's "no parser was explicitly specified" warning.

    Returns
    -------
    list of str
        Matching hrefs in document order (duplicates are kept).
    """
    needle = genre.replace(" ", "_")
    soup = BeautifulSoup(src, parser)
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return [
        href for href in hrefs
        if href and href.startswith(stw) and needle in href.lower()
    ]

def request_links(base, in_links, timeout=30):
    """Download one or several pages and return all site-relative links found.

    Parameters
    ----------
    base : str
        URL prefix prepended to every link (e.g. ``"https://en.wikipedia.org"``).
    in_links : str or iterable of str
        A single relative link, or a collection of them. Collections are
        de-duplicated while keeping first-seen order, so the output order is
        reproducible between runs (a plain ``set`` is not).
    timeout : float
        Seconds to wait for each HTTP response; without it a stalled
        connection would block forever.

    Returns
    -------
    list of str
        Concatenation of ``get_links(page)`` for every downloaded page.
    """
    if isinstance(in_links, str):
        targets = [in_links]
    else:
        targets = list(dict.fromkeys(in_links))

    out_links = []
    for link in targets:
        page = requests.get(base + link, timeout=timeout).text
        out_links.extend(get_links(page))

    return out_links

In [189]:
#FETCHING FUNCTIONS
def fetching_title_plot(soup):
    """Extract the page title and the text of the "Plot" section.

    The plot is the concatenation of every ``<p>`` that follows the element
    containing ``<span id="Plot">``, up to the next ``<h2>`` or up to the end
    of the sibling list. Reaching the end without meeting an ``<h2>`` used to
    raise inside the loop and throw the collected plot away.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed film page.

    Returns
    -------
    (str, str) or (None, None)
        ``(title, plot)``; ``(None, None)`` when the page has no Plot span or
        its structure is not the expected one.
    """
    try:
        plot_span = soup.find("span", {"id": "Plot"})
        if plot_span is None:
            return None, None

        title = soup.find("h1", {"id": "firstHeading"}).text
        print(title)  # progress trace, one line per scraped film

        paragraphs = []
        node = plot_span.find_parent().next_sibling
        while node is not None:
            tag = getattr(node, "name", None)  # text nodes have no tag name
            if tag == 'h2':
                break  # next section starts: plot is complete
            if tag == 'p':
                paragraphs.append(node.text.strip('\n'))
            node = node.next_sibling

        return title, "".join(paragraphs)

    except AttributeError:
        # Unexpected page structure (e.g. no firstHeading): treat as "no plot".
        return None, None

def fetching_cast(soup):
    """Return the cast listed in the infobox "Starring" row.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed film page.

    Returns
    -------
    list of str
        ``title`` attribute of every link in the cell next to the "Starring"
        header; links without a title are skipped. Empty list when the row
        is absent or malformed.
    """
    try:
        starring = soup.find('th', string='Starring')
        if starring is None:
            return []

        # find_next_sibling skips whitespace text nodes between <th> and <td>,
        # which the raw next-sibling attribute would return instead of the cell.
        cell = starring.find_next_sibling('td')
        if cell is None:
            return []

        titles = (star.get('title') for star in cell.find_all('a'))
        return [title for title in titles if title]
    except AttributeError:
        # Unexpected node type in the infobox: best effort, no cast.
        return []

def fetching_director_date(soup):
    """Read the director and the release year from the infobox.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed film page.

    Returns
    -------
    (str or None, str or None)
        ``(director, year)``. ``director`` is the text of the cell next to
        the "Directed by" header. ``year`` is the first 4-digit run in the
        cell next to "Release date". Either is ``None`` when the row is
        missing or, for the year, when no 4-digit number is present.
    """
    director = None
    date = None

    director_header = soup.find("th", string='Directed by')
    if director_header is not None:
        director_cell = director_header.find_next_sibling('td')
        if director_cell is not None:
            director = director_cell.text

    date_header = soup.find("th", string='Release date')
    if date_header is not None:
        date_cell = date_header.find_next_sibling('td')
        if date_cell is not None:
            # The old code searched an undefined name (`date_sib`) and indexed
            # [0] unconditionally; search the cell text and tolerate no match.
            year = re.search(r"\d{4}", date_cell.text)
            if year is not None:
                date = year.group(0)

    return director, date


#FINAL FETCHING FUNCTION
def fetching_film_info(link, timeout=30):
    """Download a film page and assemble its record.

    Parameters
    ----------
    link : str
        Absolute URL of the film page.
    timeout : float
        Seconds to wait for the HTTP response, so a stalled connection cannot
        block a worker thread forever.

    Returns
    -------
    dict or None
        ``{'Release Year', 'Title', 'Director', 'Cast', 'Plot'}``, or ``None``
        when the page has no extractable plot.
    """
    html = requests.get(link, timeout=timeout).text
    # Explicit parser: same behaviour whatever optional parsers are installed.
    soup = BeautifulSoup(html, "html.parser")

    title, plot = fetching_title_plot(soup)
    if plot is None:
        # Not a usable film page; skip the infobox parsing altogether.
        return None

    director, date = fetching_director_date(soup)
    cast = fetching_cast(soup)
    return {'Release Year': date, 'Title':title, 'Director':director, 'Cast': cast, 'Plot':plot}


In [166]:
#SCRAPING FUNCTIONS
import time
from concurrent.futures import ThreadPoolExecutor
import threading
from functools import partial

#scraping function
def plot_scraper(film_list, genre, link):
    """Scrape one film page and, if it yields a record, store it in ``film_list``.

    ``link`` is a site-relative path; it is appended to the English Wikipedia
    host. The record is tagged with ``genre`` under the 'Genre' key before
    being appended. Nothing is appended when the page gives no record.
    """
    record = fetching_film_info("https://en.wikipedia.org" + link)
    if record is None:
        return
    record['Genre'] = genre
    film_list.append(record)
    
#parallel scraping
def set_up_threads(links, film_list, genre):
    """Scrape ``links`` concurrently (6 worker threads), filling ``film_list``.

    The previous version returned the lazy ``executor.map`` iterator without
    ever consuming it. Exceptions raised in the workers were therefore never
    seen, and the ``timeout=30`` argument never took effect, because the
    ``with`` block had already waited for every task. Each task's outcome is
    now checked and failures are printed, without aborting the other links.

    Parameters
    ----------
    links : iterable of str
        Site-relative film links.
    film_list : list
        Shared output list; ``plot_scraper`` appends the records to it.
    genre : str
        Genre label attached to every scraped record.

    Returns
    -------
    list
        One entry per link, in input order: the worker's return value, or
        ``None`` for a link whose worker raised.
    """
    worker = partial(plot_scraper, film_list, genre)
    links = list(links)

    with ThreadPoolExecutor(max_workers=6) as executor:
        futures = [executor.submit(worker, link) for link in links]
    # Leaving the with-block waits for every task, so all futures are done here.

    results = []
    for link, future in zip(links, futures):
        error = future.exception()
        if error is not None:
            print("[{}] scraping {} failed: {!r}".format(genre, link, error))
            results.append(None)
        else:
            results.append(future.result())
    return results
            

In [167]:
#test cell for film scraping with single genre
# Flow: genre sidebar template -> "List of ... sport ..." pages -> every link
# found on those pages -> scrape the first 100 of them in parallel.
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
main_res = requests.get(genres_link).text
genre_links = get_links(main_res, "/wiki/List", 'sport')
genre_sublinks = request_links(base, genre_links)

# `films` is filled in place by the worker threads.
films = []
set_up_threads( genre_sublinks[:100], films, 'sport' )
films

The Champion (1915 film)
Horse Feathers
Raging Bull
The Great Macarthy
The Club (play)
Australian Rules (film)
Debbie Does Dallas
Blinder (film)
The Positively True Adventures of the Alleged Texas Cheerleader-Murdering Mom
A Friend to Die For
But I'm a Cheerleader
Bring It On (film)
All Cheerleaders Die (2001 film)
Gotta Kick It Up!
Sugar & Spice
Cheerleader Queens
Bring It On Again
Bring It On: All or Nothing
Bring It On: In It to Win It
Fab Five: The Texas Cheerleader Scandal


[{'Release Year': '1915',
  'Title': 'The Champion (1915 film)',
  'Director': 'Charles Chaplin',
  'Cast': ['Charles Chaplin',
   'Edna Purviance',
   'Ernest Van Pelt',
   'Shields and Yarnell',
   'Lloyd Bacon',
   'Leo White',
   'Carl Stockdale',
   'Billy Armstrong (actor)',
   'Paddy McGuire',
   'Bud Jamison',
   'Ben Turpin'],
  'Plot': 'In this comedy, Charlie Chaplin has a companion—a pet bulldog.  Walking along a street with his bulldog, Charlie finds a "good luck" horseshoe just as he passes the training camp of an enormous fighter named Spike Dugan.  Outside the camp is a large, painted advertisement which states Dugan is seeking sparring partners "who can take a punch." After watching other better fighters be soundly beaten by Dugan, Charlie decides his best bet is to put the horseshoe inside his boxing glove.  Using the loaded glove, Charlie connects with a solid punch and wins. The trainer prepares Charlie to fight the world champion. A gambler wants Charlie to throw t

In [None]:
#main scraping cycle
base = "https://en.wikipedia.org"
genres_link = "https://en.wikipedia.org/wiki/Template:Films_by_genre_sidebar"
# Explicit timeout so a stalled connection cannot hang the notebook forever.
main_res = requests.get(genres_link, timeout=30).text

#final films list to populate with new scraped films
films_list = []

for genre in genres_to_fetch:
    # Page-name fragment of the per-genre film lists. It depends only on the
    # genre, so it is built once here instead of once per sub-link.
    genre_to_match = genre.title() if genre == 'western' else genre.replace(" ", "_")
    genre_to_match = "List_of_"+genre_to_match+"_films"

    genre_links = get_links(main_res, "/wiki/List", genre)
    for link in genre_links:
        genre_sublinks = request_links(base, link)
        print(genre.upper())
        if "Lists" in link:
            # "Lists of ..." is an index of lists: go one level deeper and keep
            # only the sub-lists that belong to this genre.
            final_genre_sublinks = []
            for fl in genre_sublinks:
                if genre_to_match in fl:
                    final_genre_sublinks.extend(request_links(base, fl))
            set_up_threads(final_genre_sublinks, films_list, genre)
        else:
            set_up_threads(genre_sublinks, films_list, genre)
            

WESTERN
The Great Train Robbery (1903 film)
The Comancheros (film)
Kidnapping by Indians
Kit Carson (1903 film)
The Great Train Robbery (1903 film)
The Pioneers (1903 film)
Bushranging in North Queensland
The Hold-Up of the Leadville Stage
The Little Train Robbery
From Leadville to Aspen: A Hold-Up in the Rockies
The Train Wreckers
The Story of the Kelly Gang
The Fight for Freedom
The Cowboy Millionaire (1909 film)
The Last Drop of Water
Algie the Miner
Geronimo's Last Raid
The Girl and Her Trust
The Battle at Elderbush Gulch
Rose of the Rancho
Her Grave Mistake
A Miner's Romance
The Squaw Man (1914 film)
A Ranch Romance
The Tragedy of Whispering Creek
The Girl of the Golden West (1915 film)
The Desert Breed
Accusing Evidence
Hell's Hinges
The Committee on Credentials
Love's Lariat
Liberty (serial)
The Return of Draw Egan
A 44-Calibre Mystery
The Bad Man of Cheyenne
The Fighting Trail
The Empty Gun
A Marked Man
The Narrow Trail
Straight Shooting
The Silent Man (film)
Six-Shooter Justic

In [121]:
test_link = "https://en.wikipedia.org/wiki/Million_Dollar_Baby"
# Build the one-row frame from a list holding the record. Passing the dict
# directly makes pandas treat the 'Cast' list as a column of several rows,
# which does not match the single index label.
trial_film = pd.DataFrame([fetching_film_info(test_link)], index=[df.index[-1]+1])
trial_film

Million Dollar Baby


Unnamed: 0,Release Year,Title,Director,Cast,Plot
34886,2004,Million Dollar Baby,Clint Eastwood,"Clint Eastwood, Hilary Swank, Morgan Freeman, ...","Margaret ""Maggie"" Fitzgerald (Hilary Swank), a..."


# Classification

### SVM

In [10]:
from sklearn.model_selection import train_test_split

X = df['Plot'].values
y = df['Genre2'].values
# Stratified 75/25 split with a fixed seed. Without random_state every run
# draws a different split, so token matrices cached from an earlier run would
# no longer line up row-by-row with y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [11]:
# Sanity check: features and labels must have the same length within each split.
len(X_train),len(y_train),len(X_test),len(y_test)

(21029, 21029, 7010, 7010)

In [12]:
import nltk

# spaCy pipeline used by spacy_nlp_tokenizer; the 'en_core_web_sm' model must be installed.
nlp = spacy.load('en_core_web_sm')

# Module-level progress counter: number of documents tokenised so far.
doc_counter = 0


def reset_counter():
    """Set the global document counter back to zero."""
    global doc_counter
    doc_counter = 0


def increase_counter():
    """Count one more document and print the running total on every 100th."""
    global doc_counter
    doc_counter = doc_counter + 1
    if not doc_counter % 100:
        print(doc_counter)

def spacy_nlp_tokenizer(text):
    """Turn one plot into a flat list of string features.

    Features, in output order:
      * ``LEMMA_<lemma>`` for every token that is not a stop word,
      * ``BI_<l1>_<l2>`` for every pair of consecutive kept lemmas,
      * ``TRI_<l1>_<l2>_<l3>`` for every triple of consecutive kept lemmas,
      * ``NER_<type>`` for every token that belongs to a named entity.

    Also advances the global progress counter by one document.
    """
    increase_counter()

    # spaCy does the main NLP work (tokenisation, lemmas, entities).
    parsed = nlp(text)

    lemmas = []
    entity_types = []
    for tok in parsed:
        if not tok.is_stop:
            lemmas.append('LEMMA_' + tok.lemma_)
        if tok.ent_type_:
            entity_types.append('NER_' + tok.ent_type_)

    # If an entity linker were available, linked entity ids (token.ent_kb_id_)
    # could be added as 'ENT_' features, e.g. Queen Elizabeth / Elizabeth II ->
    # one id. See https://spacy.io/usage/training#entity-linker

    # n-grams over the already-prefixed lemma features
    lemma_bigrams = ['BI_' + first + '_' + second
                     for first, second in zip(lemmas, lemmas[1:])]
    lemma_trigrams = ['TRI_' + first + '_' + second + '_' + third
                      for first, second, third in zip(lemmas, lemmas[1:], lemmas[2:])]

    return lemmas + lemma_bigrams + lemma_trigrams + entity_types

In [13]:
# Bag-of-features counts. The callable analyzer replaces sklearn's own
# tokenisation, so each plot becomes whatever spacy_nlp_tokenizer returns.
# min_df=5 drops features seen in fewer than 5 training documents.
vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)

# Vocabulary is learned on the training plots only ...
X_train_tok = vect.fit_transform(X_train)

# ... and reused unchanged for the test plots.
X_test_tok = vect.transform(X_test)

100
200
300
400
500
600
700
800
900


KeyboardInterrupt: 

In [None]:
# (documents, features) of the two count matrices; the feature counts must match.
X_train_tok.shape, X_test_tok.shape

In [None]:
import pickle

# Cache the token-count matrices on disk so the slow spaCy pass does not have
# to be repeated in later sessions.
# NOTE(review): only the X matrices are stored; they are only meaningful with
# the y_train / y_test of the very same split.
with open('x_train_tok.pkl','wb') as outfile:
    pickle.dump(X_train_tok,outfile)
with open('x_test_tok.pkl','wb') as outfile:
    pickle.dump(X_test_tok,outfile)

In [12]:
import pickle

# Reload the cached token-count matrices. `with` guarantees the files are
# closed even if unpickling fails.
# SECURITY: pickle.load can execute arbitrary code; only load cache files you
# produced yourself.
# CAUTION: these matrices follow the row order of the split that was active
# when they were dumped. They match the current y_train / y_test only if the
# train/test split is reproduced exactly.
with open('x_train_tok.pkl','rb') as infile:
    X_train_tok = pickle.load(infile)

with open('x_test_tok.pkl','rb') as infile:
    X_test_tok = pickle.load(infile)

In [13]:
# Shapes of the reloaded matrices: the row counts should equal len(y_train) and len(y_test).
X_train_tok.shape, X_test_tok.shape

((21029, 6806178), (7010, 6806178))

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 4 values of k (features kept by chi2) x 4 values of
# the SVM regularisation strength C = 16 candidates.
param_grid = [
    {'sel__k': [1000, 2000, 5000, 10000], 'learner__C': [0.01, 0.1, 1, 10]},
 ]

# The step names ('sel', 'tfidf', 'learner') are the prefixes used in param_grid.
opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])


# Number of parallel workers used by the grid search.
n_jobs = 2

# 5-fold cross-validation over every candidate, fitted on the training split only.
opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=3).fit(X_train_tok,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [15]:
# Predict the held-out plots with the best pipeline found by the grid search.
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

# Accuracy: share of test plots whose predicted genre equals the true genre.
correct = sum(
    1
    for prediction, true_label in zip(opt_predictions, y_test)
    if prediction == true_label
)
print(correct/len(opt_predictions))

0.2905848787446505


In [16]:
from sklearn.metrics import confusion_matrix, classification_report
# Per-genre precision / recall / F1, then the raw confusion matrix
# (rows = true genre, columns = predicted genre, both in sorted label order).
# NOTE(review): genres that are never predicted get precision 0.00, which is
# presumably what triggers the sklearn warnings shown below the report.
print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

         action       0.00      0.00      0.00       516
      adventure       0.00      0.00      0.00       166
      animation       0.00      0.00      0.00       205
           arts       0.00      0.00      0.00        18
      biography       0.00      0.00      0.00        55
         comedy       0.24      0.02      0.04      1636
    documentary       0.00      0.00      0.00        23
          drama       0.29      0.98      0.45      2041
         family       0.00      0.00      0.00        81
        fantasy       0.00      0.00      0.00        78
        fiction       0.00      0.00      0.00       117
     historical       0.00      0.00      0.00        63
         horror       0.00      0.00      0.00       342
        musical       0.00      0.00      0.00       139
           noir       0.00      0.00      0.00        86
        romance       0.00      0.00      0.00       335
science

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
