In [2]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [3]:
df = pd.read_csv('wiki_movie_plots_deduped.csv')
df.shape

(34886, 8)

In [4]:
df = df[df['Genre'] != 'unknown']
df.shape

(28803, 8)

In [5]:
df.head(60)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...
10,1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rareb...,The Rarebit Fiend gorges on Welsh rarebit at a...
11,1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_t...,The film features a train traveling through th...
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourn...,Irish villager Kathleen is a tenant of Captain...
13,1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(19...,Boone's daughter befriends an Indian maiden as...
14,1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_th...,Before heading out to a baseball game at a nea...
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(fi...,The plot is that of a black woman going to the...
16,1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_o...,On a beautiful summer day a father and mother ...
17,1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,A thug accosts a girl as she leaves her workpl...


In [6]:
df['Genre'].nunique()

2264

In [7]:
# possible converting rules:

# 0. convert all [genre film] to just [genre], i.e. drop the 'film' for homogeneity

# 1. subgenre-macrogenre taxonomy.

# 1.1. dictionary-based conversion readacting Wikipedia's Film genre page
#  disaster, martial arts, spy, superhero > ACTION
#  pirate, swashbuckler, samurai > ADVENTURE
#  cgi, cutout, live-action animated film, stop motion, traditional animation, (animated) > ANIMATION
#  buddy, mockumentary, parody, slapstick > COMEDY
#  ghost, monster, vampire, werewolf, slash, splatter, zombie > HORROR
#  dystopian, post-apocalyptic, steampunk, tech noir, utopian > SCIENCE FICTION
#  mystery, detective, crime > THRILLER
#  operetta > MUSICAL

# 1.2. automatic conversion using the presence of the wider genre in the name of the narrower one:
#  (something) comedy > COMEDY
#  (something) drama > DRAMA
#  (something) fantasy > FANTASY
#   historic(al) (stuff) > historical
#   space, sci-fi, scifi > science fiction
#  (something) thriller > thriller
#  (something) western > western

# 2. [genre, genre] and [genre/genre] cases:
#  consider only the first one. e.g. (crime, film noir) > crime
#  if not present, consider the second one 

# 3. [genre genre] cases:
#   consider the second one.
#  if not present, consider the first one

# PIPELINE TO REIMPLEMENT AS LINKED FUNCTIONS:
# 1. Remove commas and slashes cases (2)
# 2. Remove "film" at the end (0)
# 3. Subgenre-macrogenre taxonomy conversion (1)
# 4. 

In [23]:
# 1.1. dictionary-based conversion readacting Wikipedia's Film genre page:

conversion_dict = {
    "action": ["disaster", "martial arts", "spy", "superhero", "wuxia","action","masala","espionage"],
    "adventure": ["pirate", "swashbuckler", "samurai"],
    "animation": ["cgi", "cutout", "live-action animated film", "stop motion", "animated", "computer-animated", "anime"],
    "comedy": ["buddy", "mockumentary", "parody", "slapstick"],
    "drama": ["docudrama", "melodrama", "biodrama", "bio-drama"],
    "historical": ["history", "historic", "alternate history", "period", "period piece", "biopic", "bio-pic", "biographical"],
    "horror": ["ghost", "monster", "vampire", "werewolf", "slash", "splatter", "zombie", "j-horror","supernatural"],
    "science fiction": ["dystopian", "dystopia", "fiction", "post-apocalyptic", "steampunk", "tech noir", "utopian", "science-fiction", "scifi", "sci-fi", "space", "tokusatsu"],
    "thriller": ["mystery", "detective", "crime","suspense"],
    "musical": ["operetta"],
    "romance": ["love","romantic"],
    "western": ["cowboy"],
    "documentary": ["pseudo-documentary"],
    "fantasy":[],
    "sport":["sports","races","dance","biker"],
    "war":['ii','i'],
    "erotic":['ero','adult','erotic','sexploitation'],
    "social":['socio','costume']
}

In [24]:
import re

def genres_preprocessing(genres_dict, genre):
    stop = ['film','short']
    for w in stop:
        if w in genre:
            genre = genre.replace(w,'').strip()
        
    
    splitted = re.split("[,/]", genre)
    if len(splitted) != 1:
        genre = splitted[0]
        
    splitted = re.split("[-—–]", genre)
    if any(item in genres_dict.keys() for item in splitted):
    #if splitted[0] in genres_dict.keys():
        genre = splitted[0]
    
    splitted = re.split(" ", genre.rstrip())
    if len(splitted) != 1:
        genre = splitted[-1]
        
    for key in genres_dict.keys():
        if genre.rstrip() in genres_dict[key]:
            genre = key
        
    return genre.rstrip()

In [25]:
df['Genre2'] = df['Genre']
df['Genre2'] = df['Genre2'].apply(lambda x: genres_preprocessing(conversion_dict, x))

In [26]:
counts = df['Genre2'].value_counts()
to_remove = counts[counts < 50].index
    
df = df[~df.Genre2.isin(to_remove)]

drop_id = df[df.Plot.apply(lambda x: len(x) < 25)].index
df.drop(drop_id, inplace=True)

df = df[df['Genre2'] != '']

In [27]:
df.Genre2.value_counts()

drama              8163
comedy             6545
thriller           2535
action             2064
horror             1369
romance            1339
western             915
animation           819
science fiction     779
adventure           662
musical             555
war                 421
noir                345
family              324
fantasy             313
historical          254
biography           220
social              113
documentary          91
serial               80
arts                 72
sport                61
Name: Genre2, dtype: int64

In [30]:
test_df = df[df.Genre2 == 'fantasy']
test_df.head(50)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Genre2
32,1910,Pocahontas,American,Unknown,"Anna Rosemond, George Barnes, Frank H. Crane",short fantasy,https://en.wikipedia.org/wiki/Pocahontas_(1910...,"Though the film is presumed lost, a synopsis s...",fantasy
92,1914,Neptune's Daughter,American,Herbert Brenon,"Annette Kellerman, William E. Shay, William Welsh",fantasy,https://en.wikipedia.org/wiki/Neptune%27s_Daug...,The daughter of King Neptune takes on human fo...,fantasy
198,1918,The Blue Bird,American,Maurice Tourneur,Tula Belle,fantasy,https://en.wikipedia.org/wiki/The_Blue_Bird_(1...,When poor old widow Berlingot asks Tyltyl and ...,fantasy
211,1918,The Ghost of Slumber Mountain,American,Willis O'Brien,"Herbert M. Dawley, Willis O'Brien",fantasy,https://en.wikipedia.org/wiki/The_Ghost_of_Slu...,Most of the full plot is unknown. In the versi...,fantasy
508,1924,Peter Pan,American,"Herbert Brenon, Glen Castle","Betty Bronson, Ernest Torrence, Virginia Brown...","fantasy, family",https://en.wikipedia.org/wiki/Peter_Pan_(1924_...,"In the story, Peter Pan, a magical boy who ref...",fantasy
538,1925,A Kiss for Cinderella,American,Herbert Brenon,"Betty Bronson, Tom Moore",fantasy,https://en.wikipedia.org/wiki/A_Kiss_for_Cinde...,"In London during World War One, a simple-minde...",fantasy
542,1925,The Lost World,American,Harry Hoyt,"Bessie Love, Wallace Beery.","fantasy, adventure",https://en.wikipedia.org/wiki/The_Lost_World_(...,From a lost expedition to a plateau in Venezue...,fantasy
570,1925,Wizard of Oz,American,Larry Semon,Dorothy Dwan,fantasy,https://en.wikipedia.org/wiki/Wizard_of_Oz_(19...,A toymaker (Semon) reads L. Frank Baum's book ...,fantasy
603,1926,A Kiss for Cinderella,American,Herbert Brenon,"Betty Bronson, Esther Ralston, Dorothy Cumming","fantasy, romance",https://en.wikipedia.org/wiki/A_Kiss_for_Cinde...,"In London during World War One, a simple-minde...",fantasy
615,1926,The Sorrows of Satan,American,D. W. Griffith,"Adolphe Menjou, Ricardo Cortez, Carol Dempster...",fantasy,https://en.wikipedia.org/wiki/The_Sorrows_of_S...,Adolphe Menjou stars as Prince Lucio de Rimane...,fantasy


In [524]:
g = df.groupby("Genre2")
new_df = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))

### Modello

In [541]:
from sklearn.model_selection import train_test_split

X = new_df['Plot'].values[:2000]
y = new_df['Genre2'].values[:2000]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True)

In [542]:
len(X_train),len(y_train),len(X_test),len(y_test)

(982, 982, 421, 421)

In [543]:
import nltk

nlp = spacy.load('en_core_web_sm')

def spacy_nlp_tokenizer(text):

    # we use spacy for main nlp tasks
    doc = nlp(text)
    # lemmatized tokens, skipping stopwords
    lemmas = ['LEMMA_'+token.lemma_ for token in doc if not token.is_stop]
    # entity_types
    entity_types = ['NER_'+token.ent_type_ for token in doc if token.ent_type_]

    # in case an entity linker is available, we can use it do put actual entities as
    # features, e.g. Queen Elizabeth, Elizabeth II, Her Majesty -> KB2912
    # see https://spacy.io/usage/training#entity-linker
    # entities = ['ENT_'+token.ent_kb_id_ for token in doc if token.ent_kb_id_]

    # we use a simple nltk function to create ngrams
    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas,2)]
    lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas,3)]

    all_tokens = list()
    all_tokens.extend(lemmas)
    all_tokens.extend(lemma_bigrams)
    all_tokens.extend(lemma_trigrams)
    all_tokens.extend(entity_types)
    return all_tokens

In [544]:
vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)  

X_train_tok = vect.fit_transform(X_train)

X_test_tok = vect.transform(X_test)

In [547]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC

pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=3000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

classifier = pipeline.fit(X_train_tok,y_train)
predictions = classifier.predict(X_test_tok)

correct = 0
for prediction,true_label in zip(predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(predictions))

0.3515439429928741


In [548]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

         action       0.19      0.18      0.18        17
      adventure       0.36      0.24      0.29        17
      animation       0.43      0.41      0.42        22
           arts       0.60      0.56      0.58        16
      biography       0.50      0.33      0.40        24
         comedy       0.17      0.10      0.12        20
    documentary       0.62      0.68      0.65        22
          drama       0.06      0.07      0.06        15
         family       0.11      0.06      0.08        16
        fantasy       0.17      0.13      0.15        23
        fiction       0.16      0.19      0.17        16
     historical       0.14      0.25      0.18        16
         horror       0.30      0.58      0.40        12
        musical       0.47      0.37      0.41        19
           noir       0.34      0.59      0.43        17
        romance       0.37      0.30      0.33        23
science