In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk

from nltk.corpus import stopwords
from nltk.metrics.distance import jaccard_distance

from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.stats import spearmanr

## 4. Keyword Extraction From Overview

### 4.1. Data load and preprocessing

In [2]:
# English stop-word list from NLTK; requires the NLTK "stopwords" corpus to be
# available locally. Kept as a list because it is also passed to TfidfVectorizer.
stopwords_eng = stopwords.words("english")

# Raw dataset, read from a CSV file located next to the notebook.
imdb_data = pd.read_csv("imdb_top_1000.csv")

In [3]:
# Convert raw text columns into typed values. Each string-parsing step only acts
# on str values, so running this cell again on already-converted data (or on a
# missing value) leaves the value untouched instead of raising AttributeError.

# Remove "," separators and parse as float; non-string values pass through.
imdb_data["Gross"] = imdb_data["Gross"].apply(lambda x: float(x.replace(",", "")) if isinstance(x, str) else x)
# Values that do not parse as a four-digit year become NaT (errors="coerce").
imdb_data["Released_Year"] = pd.to_datetime(imdb_data["Released_Year"], format="%Y", errors="coerce")
# Keep the first whitespace-separated token as an int (presumably values look
# like "142 min" -- confirm against the CSV).
imdb_data["Runtime"] = imdb_data["Runtime"].apply(lambda x: int(x.split()[0]) if isinstance(x, str) else x)
# Split the ", "-separated genre string into a list of genres.
imdb_data["Genre"] = imdb_data["Genre"].apply(lambda x: x.split(", ") if isinstance(x, str) else x)

In [4]:
# Shared WordNet lemmatizer instance, used when preprocessing the overviews.
lemmatizer = nltk.WordNetLemmatizer()

In [5]:
def _preprocess_overview(text):
    """Turn one raw overview string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, tokenize, drop stop words, drop tokens that are
    not purely alphanumeric, drop tokens containing any digit, lemmatize.
    """
    tokens = nltk.word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords_eng
        and token.isalnum()
        and not any(char.isdigit() for char in token)
    ]


imdb_data["Overview_Processed"] = imdb_data["Overview"].apply(_preprocess_overview)

### 4.2. Keyword extraction

We fit the TF-IDF vectorizer on all the preprocessed overviews in the dataset, so its vocabulary is built from the whole corpus.

In [6]:
# One space-joined document per movie, rebuilt from the cleaned token lists.
all_overviews = [" ".join(tokens) for tokens in imdb_data["Overview_Processed"]]

# max_df=0.90 ignores terms that occur in more than 90% of the documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_eng, max_df=0.90)
tfidf_vector = tfidf_vectorizer.fit_transform(all_overviews)

# Maps each column index of the TF-IDF matrix to its vocabulary term.
feature_names = tfidf_vectorizer.get_feature_names_out()

We then get the top 10 keywords for each overview in the database.

In [7]:
def get_top_keywords_for_overview(overview, feature_names=feature_names, top_n=10):
    """Return up to ``top_n`` vocabulary terms with the highest TF-IDF score.

    Args:
        overview: Iterable of token strings; the tokens are joined with single
            spaces to form the document that is scored.
        feature_names: Indexable mapping from vectorizer column index to term.
            The default is bound when this function is defined, so re-fitting
            the vectorizer requires re-running this definition.
        top_n: Maximum number of keywords to return.

    Returns:
        List of terms ordered by descending score; equal scores are ordered by
        descending column index.
    """
    # Score the document with the already-fitted notebook-level vectorizer.
    scores = tfidf_vectorizer.transform([" ".join(overview)]).tocoo()

    # Tuples compare element-wise, so (score, column) sorts by score first and
    # falls back to the column index on ties.
    ranked = sorted(zip(scores.data, scores.col), reverse=True)

    return [feature_names[column] for _, column in ranked[:top_n]]

In [8]:
# Extract the keyword list for every movie from its preprocessed overview tokens.
imdb_data["Top_Keywords"] = imdb_data["Overview_Processed"].map(get_top_keywords_for_overview)

# Preview the raw overview next to the extracted keywords.
imdb_data.head()[["Overview", "Top_Keywords"]]

Unnamed: 0,Overview,Top_Keywords
0,Two imprisoned men bond over a number of years...,"[eventual, decency, solace, number, common, fi..."
1,An organized crime dynasty's aging patriarch t...,"[transfer, organized, dynasty, clandestine, pa..."
2,When the menace known as the Joker wreaks havo...,"[injustice, accept, wreaks, test, psychologica..."
3,The early life and career of Vito Corleone in ...,"[vito, tightens, portrayed, grip, expands, cor..."
4,A jury holdout attempts to prevent a miscarria...,"[reconsider, miscarriage, jury, holdout, evide..."


### 4.3. Keyword comparison

Because this dataset contains only top-rated movies, we compare the top 20% and the bottom 20% of movies by rating.

In [9]:
total_entries = len(imdb_data)

# Each comparison group holds one fifth (20%) of the dataset.
num_high_rated = num_low_rated = total_entries // 5

# Best-rated movies first, then keep the leading slice.
high_rated_movies = imdb_data.sort_values(by="IMDB_Rating", ascending=False).iloc[:num_high_rated]
# Worst-rated movies first, then keep the leading slice.
low_rated_movies = imdb_data.sort_values(by="IMDB_Rating", ascending=True).iloc[:num_low_rated]

In [10]:
# Sanity check: both subsets should have the same number of rows.
high_rated_movies.shape, low_rated_movies.shape

((200, 18), (200, 18))

#### 4.3.1 High-rated movies

In [11]:
# Preview the overviews and extracted keywords of the high-rated subset.
high_rated_movies[['Overview', 'Top_Keywords']].head()

Unnamed: 0,Overview,Top_Keywords
0,Two imprisoned men bond over a number of years...,"[eventual, decency, solace, number, common, fi..."
1,An organized crime dynasty's aging patriarch t...,"[transfer, organized, dynasty, clandestine, pa..."
2,When the menace known as the Joker wreaks havo...,"[injustice, accept, wreaks, test, psychologica..."
3,The early life and career of Vito Corleone in ...,"[vito, tightens, portrayed, grip, expands, cor..."
4,A jury holdout attempts to prevent a miscarria...,"[reconsider, miscarriage, jury, holdout, evide..."


We will calculate a frequency dataframe to get an idea of the most frequent keywords in this subset.

In [12]:
# One row per (movie, keyword); explode() yields NaN for an empty keyword list,
# so missing values are dropped before counting.
high_rated_movies_keywords = (
    high_rated_movies["Top_Keywords"]
    .explode()
    .dropna()
    .value_counts()
    .reset_index()
)
high_rated_movies_keywords.columns = ["Keyword", "Frequency"]
high_rated_movies_keywords = high_rated_movies_keywords.sort_values(by="Frequency", ascending=False)
# Rank is the 1-based row position after sorting by frequency.
high_rated_movies_keywords["Rank"] = pd.RangeIndex(start=1, stop=len(high_rated_movies_keywords) + 1)

In [13]:
# Number of distinct keywords in the high-rated subset (rows) and columns.
high_rated_movies_keywords.shape

(1512, 3)

We see that the top keywords in the high-rated movies are related to crime and possibly war.

In [14]:
# First ten rows of the frequency table, i.e. the ten best-ranked keywords.
high_rated_movies_keywords.head(10)

Unnamed: 0,Keyword,Frequency,Rank
0,crime,7,1
1,find,6,2
2,german,5,3
3,bandit,5,4
4,life,5,5
5,nazi,5,6
6,try,5,7
15,ruthless,4,8
23,city,4,9
22,family,4,10


#### 4.3.2. Low-rated movies

In [15]:
# Preview the overviews and extracted keywords of the low-rated subset.
low_rated_movies[['Overview', 'Top_Keywords']].head()

Unnamed: 0,Overview,Top_Keywords
999,A man in London tries to help a counter-espion...,"[agent, information, man, killed, top, stand, ..."
908,Dave Lizewski is an unnoticed high school stud...,"[unnoticed, though, meaningful, lizewski, dave..."
909,The story of two men on different sides of a p...,"[ordeal, rebellion, pose, inmate, riot, revolt..."
910,Oakland A's general manager Billy Beane's succ...,"[oakland, lean, employing, budget, beane, anal..."
911,"A brilliant plastic surgeon, haunted by past t...","[withstands, volatile, type, plastic, obsessio..."


In [16]:
# Flatten the per-movie keyword lists into one keyword per row, discarding the
# NaN entries that explode() produces for empty lists.
low_keywords_flat = low_rated_movies["Top_Keywords"].explode().dropna()

# Count occurrences of each keyword and expose the counts as a two-column frame.
low_rated_movies_keywords = low_keywords_flat.value_counts().reset_index()
low_rated_movies_keywords.columns = ["Keyword", "Frequency"]
low_rated_movies_keywords = low_rated_movies_keywords.sort_values(by="Frequency", ascending=False)

# Rank is the 1-based row position after sorting by frequency.
low_rated_movies_keywords["Rank"] = pd.RangeIndex(start=1, stop=len(low_rated_movies_keywords) + 1)

In [17]:
# Number of distinct keywords in the low-rated subset (rows) and columns.
low_rated_movies_keywords.shape

(1563, 3)

The lower rated movies have top keywords related to people's relationships, like family and romance.

In [18]:
# First ten rows of the frequency table, i.e. the ten best-ranked keywords.
low_rated_movies_keywords.head(10)

Unnamed: 0,Keyword,Frequency,Rank
0,family,10,1
1,two,6,2
2,relationship,5,3
3,love,5,4
4,life,5,5
11,get,4,6
16,wife,4,7
14,become,4,8
13,year,4,9
12,travel,4,10


#### 4.3.3. Comparison

We will analyze the keywords both subsets have in common.

In [19]:
# Inner join on the keyword: only keywords present in BOTH subsets are kept.
# Frequency and Rank get a suffix naming the subset they came from.
merged_keywords = pd.merge(
    high_rated_movies_keywords,
    low_rated_movies_keywords,
    how="inner",
    on="Keyword",
    suffixes=("_High_Rated", "_Low_Rated"),
)

Most of the terms with a high frequency in the highest-rated movies have a low frequency in the lower-rated movies. This may indicate that there are many war-related movies among the top-rated movies.

In [20]:
# Shared keywords ordered by their rank within the high-rated subset.
merged_keywords.sort_values(by=["Rank_High_Rated"], ascending=True).head(10)

Unnamed: 0,Keyword,Frequency_High_Rated,Rank_High_Rated,Frequency_Low_Rated,Rank_Low_Rated
0,crime,7,1,1,687
1,find,6,2,2,106
2,german,5,3,1,921
3,bandit,5,4,2,146
4,life,5,5,5,5
5,nazi,5,6,1,775
6,try,5,7,1,1348
7,ruthless,4,8,1,632
8,city,4,9,2,169
9,family,4,10,10,1


The difference in frequencies is not as marked for the top terms of the lower-rated movies. This might be because drama is the most popular genre throughout the dataset, so words related to drama appear frequently in all movies.

In [21]:
# Shared keywords ordered by their rank within the low-rated subset.
merged_keywords.sort_values(by=["Rank_Low_Rated"], ascending=True).head(10)

Unnamed: 0,Keyword,Frequency_High_Rated,Rank_High_Rated,Frequency_Low_Rated,Rank_Low_Rated
9,family,4,10,10,1
13,two,4,14,6,2
47,relationship,3,76,5,3
312,love,1,937,5,4
4,life,5,5,5,5
111,get,2,207,4,6
70,wife,2,126,4,7
130,become,2,235,4,8
76,year,2,135,4,9
114,travel,2,215,4,10


#### 4.3.3.1. Jaccard similarity

In [22]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets, computed as 1 - Jaccard distance.

    Args:
        set1: First set of items.
        set2: Second set of items.

    Returns:
        A value between 0 and 1; 1.0 when both sets are empty.
    """
    # The distance divides by the size of the union, which is zero when both
    # sets are empty. By convention two empty sets are treated as identical.
    if not set1 and not set2:
        return 1.0
    return 1 - jaccard_distance(set1, set2)

In [23]:
jaccard_similarity = jaccard_similarity(
    set(high_rated_movies_keywords['Keyword']),
    set(low_rated_movies_keywords['Keyword'])
)

In [24]:
jaccard_similarity

0.17366412213740456

#### 4.3.3.2. Rank correlation

In [25]:
# Spearman correlation between the two rank columns of the shared keywords.
# The p-value gets a real name instead of `_`, which is the conventional
# "discard" name and is also used by IPython for the last cell output.
# NOTE(review): the Rank columns give distinct consecutive ranks to keywords
# with the same Frequency, so the order of tied keywords affects this result.
spearman_corr, spearman_pvalue = spearmanr(
    merged_keywords["Rank_High_Rated"], merged_keywords["Rank_Low_Rated"]
)
print("Spearman rank correlation:", spearman_corr)
print("Significance level (p-value):", spearman_pvalue)

Spearman rank correlation: 0.16618643355253607
Significance level (p-value): 0.0003709795819801635


#### 4.3.3.3. Differences

In [26]:
# Keywords of the high-rated subset that never appear in the low-rated subset.
high_rated_movies_keywords[~high_rated_movies_keywords["Keyword"].isin(low_rated_movies_keywords["Keyword"])]

Unnamed: 0,Keyword,Frequency,Rank
14,universe,4,17
11,event,4,19
13,behind,4,24
70,defend,3,25
77,justice,3,27
...,...,...,...
708,attention,1,1506
706,bride,1,1508
705,woodcutter,1,1509
704,arrange,1,1510


In [27]:
# Keywords of the low-rated subset that never appear in the high-rated subset.
low_rated_movies_keywords[~low_rated_movies_keywords["Keyword"].isin(high_rated_movies_keywords["Keyword"])]

Unnamed: 0,Keyword,Frequency,Rank
10,student,4,12
9,la,4,17
52,another,3,18
56,opposite,3,20
53,comedy,3,23
...,...,...,...
716,organization,1,1556
715,weapon,1,1557
713,kirk,1,1559
712,mass,1,1560
