## IMPORT & READ CSV

In [2]:
import pandas as pd

# Read the TMDB catalogue and discard the columns the preprocessing below never touches.
content = (
    pd.read_csv('./src/TMDB_content.csv')
    .drop(columns=['poster_path', 'year', 'watch_providers'])
)

## PREPROCESSINGS

### 1. KEYWORDS
1. Remove keywords that occur only once (occurrence ≤ 1)

2. Transform keyword to its lemma form

3. Lowercase and remove spaces

In [4]:
# Build a long-format frame with one row per (movie, keyword) pair so that
# keyword frequencies can be counted; `content` itself is left untouched.
# NOTE(review): splitting on ',' alone appears to keep the blank that follows
# each comma, so ' murder' and 'murder' are counted as different keywords.
df = content.assign(keywords=content['keywords'].str.split(',')).explode('keywords')

In [5]:
# Occurrences of each keyword over the exploded frame, most frequent first.
k = df.loc[:, 'keywords'].value_counts()
k

keywords
 based on novel or book    332
 duringcreditsstinger      240
 murder                    202
 sequel                    196
 california                171
                          ... 
 minefield                   1
 corsican                    1
 twilight                    1
 streetwise                  1
 free fall                   1
Name: count, Length: 10589, dtype: int64

In [6]:
# Keep only the keywords that were counted at least twice.
k = k.loc[k.gt(1)]
k

keywords
 based on novel or book    332
 duringcreditsstinger      240
 murder                    202
 sequel                    196
 california                171
                          ... 
 around the world            2
 canoe                       2
 candle                      2
secret identity              2
 covid-19                    2
Name: count, Length: 4995, dtype: int64

In [7]:
def filter_keywords(x, counts=None):
    ''' Keep only the keywords that are present in a frequency table.

    x      : iterable of keyword strings belonging to one movie.
    counts : container supporting `in` (dict, set, or a pandas Series, whose
             `in` test looks at the index). When omitted, the notebook-level
             Series `k` is used, i.e. the keywords that were counted more
             than once.

    Returns a new list with the retained keywords, in their original order.
    '''
    if counts is None:
        # Fall back to the global table so existing one-argument calls keep working.
        counts = k
    return [word for word in x if word in counts]

In [8]:
import spacy
# spaCy pipeline 'en_core_web_sm', loaded once here and called by to_lemma for every keyword.
nlp = spacy.load('en_core_web_sm')

def to_lemma(text):
    ''' Return `text` with each token replaced by its lemma, joined by single spaces '''
    return ' '.join(tok.lemma_ for tok in nlp(text))

In [9]:
# Single pass over the raw keyword strings:
#   split on ',' -> drop keywords absent from the frequency table `k`
#   -> lemmatise -> strip inner spaces and lowercase.
content['keywords'] = content['keywords'].apply(
    lambda raw: [
        to_lemma(word).replace(' ', '').lower()
        for word in filter_keywords(str(raw).split(','))
    ]
)

In [10]:
# Flatten the per-movie keyword lists into one Series (index = original row label).
# explode() yields NaN for movies whose list is empty; dropna() removes those rows,
# which matches what stacking a NaN-padded wide frame used to produce, without
# building one pd.Series per row.
k_clean = content['keywords'].explode().dropna()
k_clean.name = 'keyword'
k_clean.value_counts()

keyword
baseonnovelorbook       496
duringcreditsstinger    244
sequel                  209
murder                  202
newyorkcity             201
                       ... 
universe                  2
earth                     2
warstrategy               2
slowmotion                2
platoniclove              2
Name: count, Length: 4649, dtype: int64

Results:
1. Keywords are replaced by their lemma form (e.g. 'based on novel or book' becomes 'baseonnovelorbook')

2. keywords that appear only once are removed

### 2. GENRES, CAST, DIRECTOR

In [11]:
def _is_missing(value):
    ''' True when a cell holds None / NaN / NA rather than real text '''
    return pd.api.types.is_scalar(value) and pd.isna(value)


def _to_tokens(value):
    ''' Split a comma-separated cell into lowercase tokens without spaces.

    Missing cells give an empty list, so that no literal 'nan' token is
    produced by str(NaN) and later leaks into the soup.
    '''
    if _is_missing(value):
        return []
    return [item.replace(' ', '').lower() for item in str(value).split(',')]


content['genres'] = content['genres'].apply(_to_tokens)
content['cast'] = content['cast'].apply(_to_tokens)

# The director is not split on ','; the single token is repeated three times.
# A missing director gives an empty list instead of raising on NaN.replace().
content['director'] = content['director'].apply(
    lambda name: [] if _is_missing(name) else [str(name).replace(' ', '').lower()] * 3
)

In [12]:
# We keep at most the first 3 actors of each movie as the main cast.
# Slicing is applied per row: lists shorter than 3 are returned whole (as a copy),
# so no length test is needed. The former `len(content['cast']) > 3` compared the
# number of rows in the frame, not the size of each cast list.
content['main_cast'] = content['cast'].apply(lambda actors: actors[:3])

## GATHER AND SAVE DATA

In [13]:
# Build the "soup": for each movie, concatenate the token lists in the order
# genres, keywords, main cast, director and join them with single spaces.
content['soup'] = [
    ' '.join(genre_tokens + keyword_tokens + cast_tokens + director_tokens)
    for genre_tokens, keyword_tokens, cast_tokens, director_tokens in zip(
        content['genres'], content['keywords'], content['main_cast'], content['director']
    )
]

In [17]:
# Drop the intermediate feature columns (their tokens are already in 'soup')
# and write the remaining columns, together with the index, to CSV.
preprocessed_content = content.drop(
    columns=['title', 'genres', 'keywords', 'cast', 'director', 'main_cast']
)
preprocessed_content.to_csv('./src/preprocessed_content_3_3.csv')