In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import os
from sklearn.model_selection import train_test_split

In [2]:
# Detect if the notebook run on Kaggle or Localhost.
if os.environ.get('KAGGLE_KERNEL_RUN_TYPE',''):
    path = '../input/imdb-movies-genre-classification/Movies_Genre_Description.csv'
else:
    path = 'Movies_Genre_Description.csv'    

In [3]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,TITLE,GENRE,DESCRIPTION,DATE
0,Oscar et la dame rose,Oscar et la dame rose,Listening in to a conversation between his doc...,2009
1,Cupid,thriller,A brother and sister with a past incestuous re...,1997
2,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980
3,The Secret Sin,drama,To help their unemployed father make ends meet...,1915
4,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108414 entries, 0 to 108413
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   TITLE        108414 non-null  object
 1   GENRE        108414 non-null  object
 2   DESCRIPTION  108414 non-null  object
 3   DATE         108414 non-null  object
dtypes: object(4)
memory usage: 3.3+ MB


In [5]:
df.rename(columns={'TITLE': 'title','DESCRIPTION': 'desc', 'GENRE': 'genre', 'DATE':'date'}, inplace = True)
df.head()

Unnamed: 0,title,genre,desc,date
0,Oscar et la dame rose,Oscar et la dame rose,Listening in to a conversation between his doc...,2009
1,Cupid,thriller,A brother and sister with a past incestuous re...,1997
2,"Young, Wild and Wonderful",adult,As the bus empties the students for their fiel...,1980
3,The Secret Sin,drama,To help their unemployed father make ends meet...,1915
4,The Unrecovered,drama,The film's title refers not only to the un-rec...,2007


# Discover target value (GENRE):

In [6]:
print('Genre counts:', df['genre'].nunique())
print('Genre types percentages:\n:', df['genre'].value_counts()*100/df.shape[0])

Genre counts: 28
Genre types percentages:
: drama                    25.111148
documentary              24.159241
comedy                   13.737156
short                     9.357648
horror                    4.065896
thriller                  2.934123
action                    2.424964
western                   1.903813
reality-tv                1.629863
family                    1.445385
adventure                 1.429705
music                     1.348534
romance                   1.239692
sci-fi                    1.192650
adult                     1.088420
crime                     0.931614
animation                 0.918701
sport                     0.796023
talk-show                 0.721309
fantasy                   0.594942
mystery                   0.587562
musical                   0.510082
biography                 0.487944
history                   0.448282
game-show                 0.356965
news                      0.333905
war                       0.243511
Oscar et la

We have differnet 28 genre type, and (drama, documnetary, comedy) genres are the major with about 62% percent of our dataset, so we need to be careful when we make train_test_split, we need to use startified sampling, and also 'Oscar et la dame rose' Genre it only has a 1 record so we will drop it.

In [7]:
df.drop(df.loc[(df['genre'] == 'Oscar et la dame rose')].index, inplace = True)

In [8]:
df['genre'].value_counts()

drama          27224
documentary    26192
comedy         14893
short          10145
horror          4408
thriller        3181
action          2629
western         2064
reality-tv      1767
family          1567
adventure       1550
music           1462
romance         1344
sci-fi          1293
adult           1180
crime           1010
animation        996
sport            863
talk-show        782
fantasy          645
mystery          637
musical          553
biography        529
history          486
game-show        387
news             362
war              264
Name: genre, dtype: int64

# Split data to train and test:

In [10]:
X = df[['title','desc']].copy()
y = df['genre'].copy()
train_X, train_y, test_X, test_y = train_test_split(X, y, test_size = 0.2, stratify=y)

# Text Preprocessing:

## Remove stop words:

In [12]:
train_X.reset_index(drop=True, inplace = True)
train_X.head()

Unnamed: 0,title,desc
0,Wau: Soaring the Skies in Rich Tradition,"The Malaysian ""Wau"" or kites can be found in a..."
1,The Girls of Alden,Eleven-year old Emily Alden is homeschooling b...
2,Jordan: You Don't Even Know Me,Having spent over two years with the British s...
3,Keine Kanaken an Bord,Frank S. awakens to find himself on a tour bus...
4,The Third Clue,"Set in an Old Dark House, a man is murdered in..."


In [13]:
en_stopwords = stopwords.words('english')
train_X['no_sw_desc'] = train_X['desc'].apply(lambda t: ' '.join([w for w in t.split() if w not in en_stopwords]))

In [14]:
train_X.head()

Unnamed: 0,title,desc,no_sw_desc
0,Wau: Soaring the Skies in Rich Tradition,"The Malaysian ""Wau"" or kites can be found in a...","The Malaysian ""Wau"" kites found shapes colours..."
1,The Girls of Alden,Eleven-year old Emily Alden is homeschooling b...,Eleven-year old Emily Alden homeschooling fana...
2,Jordan: You Don't Even Know Me,Having spent over two years with the British s...,"Having spent two years British supermodel, Jor..."
3,Keine Kanaken an Bord,Frank S. awakens to find himself on a tour bus...,Frank S. awakens find tour bus idea got headed...
4,The Third Clue,"Set in an Old Dark House, a man is murdered in...","Set Old Dark House, man murdered study. With d..."


In [15]:
train_X['no_punc_sw_desc'] = train_X['no_sw_desc'].apply(lambda t: re.sub(r"([^\w\s])", "", t)) 

## Stemming vs. Lemmatization:
I favorite lemmatization more than stemming, but I will use stemming now because our data is big.

In [16]:
ps = PorterStemmer()
train_X['stemmed_no_punc_sw_desc'] = train_X['no_punc_sw_desc'].apply(lambda t: ' '.join([ps.stem(w) for w in t.split()]))

In [18]:
print('Original:\n', train_X['no_punc_sw_desc'][0])
print('\n-------------\nStemmed:\n', train_X['stemmed_no_punc_sw_desc'][0])

Original:
 The Malaysian Wau kites found shapes colours bamboo frames brightly decorated intricate floral cutouts designs pasted shimmering paper multitude vibrant colours A bowshaped device attached neck emanates pleasant highpitched humming sound soars skies enhances eyecatching wau Waumaking ancient art form one traditionally passed father son Traditionally wau flying especially popular east coast states Peninsular Malaysia namely Kelantan Terengganu commonly flown end harvesting season Today cultural heritage Malaysian Wau preserved become popular sporting activity amongst Malay community Come join us Enigmatic Malaysia follow life three Wau makers prepare International Kite Festival Tumpat Kelantan

-------------
Stemmed:
 the malaysian wau kite found shape colour bamboo frame brightli decor intric floral cutout design past shimmer paper multitud vibrant colour a bowshap devic attach neck eman pleasant highpitch hum sound soar sky enhanc eyecatch wau waumak ancient art form one tr