In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'genre-classification-dataset-imdb:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1417162%2F2347441%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240219%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240219T110821Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da351be0ae36a67bb2f773d2b621d91e8e0a4c90bbcdf28727cf5fd84d42099f82799331bd12b799227cf7729ed8aef6c13a9b5d3991df4b272f154ee45bb11d12cba330299ed898f5e7e1cb37342728a2fcc204f1a9d2c679d8941c8fd875f0635fa4425bced45c652b02d723bc1b99829e391fb0a8f04b2683b86589eadf3e1439d62de4223b5be3c989e78d89dbfa3ca961bc26e531de5c768b88947785e0b8e2a9d771692a9a258cb899b2d90a7f32dc1fd387a3e5b309e171c03268bcf980ddb381f4050d406d9bff4b99963d19e3b5bda5d9c3aea96b824dc198730a55c3d4881c1815a65d285660564282d9f929f300d13eea2b73162cbf1db3cdb8d60'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading genre-classification-dataset-imdb, 43729690 bytes compressed
Downloaded and uncompressed: genre-classification-dataset-imdb
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/description.txt
/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt",sep=':::', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
display(df.head())
print(df.shape)

  df = pd.read_csv("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt",sep=':::', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


(54214, 4)


In [5]:
'''df_test=pd.read_csv("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt", sep=':::',names=['ID','TITLE','GENRE','DESCRIPTIONS'])
display(train_data.head())
print(test_data.shape)'''

'df_test=pd.read_csv("/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt", sep=\':::\',names=[\'ID\',\'TITLE\',\'GENRE\',\'DESCRIPTIONS\'])\ndisplay(train_data.head())\nprint(test_data.shape)'

In [6]:
X = df['DESCRIPTION'].values
y = df['GENRE'].values

In [7]:
df[0:1]

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...


In [8]:
df

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [9]:
df.isna().sum()

ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

Data pre processing

#Step 1 converting into lower cases, so that two words with such as Ball and ball will be considered as singel token


In [10]:
df['DESCRIPTION']=df['DESCRIPTION'].str.lower()

In [11]:
df['DESCRIPTION']

0         listening in to a conversation between his do...
1         a brother and sister with a past incestuous r...
2         as the bus empties the students for their fie...
3         to help their unemployed father make ends mee...
4         the film's title refers not only to the un-re...
                               ...                        
54209     this short-lived nbc live sitcom centered on ...
54210     the next generation of exploitation. the sist...
54211     ze bestaan echt, is a stand-up comedy about g...
54212     walter and vivian live in the country and hav...
54213     on labor day weekend, 1935, the most intense ...
Name: DESCRIPTION, Length: 54214, dtype: object

Removing html tags if there are any

In [12]:
import re
def remove_html(text):
    pattern=re.compile('<,*?>')
    return pattern.sub('r',text)

In [13]:
df['DESCRIPTION']=df['DESCRIPTION'].apply(remove_html)

In [14]:
'''
def remove_url(text):
    pattern=re.compile(r'')
'''


"\ndef remove_url(text):\n    pattern=re.compile(r'')\n"

Removing punctuation


In [15]:
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
exclude=string.punctuation

In [17]:
'''
def remove_punctuation(text):
    for char in exclude:
        text=text.replace(char,'')
    return text
'''

"\ndef remove_punctuation(text):\n    for char in exclude:\n        text=text.replace(char,'')\n    return text\n"

In [18]:
#df['DESCRIPTION']=df['DESCRIPTION'].apply(remove_punctuation)

The function written above takes longer time to execute so we would use the standard practive for removing punctuation from the text

In [19]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [20]:
df['DESCRIPTION']=df['DESCRIPTION'].apply(remove_punc)

In [21]:
df.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,listening in to a conversation between his do...
1,2,Cupid (1997),thriller,a brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,to help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,the films title refers not only to the unreco...


Removing chatwords

Spelling correction

In [21]:
from textblob import TextBlob

In [22]:
#df['DESCRIPTION']=df['DESCRIPTION'].apply(lambda x: str(TextBlob(x).correct()))
#time consuming so we will skip for now

Removing stopwords

In [22]:
from nltk.corpus import stopwords

In [23]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    new_text = [word for word in text.split() if word not in stop_words]
    return ' '.join(new_text)


In [27]:
df['DESCRIPTION']=df['DESCRIPTION'].apply(remove_stopwords)

In [28]:
df['DESCRIPTION']

0        listening conversation doctor parents 10yearol...
1        brother sister past incestuous relationship cu...
2        bus empties students field trip museum natural...
3        help unemployed father make ends meet edith tw...
4        films title refers unrecovered bodies ground z...
                               ...                        
54209    shortlived nbc live sitcom centered bonino wor...
54210    next generation exploitation sisters kapa bay ...
54211    ze bestaan echt standup comedy growing facing ...
54212    walter vivian live country difficult time keep...
54213    labor day weekend 1935 intense hurricane ever ...
Name: DESCRIPTION, Length: 54214, dtype: object

Removing stopwords using NLTK


In [None]:
#from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
#df['DESCRIPTION']=df['DESCRIPTION'].apply(word_tokenize)

Using spacy for tokeniziation as it will give the best results

In [29]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [30]:
df['DESCRIPTION']=df['DESCRIPTION'].apply(lambda x:[token.text for token in nlp(x)])

In [31]:
df['DESCRIPTION']

0        [listening, conversation, doctor, parents, 10y...
1        [brother, sister, past, incestuous, relationsh...
2        [bus, empties, students, field, trip, museum, ...
3        [help, unemployed, father, make, ends, meet, e...
4        [films, title, refers, unrecovered, bodies, gr...
                               ...                        
54209    [shortlived, nbc, live, sitcom, centered, boni...
54210    [next, generation, exploitation, sisters, kapa...
54211    [ze, bestaan, echt, standup, comedy, growing, ...
54212    [walter, vivian, live, country, difficult, tim...
54213    [labor, day, weekend, 1935, intense, hurricane...
Name: DESCRIPTION, Length: 54214, dtype: object

Stemming and lementization

In [32]:
from nltk.stem.porter import PorterStemmer

In [37]:
ps=PorterStemmer()
def steam_words(text):
    return " ".join([ps.stem(word) for word in text])

In [38]:
df['StemDescription']=df['DESCRIPTION'].apply(steam_words)

0        listen convers doctor parent 10yearold oscar l...
1        brother sister past incestu relationship curre...
2        bu empti student field trip museum natur histo...
3        help unemploy father make end meet edith twin ...
4        film titl refer unrecov bodi ground zero also ...
                               ...                        
54209    shortliv nbc live sitcom center bonino worldfa...
54210    next gener exploit sister kapa bay soror hous ...
54211    ze bestaan echt standup comedi grow face fear ...
54212    walter vivian live countri difficult time keep...
54213    labor day weekend 1935 intens hurrican ever ma...
Name: DESCRIPTION, Length: 54214, dtype: object

In [43]:
sample='walk','walks','walking','walked'

In [46]:
steam_words(sample)

'walk walk walk walk'

In [48]:
df['DESCRIPTION'].head()

0    [listening, conversation, doctor, parents, 10y...
1    [brother, sister, past, incestuous, relationsh...
2    [bus, empties, students, field, trip, museum, ...
3    [help, unemployed, father, make, ends, meet, e...
4    [films, title, refers, unrecovered, bodies, gr...
Name: DESCRIPTION, dtype: object

Lemmatization
A better form of semmetization where the root word is a english word present in the english dictionary

In [49]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [50]:
def lemmatize_text(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a string
    lemmatized_text = ' '.join(lemmatized_words)

    return lemmatized_text


In [54]:
nltk.download('wordnet')
nltk.download('omw-1.4')
import nltk
nltk.download('punkt')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [56]:
df['lemmatized_Description'] = df['DESCRIPTION'].apply(lemmatize_text)


TypeError: expected string or bytes-like object