## Preprocessing and Exploratory Data Analysis

In [3]:
import pandas as pd
import re
from spellchecker import SpellChecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/lilia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data Information

In [5]:
# Loading the dataset
dataset_path = 'data/train_data.txt'
data = pd.read_csv(dataset_path, sep=':::', engine='python', header=None)
data.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

In [6]:
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
data.head()

Number of instances = 54214
Number of attributes = 4


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


#### Data Types

In [7]:
# Printing data types of DataFrame
data.dtypes

ID              int64
TITLE          object
GENRE          object
DESCRIPTION    object
dtype: object

#### Dataframe Statistics

In [8]:
# Printing description of DataFrame
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,54214.0,27107.5,15650.378084,1.0,13554.25,27107.5,40660.75,54214.0


#           Data Cleaning and Standardization

 ### Removing HTML tags if the data is scraped from the web

In [9]:
def remove_html_tags(text):
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text


 ### Removing special characters

In [10]:
def remove_special_characters(text):
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return clean_text


 ### Removing emojis and  non-standard symbols

In [11]:
# Define a regex pattern to match emojis and non-standard symbols
# This pattern targets characters outside the typical ASCII range, which includes most emojis and non-standard symbols
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251"  # Enclosed characters
                           "]+", flags=re.UNICODE)

# Function to remove emojis and non-standard symbols
def remove_emojis_and_symbols(text):
    return emoji_pattern.sub(r'', text)

# Apply the function to remove emojis and non-standard symbols from DESCRIPTION
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_emojis_and_symbols)

###   Converting to Lowercase:

In [12]:
 # Convert DESCRIPTION column to lowercase
data['DESCRIPTION'] = data['DESCRIPTION'].str.lower()


### Fixing Encoding Issues

In [13]:
try:
    data = pd.read_csv('data/train_data.txt', 
                       sep=':::', 
                       engine='python',
                       encoding='utf-8',  # Ensure UTF-8 encoding
                       on_bad_lines='skip',
                       quotechar='"', 
                       names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
                      )
    print("Data loaded successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

Data loaded successfully.


###  Remove Extra Whitespace

In [14]:
def remove_extra_whitespace(text):
    words = text.split()
    clean_words = [word.strip() for word in words]
    clean_text = ' '.join(clean_words)
    return clean_text

# Apply the function to the DESCRIPTION column
data['CLEAN_DESCRIPTION'] = data['DESCRIPTION'].apply(remove_extra_whitespace)


In [15]:
# Print sample entries before cleaning
print("Sample entries from DESCRIPTION column before cleaning:")
print(data['DESCRIPTION'].head())

# Print sample entries after cleaning
print("\nSample entries from CLEAN_DESCRIPTION column after cleaning:")
print(data['CLEAN_DESCRIPTION'].head())


Sample entries from DESCRIPTION column before cleaning:
0     Listening in to a conversation between his do...
1     A brother and sister with a past incestuous r...
2     As the bus empties the students for their fie...
3     To help their unemployed father make ends mee...
4     The film's title refers not only to the un-re...
Name: DESCRIPTION, dtype: object

Sample entries from CLEAN_DESCRIPTION column after cleaning:
0    Listening in to a conversation between his doc...
1    A brother and sister with a past incestuous re...
2    As the bus empties the students for their fiel...
3    To help their unemployed father make ends meet...
4    The film's title refers not only to the un-rec...
Name: CLEAN_DESCRIPTION, dtype: object


### Handle Missing Values

In [16]:
# Handle missing values
data.dropna(inplace=True)

# Check for missing values
assert not data.isnull().values.any(), "Missing values are present in the data."

# Print success message
print("Missing values handled successfully.")


Missing values handled successfully.


### Normalization

In [17]:
# Define function to normalize numbers in a column
def normalize_numbers_column(column):
    number_pattern = r'\b\d[\d,.]*\b'
    return column.str.replace(number_pattern, 'NUMBER', regex=True)

### checking cleanliness

In [18]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_html_tags)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("HTML tags were present and successfully removed.")
else:
    print("No HTML tags were found in the DESCRIPTION column.")


No HTML tags were found in the DESCRIPTION column.


In [19]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_special_characters)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("Special characters were present and successfully removed.")
else:
    print("No Special characters were found in the DESCRIPTION column.")


No Special characters were found in the DESCRIPTION column.


In [20]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_emojis_and_symbols
)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("emojis and symbols were present and successfully removed.")
else:
    print("No emojis and symbols were found in the DESCRIPTION column.")


No emojis and symbols were found in the DESCRIPTION column.


In [21]:
def convert_to_lowercase(text):
    return text.lower()

# Apply the function to the DESCRIPTION column
data['CLEAN_DESCRIPTION'] = data['DESCRIPTION'].apply(convert_to_lowercase)

# Check if all text is converted to lowercase
is_lowercase = (data['DESCRIPTION'].str.lower() == data['CLEAN_DESCRIPTION']).all()

if is_lowercase:
    print("All text in the CLEAN_DESCRIPTION column is in lowercase.")
else:
    print("Not all text in the CLEAN_DESCRIPTION column is in lowercase.")


All text in the CLEAN_DESCRIPTION column is in lowercase.


### Tokenization, Stop Words Removal, and Handling Negations:

In [22]:
# Tokenization, Stop Words Removal, and Handling Negations
def preprocess_text(text):
    
    # Removing non-alphanumeric characters
    # This to make sure that we do not have like (film, 's, instead we will have films)
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenizing the description
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Handling negations (e.g., "not good" becomes "not_good")
    for i in range(len(filtered_tokens)):
        if filtered_tokens[i] == 'not' and i + 1 < len(filtered_tokens):
            filtered_tokens[i + 1] = 'not_' + filtered_tokens[i + 1]

    return filtered_tokens

# Applying the preprocessing function to the 'DESCRIPTION' column
data['TOKENIZED_DESCRIPTION'] = data['CLEAN_DESCRIPTION'].apply(preprocess_text)

# Displaying the processed DataFrame
print(data[['ID', 'TITLE', 'GENRE', 'TOKENIZED_DESCRIPTION']])

          ID                                         TITLE          GENRE  \
0          1                 Oscar et la dame rose (2009)          drama    
1          2                                 Cupid (1997)       thriller    
2          3             Young, Wild and Wonderful (1980)          adult    
3          4                        The Secret Sin (1915)          drama    
4          5                       The Unrecovered (2007)          drama    
...      ...                                           ...            ...   
54209  54210                              "Bonino" (1953)         comedy    
54210  54211                  Dead Girls Don't Cry (????)         horror    
54211  54212    Ronald Goedemondt: Ze bestaan echt (2008)    documentary    
54212  54213                     Make Your Own Bed (1944)         comedy    
54213  54214   Nature's Fury: Storm of the Century (2006)        history    

                                   TOKENIZED_DESCRIPTION  
0      [listenin

In [54]:
# Function to check if non-informative words are present in the dataset
def check_cleanliness(dataset):
    non_informative_words = ['the', 'and', 'is', 'are', 'in', 'of', 'to', 'a', 'an', 'as', 'at', 'on', 'by', 'with', 'for']
    for index, row in dataset.iterrows():
        description = row['DESCRIPTION']
        words = description.split()
        informative_words = [word.lower() for word in words if word.lower() not in non_informative_words]
        if not informative_words:
            return False
    return True

# Check the cleanliness of the dataset
clean_dataset = check_cleanliness(data)

# If the dataset is not clean, remove rows containing non-informative words
if not clean_dataset:
    data = data[data['DESCRIPTION'].apply(lambda x: not all(word.lower() in non_informative_words for word in x.split()))]

# Print whether the dataset is clean or not
print("Is the dataset clean?", check_cleanliness(data))


Is the dataset clean? True


In [55]:
data

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION,CLEAN_DESCRIPTION,TOKENIZED_DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...,listening in to a conversation between his doc...,"[listening, conversation, doctor, parents, 10y..."
1,2,Cupid (1997),thriller,a brother and sister with a past incestuous re...,a brother and sister with a past incestuous re...,"[brother, sister, past, incestuous, relationsh..."
2,3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fiel...,as the bus empties the students for their fiel...,"[bus, empties, students, field, trip, museum, ..."
3,4,The Secret Sin (1915),drama,to help their unemployed father make ends meet...,to help their unemployed father make ends meet...,"[help, unemployed, father, make, ends, meet, e..."
4,5,The Unrecovered (2007),drama,the films title refers not only to the unrecov...,the films title refers not only to the unrecov...,"[films, title, refers, unrecovered, bodies, gr..."
...,...,...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,this shortlived nbc live sitcom centered on bo...,this shortlived nbc live sitcom centered on bo...,"[shortlived, nbc, live, sitcom, centered, boni..."
54210,54211,Dead Girls Don't Cry (????),horror,the next generation of exploitation the sister...,the next generation of exploitation the sister...,"[next, generation, exploitation, sisters, kapa..."
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,ze bestaan echt is a standup comedy about grow...,ze bestaan echt is a standup comedy about grow...,"[ze, bestaan, echt, standup, comedy, growing, ..."
54212,54213,Make Your Own Bed (1944),comedy,walter and vivian live in the country and have...,walter and vivian live in the country and have...,"[walter, vivian, live, country, difficult, tim..."
