## Preprocessing and Exploratory Data Analysis

In [146]:
import pandas as pd
import re
from spellchecker import SpellChecker

### Data Information

In [159]:
# Read the ':::'-separated training file (no header row) and name its four columns
dataset_path = 'data/train_data.txt'
data = pd.read_csv(
    dataset_path,
    sep=':::',
    engine='python',  # a multi-character separator needs the python engine
    header=None,
)
data.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

In [160]:
# Report the frame's dimensions, then preview its first rows
row_count, column_count = data.shape
print('Number of instances = %d' % row_count)
print('Number of attributes = %d' % column_count)
data.head()

Number of instances = 54214
Number of attributes = 4


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


#### Data Types

In [161]:
# Show the dtype pandas inferred for each column of `data`
data.dtypes

ID              int64
TITLE          object
GENRE          object
DESCRIPTION    object
dtype: object

#### Dataframe Statistics

In [162]:
# Summary statistics from describe(), transposed so each described column is a row
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,54214.0,27107.5,15650.378084,1.0,13554.25,27107.5,40660.75,54214.0


# Data Cleaning and Standardization

### Cleaning Special Characters

Remove HTML tags, which can appear when the descriptions have been scraped from the web.

In [163]:
# Matches an HTML-style tag: '<', one or more characters that are not '>', then '>'
html_pattern = re.compile(r'<[^>]+>')

def find_html_tags(text):
    """Return a list of every HTML-style tag found in `text`, in order of appearance.

    Parameters:
        text (str): The text to scan.

    Returns:
        list[str]: The matched tag substrings; empty when there are none.
    """
    return [tag_match.group(0) for tag_match in html_pattern.finditer(text)]

# Strip HTML tags out of DESCRIPTION rather than discarding every row that
# contains one: the row's label and remaining text are still usable, and no
# helper flag column is left behind in the dataset.
data['DESCRIPTION'] = data['DESCRIPTION'].str.replace(html_pattern, '', regex=True)

# Check if there are any HTML tags left in the dataset
# (na=False so a missing description counts as "no tag" instead of propagating NaN)
html_tags_present = data['DESCRIPTION'].str.contains(html_pattern, na=False).any()

# Print the result
if html_tags_present:
    print("HTML tags are still present in the dataset.")
else:
    print("No HTML tags found in the dataset.")


No HTML tags found in the dataset.


In [164]:

# Matches any single character that is neither a word character (\w) nor whitespace (\s)
special_char_pattern = re.compile(r'[^\w\s]')

def find_special_characters(text):
    """Return a list of every special character found in `text`, in order of appearance.

    Parameters:
        text (str): The text to scan.

    Returns:
        list[str]: One entry per matched character; empty when there are none.
    """
    return [char_match.group(0) for char_match in special_char_pattern.finditer(text)]

# Remove special characters from the text itself. The pattern matches ordinary
# punctuation (full stops, commas, apostrophes), so dropping every matching
# row would discard almost all descriptions; cleaning in place keeps the rows
# and avoids leaving a helper flag column in the dataset.
# Matches are deleted outright, so "film's" becomes "films".
data['DESCRIPTION'] = data['DESCRIPTION'].str.replace(special_char_pattern, '', regex=True)

# Check if there are any special characters left in the dataset
# (na=False so a missing description counts as "none found")
special_chars_present = data['DESCRIPTION'].str.contains(special_char_pattern, na=False).any()

# Print the result
if special_chars_present:
    print("Special characters are still present in the dataset.")
else:
    print("No special characters found in the dataset.")

No special characters found in the dataset.


In [165]:
# Regex character class covering emoji and pictographic symbol blocks.
# Each range is limited to a specific Unicode block. The earlier single range
# U+24C2..U+1F251 also swallowed CJK ideographs, kana, Hangul and many other
# scripts, so legitimate non-English text was being deleted.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)

def remove_emojis_and_symbols(text):
    """Return `text` with every character matched by `emoji_pattern` removed.

    Parameters:
        text (str): The text to clean.

    Returns:
        str: The text without emoji/symbol characters; all other characters,
        including non-Latin scripts, are left untouched.
    """
    return emoji_pattern.sub('', text)

# Clean every DESCRIPTION entry with remove_emojis_and_symbols
cleaned_descriptions = data['DESCRIPTION'].apply(remove_emojis_and_symbols)
data['DESCRIPTION'] = cleaned_descriptions

In [166]:
# Flag every row whose DESCRIPTION still matches the emoji/symbol pattern
contains_emojis_or_symbols = data['DESCRIPTION'].apply(lambda x: bool(emoji_pattern.search(x)))

if contains_emojis_or_symbols.any():
    print("Emojis or non-standard symbols found in some entries.")
    # You can print a sample or save to a CSV file for detailed review
    flagged_descriptions = data.loc[contains_emojis_or_symbols, 'DESCRIPTION']
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the sample size at the number of flagged rows.
    print(flagged_descriptions.sample(min(10, len(flagged_descriptions))))
else:
    print("No emojis or non-standard symbols found in the dataset.")

No emojis or non-standard symbols found in the dataset.


### Converting to Lowercase

In [167]:
# Lowercasing helper used for the text columns
def convert_to_lowercase(text):
    """Return a lowercased copy of `text`.

    Parameters:
        text (str): The text to convert.

    Returns:
        str: `text` with all cased characters converted to lowercase.
    """
    lowered = text.lower()
    return lowered

# Lowercase both free-text columns, TITLE first and then DESCRIPTION
for text_column in ('TITLE', 'DESCRIPTION'):
    data[text_column] = data[text_column].apply(convert_to_lowercase)

# Verification: every value must equal its own lowercased form
title_lowercase_check = data['TITLE'].eq(data['TITLE'].str.lower()).all()
description_lowercase_check = data['DESCRIPTION'].eq(data['DESCRIPTION'].str.lower()).all()

if not (title_lowercase_check and description_lowercase_check):
    print("Some text in TITLE and/or DESCRIPTION has not been converted to lowercase.")
else:
    print("All text in TITLE and DESCRIPTION has been successfully converted to lowercase.")

All text in TITLE and DESCRIPTION has been successfully converted to lowercase.


### Fixing Encoding Issues

In [168]:
# NOTE(review): this re-reads the raw file into `data`, replacing the frame
# produced by the preceding cells, so cleaning applied before this point does
# not carry over. Consider moving these read options into the initial load.
try:
    data = pd.read_csv('data/train_data.txt', 
                       sep=':::', 
                       engine='python',
                       encoding='utf-8',  # Ensure UTF-8 encoding
                       on_bad_lines='skip',  # malformed lines are dropped rather than raising
                       quotechar='"', 
                       names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
                      )
    print("Data loaded successfully.")
except Exception as e:
    # Report the failure, then re-raise: continuing would silently leave the
    # previous `data` in place and let later cells run on stale input.
    print(f"An error occurred: {e}")
    raise

Data loaded successfully.


### Remove Extra Whitespace

In [169]:
# Whitespace normalisation helper
def remove_extra_whitespace(text):
    """Collapse each run of whitespace in `text` to one space and trim both ends.

    Parameters:
        text (str): The text to normalise.

    Returns:
        str: The text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Apply the function to the DESCRIPTION column.
# na_action='ignore' passes missing values through untouched: the helper calls
# re.sub, which raises TypeError on NaN, and missing values are only dropped
# in the later "Handle Missing Values" step.
data['DESCRIPTION'] = data['DESCRIPTION'].map(remove_extra_whitespace, na_action='ignore')

# Check if there are any rows with extra white spaces
# (na=False so a missing description counts as "no extra whitespace")
extra_whitespace_present = data['DESCRIPTION'].str.contains(r'\s{2,}', na=False).any()

# Print the result
if extra_whitespace_present:
    print("Extra white spaces are still present in the dataset.")
else:
    print("No extra white spaces found in the dataset.")

No extra white spaces found in the dataset.


### Handle Missing Values

In [170]:
# Handle missing values: drop every row that has a missing value in any column.
# The result is reassigned instead of using inplace=True, so the step reads as
# an explicit transformation of `data` and can be chained with other methods.
data = data.dropna()

# Check for missing values
assert not data.isna().to_numpy().any(), "Missing values are present in the data."

# Print success message
print("Missing values handled successfully.")


Missing values handled successfully.


### Remove Non-informative Text

In [171]:
# Length-based predicate used to keep only informative descriptions
def filter_non_informative_text(text, min_length=10):
    """
    Decide whether a text is long enough to count as informative.

    Parameters:
        text (str): The text to be evaluated.
        min_length (int): Smallest length, in characters, that is still
            considered informative. Default is 10.

    Returns:
        bool: True when len(text) is at least min_length, False otherwise.
    """
    text_length = len(text)
    return min_length <= text_length

# Keep only the rows whose DESCRIPTION passes the minimum-length check
is_informative = data['DESCRIPTION'].apply(filter_non_informative_text)
data = data.loc[is_informative]

# Print success message
print("Non-informative text removed successfully.")


Non-informative text removed successfully.
