## Preprocessing and Exploratory Data Analysis

In [129]:
import pandas as pd
import re
from spellchecker import SpellChecker
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/lilia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data Information

In [130]:
# Loading the dataset
dataset_path = 'data/train_data.txt'
# data = pd.read_csv(dataset_path, sep=':::', engine='python', header=None)
# data.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
data = pd.read_csv("train_data.txt", delimiter = ' ::: ', header = None, names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

  data = pd.read_csv("train_data.txt", delimiter = ' ::: ', header = None, names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


FileNotFoundError: [Errno 2] No such file or directory: 'train_data.txt'

In [None]:
print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))
data.head()

#### Data Types

In [131]:
# Printing data types of DataFrame
data.dtypes

ID                        int64
TITLE                    object
GENRE                    object
DESCRIPTION              object
CLEAN_DESCRIPTION        object
TOKENIZED_DESCRIPTION    object
dtype: object

#### Dataframe Statistics

In [132]:
# Printing description of DataFrame
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,54214.0,27107.5,15650.378084,1.0,13554.25,27107.5,40660.75,54214.0


#           Data Cleaning and Standardization

 ### Removing HTML tags if the data is scraped from the web

In [133]:
def remove_html_tags(text):
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text


 ### Removing special characters

In [134]:
def remove_special_characters(text):
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return clean_text


 ### Removing emojis and  non-standard symbols

In [135]:
# Define a regex pattern to match emojis and non-standard symbols
# This pattern targets characters outside the typical ASCII range, which includes most emojis and non-standard symbols
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251"  # Enclosed characters
                           "]+", flags=re.UNICODE)

# Function to remove emojis and non-standard symbols
def remove_emojis_and_symbols(text):
    return emoji_pattern.sub(r'', text)

# Apply the function to remove emojis and non-standard symbols from DESCRIPTION
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_emojis_and_symbols)

###   Converting to Lowercase:

In [136]:
 # Convert DESCRIPTION column to lowercase
data['DESCRIPTION'] = data['DESCRIPTION'].str.lower()


### Fixing Encoding Issues

In [137]:
try:
    data = pd.read_csv('data/train_data.txt', 
                       sep=':::', 
                       engine='python',
                       encoding='utf-8',  # Ensure UTF-8 encoding
                       on_bad_lines='skip',
                       quotechar='"', 
                       names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
                      )
    print("Data loaded successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

Data loaded successfully.


###  Remove Extra Whitespace

In [138]:
def remove_extra_whitespace(text):
    words = text.split()
    clean_words = [word.strip() for word in words]
    clean_text = ' '.join(clean_words)
    return clean_text

# Apply the function to the DESCRIPTION column
data['CLEAN_DESCRIPTION'] = data['DESCRIPTION'].apply(remove_extra_whitespace)


In [139]:
# Print sample entries before cleaning
print("Sample entries from DESCRIPTION column before cleaning:")
print(data['DESCRIPTION'].head())

# Print sample entries after cleaning
print("\nSample entries from CLEAN_DESCRIPTION column after cleaning:")
print(data['CLEAN_DESCRIPTION'].head())


Sample entries from DESCRIPTION column before cleaning:
0     Listening in to a conversation between his do...
1     A brother and sister with a past incestuous r...
2     As the bus empties the students for their fie...
3     To help their unemployed father make ends mee...
4     The film's title refers not only to the un-re...
Name: DESCRIPTION, dtype: object

Sample entries from CLEAN_DESCRIPTION column after cleaning:
0    Listening in to a conversation between his doc...
1    A brother and sister with a past incestuous re...
2    As the bus empties the students for their fiel...
3    To help their unemployed father make ends meet...
4    The film's title refers not only to the un-rec...
Name: CLEAN_DESCRIPTION, dtype: object


### Handle Missing Values

In [140]:
# Handle missing values
data.dropna(inplace=True)

# Check for missing values
assert not data.isnull().values.any(), "Missing values are present in the data."

# Print success message
print("Missing values handled successfully.")


Missing values handled successfully.


### Normalization

In [153]:
# Define function to normalize numbers in a column
def normalize_numbers_column(column):
    number_pattern = r'\b\d[\d,.]*\b'
    return column.str.replace(number_pattern, 'NUMBER', regex=True)

### Non-informative words

In [162]:
import nltk
from nltk.corpus import stopwords

# Download the set of stop words the first time
nltk.download('stopwords')

# Load the stop words
non_informative_words = set(stopwords.words('english'))

# Define the function to remove non-informative words
def remove_non_informative(text):
    # Split the text into words and remove non-informative words
    return ' '.join(word for word in text.split() if word.lower() not in non_informative_words)

# Assuming 'df' is your DataFrame and it has a column 'CLEAN_DESCRIPTION'
# Apply the function to remove non-informative words from 'CLEAN_DESCRIPTION'
df['CLEAN_DESCRIPTION'] = df['CLEAN_DESCRIPTION'].apply(remove_non_informative)

# Check again for non-informative words in 'CLEAN_DESCRIPTION'
df['contains_non_informative'] = df['CLEAN_DESCRIPTION'].apply(
    lambda x: any(word.lower() in non_informative_words for word in x.split())
)

# Display the DataFrame to verify the removal
display_df = df[['CLEAN_DESCRIPTION', 'contains_non_informative']]

# Summarize the checks
summary = {
    'Non-informative Words Removed': not df['contains_non_informative'].any()
}

summary


[nltk_data] Downloading package stopwords to /Users/lilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(                                       CLEAN_DESCRIPTION  \
 0      listening conversation doctor parents 10yearol...   
 1      brother sister past incestuous relationship cu...   
 2      bus empties students field trip museum natural...   
 3      help unemployed father make ends meet edith tw...   
 4      films title refers unrecovered bodies ground z...   
 ...                                                  ...   
 54209  shortlived nbc live sitcom centered bonino wor...   
 54210  next generation exploitation sisters kapa bay ...   
 54211  ze bestaan echt standup comedy growing facing ...   
 54212  walter vivian live country difficult time keep...   
 54213  labor day weekend 1935 intense hurricane ever ...   
 
        contains_non_informative  
 0                         False  
 1                         False  
 2                         False  
 3                         False  
 4                         False  
 ...                         ...  
 54209               

### checking cleanliness

In [163]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_html_tags)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("HTML tags were present and successfully removed.")
else:
    print("No HTML tags were found in the DESCRIPTION column.")


No HTML tags were found in the DESCRIPTION column.


In [164]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_special_characters)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("Special characters were present and successfully removed.")
else:
    print("No Special characters were found in the DESCRIPTION column.")


No Special characters were found in the DESCRIPTION column.


In [165]:
# Get the length of the DESCRIPTION column before removing HTML tags
original_length = len(data['DESCRIPTION'])

# Apply the function to remove HTML tags
data['DESCRIPTION'] = data['DESCRIPTION'].apply(remove_emojis_and_symbols
)
# Get the length of the DESCRIPTION column after removing HTML tags
cleaned_length = len(data['DESCRIPTION'])

# Check if the lengths are different
if original_length != cleaned_length:
    print("emojis and symbols were present and successfully removed.")
else:
    print("No emojis and symbols were found in the DESCRIPTION column.")


No emojis and symbols were found in the DESCRIPTION column.


In [166]:
def convert_to_lowercase(text):
    return text.lower()

# Apply the function to the DESCRIPTION column
data['CLEAN_DESCRIPTION'] = data['DESCRIPTION'].apply(convert_to_lowercase)

# Check if all text is converted to lowercase
is_lowercase = (data['DESCRIPTION'].str.lower() == data['CLEAN_DESCRIPTION']).all()

if is_lowercase:
    print("All text in the CLEAN_DESCRIPTION column is in lowercase.")
else:
    print("Not all text in the CLEAN_DESCRIPTION column is in lowercase.")


All text in the CLEAN_DESCRIPTION column is in lowercase.


### Tokenization, Stop Words Removal, and Handling Negations:

In [167]:
# Tokenization, Stop Words Removal, and Handling Negations
def preprocess_text(text):
    
    # Removing non-alphanumeric characters
    # This to make sure that we do not have like (film, 's, instead we will have films)
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenizing the description
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Handling negations (e.g., "not good" becomes "not_good")
    for i in range(len(filtered_tokens)):
        if filtered_tokens[i] == 'not' and i + 1 < len(filtered_tokens):
            filtered_tokens[i + 1] = 'not_' + filtered_tokens[i + 1]

    return filtered_tokens

# Applying the preprocessing function to the 'DESCRIPTION' column
data['TOKENIZED_DESCRIPTION'] = data['CLEAN_DESCRIPTION'].apply(preprocess_text)



## Summary

In [174]:
# Assuming 'data' is your DataFrame and it has a column 'CLEAN_DESCRIPTION'
df = pd.DataFrame(data)

# 1. Check for HTML tags
html_tag_pattern = re.compile(r'<[^>]+>')
df['contains_html_tags'] = df['CLEAN_DESCRIPTION'].apply(lambda x: bool(html_tag_pattern.search(x)))

# 2. Check for special characters
special_chars_pattern = re.compile(r'[@#$%]')
df['contains_special_chars'] = df['CLEAN_DESCRIPTION'].apply(lambda x: bool(special_chars_pattern.search(x)))

# 3. Check for non-standard symbols or emojis
non_standard_pattern = re.compile(r'[^\w\s,.!?;:\-\(\)\'\"/]')
df['contains_non_standard_symbols'] = df['CLEAN_DESCRIPTION'].apply(lambda x: bool(non_standard_pattern.search(x)))

# 4. Check for lowercase text
df['is_lowercase'] = df['CLEAN_DESCRIPTION'].apply(lambda x: x.islower())

# 5 Apply the function to remove non-informative words from 'CLEAN_DESCRIPTION'
df['CLEAN_DESCRIPTION'] = df['CLEAN_DESCRIPTION'].apply(remove_non_informative)

# Check again for non-informative words in 'CLEAN_DESCRIPTION'
df['contains_non_informative'] = df['CLEAN_DESCRIPTION'].apply(
    lambda x: any(word.lower() in non_informative_words for word in x.split())
)

# 6 check for tokenization
df['is_tokenized'] = df['TOKENIZED_DESCRIPTION'].apply(lambda x: isinstance(x, list))


# Display the DataFrame to verify the removal
display_df = df[['CLEAN_DESCRIPTION', 'contains_non_informative']]

# Now summarize the checks
summary = {
    'HTML Tags': df['contains_html_tags'].any(),
    'Special Characters': df['contains_special_chars'].any(),
    'Non-standard Symbols/Emojis': df['contains_non_standard_symbols'].any(),
    'Lowercase Text': not df['is_lowercase'].all(),  
    'Tokenization Applied': df['is_tokenized'].all(),
}

summary


{'HTML Tags': False,
 'Special Characters': False,
 'Non-standard Symbols/Emojis': False,
 'Lowercase Text': False,
 'Tokenization Applied': True}

In [175]:

df

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION,CLEAN_DESCRIPTION,TOKENIZED_DESCRIPTION,contains_html_tags,contains_special_chars,contains_non_standard_symbols,is_lowercase,contains_non_informative,is_tokenized
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening conversation doctor parents 10yearol...,"[listening, conversation, doctor, parents, 10y...",False,False,False,True,False,True
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...,"[brother, sister, past, incestuous, relationsh...",False,False,False,True,False,True
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,bus empties students field trip museum natural...,"[bus, empties, students, field, trip, museum, ...",False,False,False,True,False,True
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...,"[help, unemployed, father, make, ends, meet, e...",False,False,False,True,False,True
4,5,The Unrecovered (2007),drama,The films title refers not only to the unreco...,films title refers unrecovered bodies ground z...,"[films, title, refers, unrecovered, bodies, gr...",False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This shortlived NBC live sitcom centered on B...,shortlived nbc live sitcom centered bonino wor...,"[shortlived, nbc, live, sitcom, centered, boni...",False,False,False,True,False,True
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION The siste...,next generation exploitation sisters kapa bay ...,"[next, generation, exploitation, sisters, kapa...",False,False,False,True,False,True
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,Ze bestaan echt is a standup comedy about gro...,ze bestaan echt standup comedy growing facing ...,"[ze, bestaan, echt, standup, comedy, growing, ...",False,False,False,True,False,True
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...,walter vivian live country difficult time keep...,"[walter, vivian, live, country, difficult, tim...",False,False,False,True,False,True
