# Multi-label processing

As to the samples to be categorized there may be more than one label obtained. So for each sample, the label should be a binary vector indicating whether each possible category exists or not.

In [1]:
import pandas as pd

In [2]:
file_path = "original/netflix_titles.csv"
original_data = pd.read_csv(file_path)

selected_columns = ['show_id', 'title', 'rating', 'listed_in', 'description']
filtered_data = original_data[selected_columns]
# rename the colume
filtered_data.columns = ['id', 'title', 'rating', 'category', 'description']

print(filtered_data.head())

   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                            category  \
0                                      Documentaries   
1    International TV Shows, TV Dramas, TV Mysteries   
2  Crime TV Shows, International TV Shows, TV Act...   
3                             Docuseries, Reality TV   
4  International TV Shows, Romantic TV Shows, TV ...   

                                         description  
0  As her father nears the end of his life, filmm...  
1  After crossing paths at a party, a Cape Town t...  
2  To protect his family from a powerful drug lor...  
3  Feuds, flirtations and toilet talk go down amo...  
4  In a city of coaching centers known to train I...  


In [3]:
filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])

all_labels = set(label for labels in filtered_data['category'] for label in labels)

print("All label：", all_labels)

for label in all_labels:
    filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)

filtered_data.drop('category', axis=1, inplace=True)

# print(filtered_data.head())


All label： {'Independent Movies', 'Anime Features', 'TV Dramas', 'Stand-Up Comedy', 'Romantic Movies', 'Anime Series', 'Action & Adventure', 'Movies', 'International Movies', 'Stand-Up Comedy & Talk Shows', 'Sports Movies', 'LGBTQ Movies', 'TV Horror', 'TV Thrillers', 'Reality TV', 'Sci-Fi & Fantasy', 'Crime TV Shows', 'Docuseries', 'TV Mysteries', 'British TV Shows', 'Dramas', "Kids' TV", 'Korean TV Shows', 'Romantic TV Shows', 'TV Comedies', 'Teen TV Shows', 'Cult Movies', 'Science & Nature TV', 'Documentaries', 'Music & Musicals', 'Thrillers', 'TV Sci-Fi & Fantasy', 'International TV Shows', 'TV Shows', 'TV Action & Adventure', 'Horror Movies', 'Spanish-Language TV Shows', 'Faith & Spirituality', 'Classic & Cult TV', 'Comedies', 'Classic Movies', 'Children & Family Movies'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil

In [4]:
output_path = "preprocessed/netflix.csv"
filtered_data.to_csv(output_path, index=False) # save

# Text preprocessing for 'description'

In [5]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [6]:
def preprocess_text(text):

    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)

In [7]:
file_path = "preprocessed/netflix.csv"
preprocessed_data = pd.read_csv(file_path)

preprocessed_data['description'] = preprocessed_data['description'].apply(preprocess_text)

In [8]:
output_path = "preprocessed/netflix_processed.csv"
preprocessed_data.to_csv(output_path, index=False) # save

print(preprocessed_data.head())

   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                         description  Independent Movies  \
0  father near end life filmmak kirsten johnson s...                   0   
1  cross path parti cape town teen set prove whet...                   0   
2  protect famili power drug lord skill thief meh...                   0   
3  feud flirtat toilet talk go among incarcer wom...                   0   
4  citi coach center known train india finest col...                   0   

   Anime Features  TV Dramas  Stand-Up Comedy  Romantic Movies  Anime Series  \
0               0          0                0                0             0   
1               0          1                0                0             0   
2               0          0                0                0      

# Dataset split

Split ratio: 7: 1.5: 1.5

In [9]:
from sklearn.model_selection import train_test_split

file_path = "preprocessed/netflix_processed.csv"
preprocessed_data = pd.read_csv(file_path)

In [10]:
shuffled_data = preprocessed_data.sample(frac=1, random_state=42)

train_data, temp_data = train_test_split(shuffled_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [11]:
train_path = "preprocessed/netflix_train.csv"
val_path = "preprocessed/netflix_val.csv"
test_path = "preprocessed/netflix_test.csv"

train_data.to_csv(train_path, index=False)
val_data.to_csv(val_path, index=False)
test_data.to_csv(test_path, index=False)

print(f"train sample：{len(train_data)}")
print(f"val sample：{len(val_data)}")
print(f"test sample：{len(test_data)}")

train sample：6164
val sample：1321
test sample：1322
