# Multi-label processing

As to the samples to be categorized there may be more than one label obtained. So for each sample, the label should be a binary vector indicating whether each possible category exists or not.

In [1]:
import pandas as pd

In [2]:
file_path = "original/netflix_titles.csv"
original_data = pd.read_csv(file_path)

# Select only the columns we need
selected_columns = ['show_id', 'title', 'rating', 'listed_in', 'description']
filtered_data = original_data[selected_columns]

# rename the colume for later use
filtered_data.columns = ['id', 'title', 'rating', 'category', 'description']

print(filtered_data.head())

   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                            category  \
0                                      Documentaries   
1    International TV Shows, TV Dramas, TV Mysteries   
2  Crime TV Shows, International TV Shows, TV Act...   
3                             Docuseries, Reality TV   
4  International TV Shows, Romantic TV Shows, TV ...   

                                         description  
0  As her father nears the end of his life, filmm...  
1  After crossing paths at a party, a Cape Town t...  
2  To protect his family from a powerful drug lor...  
3  Feuds, flirtations and toilet talk go down amo...  
4  In a city of coaching centers known to train I...  


In [3]:
# Apply a lambda function to split the 'category' column into a list of labels, stripping any leading or trailing whitespaces
filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])

# Create a set containing all unique labels from the 'category' column
all_labels = set(label for labels in filtered_data['category'] for label in labels)

# Print all unique labels
print("All label：", all_labels)

# Iterate through each unique label
for label in all_labels:
    # # Create a new binary column for each unique label, indicating the presence (1) or absence (0) of the label in the 'category' list
    filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)

# Drop the original 'category' column, as it has been replaced by binary columns for each unique label
filtered_data.drop('category', axis=1, inplace=True)

# print(filtered_data.head())

All label： {'Children & Family Movies', 'Reality TV', 'TV Mysteries', 'Romantic Movies', 'Stand-Up Comedy & Talk Shows', 'TV Dramas', 'Movies', 'Anime Features', 'TV Shows', 'Thrillers', 'Teen TV Shows', 'TV Thrillers', 'TV Sci-Fi & Fantasy', 'Horror Movies', 'Sports Movies', 'Classic Movies', 'Faith & Spirituality', 'Stand-Up Comedy', 'Classic & Cult TV', 'Dramas', 'Anime Series', 'International Movies', 'TV Horror', 'TV Comedies', 'Romantic TV Shows', 'Korean TV Shows', 'Science & Nature TV', "Kids' TV", 'Documentaries', 'International TV Shows', 'Music & Musicals', 'Crime TV Shows', 'Cult Movies', 'Docuseries', 'Comedies', 'Sci-Fi & Fantasy', 'British TV Shows', 'Spanish-Language TV Shows', 'Action & Adventure', 'Independent Movies', 'LGBTQ Movies', 'TV Action & Adventure'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil

In [4]:
# Save multi-tagged processed datasets for text pre-processing and dataset segmentation
output_path = "Task3preprocessed/netflix.tsv"
filtered_data.to_csv(output_path, index=False) 