In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Simplify multiple labels into a single label

Since in the original dataset there were multiple category labels for each movie. We decided to keep the first category of each movie, i.e., the main category, as the categorization label.

At the same time, there are categories that are not related to the content of the work, such as International TV Shows|International Movies|Independent Movies, so we removed these categories from the multiple categories.

In [10]:
df = pd.read_csv('original/netflix_titles.csv')

df['listed_in'] = df['listed_in'].str.replace('International TV Shows|International Movies|Independent Movies', '', regex=True)
df['listed_in'] = df['listed_in'].apply(lambda x: x[1:] if isinstance(x, str) and x.startswith(',') else x)
df['listed_in'] = df['listed_in'].str.split(',').str[0]

df.to_csv('original/netflix_titles_single_label.csv', index=False)

## Divided the original dataset

### The division ratio is 7:1.5:1.5

In [11]:
df = pd.read_csv('original/netflix_titles_single_label.csv')

selected_columns = ['show_id', 'listed_in', 'title', 'description','rating']
df_selected = df[selected_columns]

df_selected.columns = ['id', 'category', 'title', 'description','rating']

df_train, df_temp = train_test_split(df_selected, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

Ensure that the categories in the training set, validation set and test set are consistent.
Keep only the categories that are common to the training set, validation set and test set.

In [12]:
unique_categories_train = set(df_train['category'].unique())
unique_categories_val = set(df_val['category'].unique())
unique_categories_test = set(df_test['category'].unique())

common_categories = unique_categories_train.intersection(unique_categories_val, unique_categories_test)

df_train = df_train[df_train['category'].isin(common_categories)]
df_val = df_val[df_val['category'].isin(common_categories)]
df_test = df_test[df_test['category'].isin(common_categories)]

df_train.to_csv('preprocessed/netflix_train.csv', index=False)
df_val.to_csv('preprocessed/netflix_val.csv', index=False)
df_test.to_csv('preprocessed/netflix_test.csv', index=False)

In [13]:
df_train = pd.read_csv('preprocessed/netflix_train.csv')
df_val = pd.read_csv('preprocessed/netflix_val.csv')
df_test = pd.read_csv('preprocessed/netflix_test.csv')

num_samples_train = len(df_train)
num_categories_train = len(df_train['category'].unique())

num_samples_val = len(df_val)
num_categories_val = len(df_val['category'].unique())

num_samples_test = len(df_test)
num_categories_test = len(df_test['category'].unique())

print("Training Set:")
print(f"Number of Samples: {num_samples_train}")
print(f"Number of Categories: {num_categories_train}")

print("\nValidation Set:")
print(f"Number of Samples: {num_samples_val}")
print(f"Number of Categories: {num_categories_val}")

print("\nTest Set:")
print(f"Number of Samples: {num_samples_test}")
print(f"Number of Categories: {num_categories_test}")

Training Set:
Number of Samples: 6144
Number of Categories: 42

Validation Set:
Number of Samples: 1319
Number of Categories: 42

Test Set:
Number of Samples: 1321
Number of Categories: 42


## Simple NLP Preprocessing for each part

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Croya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Currently the preprocessing the the simple version. 
! More operations can be implemented !

In [16]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):

    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation))

    words = word_tokenize(text)

    words = [word for word in words if word not in stop_words]

    words = [ps.stem(word) for word in words]

    return ' '.join(words)

In [17]:
df_train['description'] = df_train['description'].apply(preprocess_text)
df_val['description'] = df_val['description'].apply(preprocess_text)
df_test['description'] = df_test['description'].apply(preprocess_text)

df_train['category'] = df_train['category'].str.strip()
df_val['category'] = df_val['category'].str.strip()
df_test['category'] = df_test['category'].str.strip()

df_train.to_csv('preprocessed/netflix_train.csv', index=False)
df_val.to_csv('preprocessed/netflix_val.csv', index=False)
df_test.to_csv('preprocessed/netflix_test.csv', index=False)