In [1]:
import pandas as pd

file_path = "original/netflix_titles.csv"
original_data = pd.read_csv(file_path)

selected_columns = ['show_id', 'title', 'rating', 'listed_in', 'description']
filtered_data = original_data[selected_columns]

filtered_data.columns = ['id', 'title', 'rating', 'category', 'description']

print(filtered_data.head())

   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                            category  \
0                                      Documentaries   
1    International TV Shows, TV Dramas, TV Mysteries   
2  Crime TV Shows, International TV Shows, TV Act...   
3                             Docuseries, Reality TV   
4  International TV Shows, Romantic TV Shows, TV ...   

                                         description  
0  As her father nears the end of his life, filmm...  
1  After crossing paths at a party, a Cape Town t...  
2  To protect his family from a powerful drug lor...  
3  Feuds, flirtations and toilet talk go down amo...  
4  In a city of coaching centers known to train I...  


In [2]:

# 去除标签列中的空格
filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])

# 获取所有唯一的标签
all_labels = set(label for labels in filtered_data['category'] for label in labels)

# 创建二进制列
for label in all_labels:
    filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)

# 删除原始的 category 列
filtered_data.drop('category', axis=1, inplace=True)



# 显示保存后的数据集的前几行
print(filtered_data.head())


   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                         description  Crime TV Shows  Movies  \
0  As her father nears the end of his life, filmm...               0       0   
1  After crossing paths at a party, a Cape Town t...               0       0   
2  To protect his family from a powerful drug lor...               1       0   
3  Feuds, flirtations and toilet talk go down amo...               0       0   
4  In a city of coaching centers known to train I...               0       0   

   TV Thrillers  Classic & Cult TV  TV Shows  Stand-Up Comedy  ...  \
0             0                  0         0                0  ...   
1             0                  0         0                0  ...   
2             0                  0         0                0  ...   
3   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['category'] = filtered_data['category'].apply(lambda x: [label.strip() for label in x.split(',')])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data[label] = filtered_data['category'].apply(lambda x: 1 if label in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fil

In [3]:
output_path = "preprocessed/netflix.csv"
filtered_data.to_csv(output_path, index=False)

In [4]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [5]:
def preprocess_text(text):

    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)


In [6]:
file_path = "preprocessed/netflix.csv"
preprocessed_data = pd.read_csv(file_path)

preprocessed_data['description'] = preprocessed_data['description'].apply(preprocess_text)

In [7]:
output_path = "preprocessed/netflix_processed.csv"
preprocessed_data.to_csv(output_path, index=False)

print(preprocessed_data.head())

   id                  title rating  \
0  s1   Dick Johnson Is Dead  PG-13   
1  s2          Blood & Water  TV-MA   
2  s3              Ganglands  TV-MA   
3  s4  Jailbirds New Orleans  TV-MA   
4  s5           Kota Factory  TV-MA   

                                         description  Crime TV Shows  Movies  \
0  father near end life filmmak kirsten johnson s...               0       0   
1  cross path parti cape town teen set prove whet...               0       0   
2  protect famili power drug lord skill thief meh...               1       0   
3  feud flirtat toilet talk go among incarcer wom...               0       0   
4  citi coach center known train india finest col...               0       0   

   TV Thrillers  Classic & Cult TV  TV Shows  Stand-Up Comedy  ...  \
0             0                  0         0                0  ...   
1             0                  0         0                0  ...   
2             0                  0         0                0  ...   
3   

In [8]:
from sklearn.model_selection import train_test_split

file_path = "preprocessed/netflix_processed.csv"
preprocessed_data = pd.read_csv(file_path)

In [9]:
shuffled_data = preprocessed_data.sample(frac=1, random_state=42)

train_data, temp_data = train_test_split(shuffled_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [10]:
train_path = "preprocessed/netflix_train.csv"
val_path = "preprocessed/netflix_val.csv"
test_path = "preprocessed/netflix_test.csv"

train_data.to_csv(train_path, index=False)
val_data.to_csv(val_path, index=False)
test_data.to_csv(test_path, index=False)

# 输出文件中样本数目
print(f"train sample：{len(train_data)}")
print(f"val sample：{len(val_data)}")
print(f"test sample：{len(test_data)}")

train sample：6164
val sample：1321
test sample：1322
