In [None]:
# File: 3_text_preprocessing.ipynb
import pandas as pd

# 1. Load the cleaned data
df = pd.read_csv('cleaned_netflix_data.csv')

# 2. Create the "Metadata Soup"
# Combining these columns so the model has a broad range of features to look at.
soup_columns = ['type', 'director', 'cast', 'country', 'listed_in', 'description']

# `str + NaN` evaluates to NaN, so a single missing field (blank CSV cells are
# loaded as NaN) would wipe out the whole row's text. Replace missing values
# with empty strings before joining; rows without gaps are unaffected.
soup_parts = df[soup_columns].fillna('').astype(str)
df['total_content'] = soup_parts[soup_columns[0]].str.cat(
    soup_parts[soup_columns[1:]], sep=' '
)

# 3. Making everything lowercase to ensure 'Drama' and 'drama' are treated as the same word.
df['total_content'] = df['total_content'].str.lower()

# 4. Saving this updated version
df.to_csv('processed_netflix_data.csv', index=False)

print("Text combined and lowercased successfully.")

Text combined and lowercased successfully using only Pandas/Python.


In [None]:
#Updated code but check before running
# Notebook: 3_Feature_Engineering_Final.ipynb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# 1. Load the data
# NOTE(review): the heatmap below also uses a 'cluster' column. Confirm this is
# the right input file (the original note asked to check it against
# cleaned_netflix_data.csv before use).
df = pd.read_csv('processed_netflix_data.csv')

# 2. FEATURE ENGINEERING
# Requirement: Create 'Content Age'
# Read the year from the clock instead of hard-coding it, so the value does not
# go stale and cannot drift out of sync with other cells.
current_year = datetime.now().year
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
df['content_age'] = current_year - df['release_year']

# Requirement: Create 'Genre Count'
# Number of genres = number of commas in 'listed_in' + 1.
# A missing 'listed_in' counts as 0 genres (not 1, which is what splitting the
# string "nan" would give).
df['genre_count'] = (
    df['listed_in']
    .astype('string')
    .str.count(',')
    .add(1)
    .fillna(0)
    .astype(int)
)

# 3. VISUALIZATION & INTERPRETATION
# Requirement: Create heatmaps to show correlations
# Only correlate the numerical columns that are actually present, so a missing
# 'cluster' column degrades the plot instead of raising a KeyError.
candidate_cols = ['release_year', 'content_age', 'genre_count', 'cluster']
numerical_cols = [col for col in candidate_cols if col in df.columns]
skipped_cols = [col for col in candidate_cols if col not in df.columns]
if skipped_cols:
    print(f"Skipping columns missing from the data: {skipped_cols}")

# Note: content_age is a constant minus release_year, so that pair always
# shows a correlation of -1.
correlation_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=15)
plt.show()

# 4. Save updated data
df.to_csv('final_engineered_netflix_data.csv', index=False)
print("New features 'content_age' and 'genre_count' added and saved.")

In [None]:
#This code can be added in 2nd code cell with updated column.
# Feature Engineering - Adding new numerical features
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
# Take the reference year from the clock rather than a hard-coded literal, so
# this cell cannot disagree with other cells that compute the same feature.
df['content_age'] = pd.Timestamp.now().year - df['release_year']

# Genre Count: number of comma-separated genres in the 'listed_in' column.
# Missing values count as 0 genres; calling .split() on a NaN would raise
# AttributeError.
df['genre_count'] = (
    df['listed_in']
    .astype('string')
    .str.count(',')
    .add(1)
    .fillna(0)
    .astype(int)
)

print(df[['title', 'content_age', 'genre_count']].head())

In [None]:
#Same as above code but this uses re(for cleaning) and nltk(for stopwords).
# File: 3_text_preprocessing.py
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Ensure you have stopwords downloaded
nltk.download('stopwords')
# The stopwords corpus is addressed by plain language name; 'eye-english' is
# not a valid file id and makes stopwords.words() fail.
stop_words = set(stopwords.words('english'))

# 1. Load the cleaned data from Step 2
df = pd.read_csv('cleaned_netflix_data.csv')

# 2. Define a function to clean the text
def clean_text(text):
    """Normalise one text value for vectorization.

    The text is lowercased, every character other than ASCII letters, digits
    and whitespace is removed (this also strips accented/non-ASCII letters),
    and words found in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw string. Non-string values (e.g. NaN or None from a
            missing cell) are treated as empty text.

    Returns:
        The remaining words joined by single spaces; "" when nothing is left.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation/special characters
    kept_words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
    return " ".join(kept_words)

# 3. Create the "Metadata Soup"
# We combine the important columns so the model sees everything related to the content
soup_columns = ['director', 'cast', 'country', 'listed_in', 'description']

# `str + NaN` evaluates to NaN, so one missing field would blank out the whole
# row and hand a float NaN to clean_text. Replace missing values with empty
# strings before joining; rows without gaps are unaffected.
soup_parts = df[soup_columns].fillna('').astype(str)
df['clustering_filter'] = soup_parts[soup_columns[0]].str.cat(
    soup_parts[soup_columns[1:]], sep=' '
)

# 4. Apply the cleaning function
df['cleaned_filter'] = df['clustering_filter'].apply(clean_text)

print("--- Example of Processed Text (Metadata Soup) ---")
# Guard the preview so an empty input file does not raise on .iloc[0].
if not df.empty:
    print(df['cleaned_filter'].iloc[0][:200] + "...")

# 5. Save for the Vectorization step
# NOTE: this is the same output path the pandas-only variant of this step
# writes to, so whichever cell runs last determines the file's columns.
df.to_csv('processed_netflix_data.csv', index=False)