In [6]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [9]:
# Read the raw catalogue into a DataFrame, decoding the file as Latin-1.
netflix_titles = pd.read_csv(
    "netflix_titles.csv",
    encoding="latin1",
)

In [11]:
# Column groups routed to the numeric and categorical transformers below.
numerical_features = ['release_year']
categorical_features = ['country', 'rating', 'listed_in']

# Free-text branch: TF-IDF restricted to at most 1000 vocabulary terms.
text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
])

# Numeric branch: standardise the listed numeric column(s).
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Assemble the per-type branches into one preprocessing step.
# 'description' is given as a bare string (not a list) so the vectorizer
# receives a 1-D column of documents rather than a 2-D frame.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
    ('text', text_transformer, 'description'),
])

In [12]:
# PCA reducer; the fractional n_components asks PCA to pick the number of
# components from an explained-variance threshold instead of a fixed count.
# NOTE(review): the preprocessor output is probably a sparse matrix, which a
# fractional n_components may not accept -- confirm before fitting with this.
pca = PCA(
    n_components=0.95,
)

In [13]:
# Random-forest classifier used as the final pipeline step.
# random_state is pinned (same seed as the train/test split) because the forest
# is built with random bootstrapping and feature sampling; without a seed the
# reported accuracy changes from run to run.
classifier = RandomForestClassifier(random_state=42)

In [14]:
# First pipeline variant: preprocessing -> PCA -> classifier.
# NOTE(review): `pipeline` is rebound by a later cell (the TruncatedSVD variant)
# before anything is fitted, so this definition appears to be unused.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('classifier', classifier),
])

In [16]:
# Target: the 'type' column of the catalogue.
y = netflix_titles['type']

# Features: everything except the target, the identifier and the columns
# excluded from modelling.
# NOTE(review): 'listed_in' and 'rating' stay in X and may encode the target
# (e.g. genre labels that mention TV) -- confirm there is no target leakage.
X = netflix_titles.drop(
    ['show_id', 'type', 'title', 'director', 'cast', 'date_added', 'duration'],
    axis='columns',
)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [18]:
from sklearn.decomposition import TruncatedSVD

# Reduce the preprocessor output to 100 components with TruncatedSVD.
# random_state is pinned (same seed as the train/test split) because the
# default randomized solver is stochastic; unseeded, the reduced features and
# the downstream accuracy differ between runs.
svd = TruncatedSVD(n_components=100, random_state=42)  # Adjust the number of components as needed

# Rebind `pipeline` to the variant that is actually fitted:
# preprocessing -> TruncatedSVD -> classifier.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('svd', svd),
                           ('classifier', classifier)])


In [None]:
# Fit every step (preprocessing, dimensionality reduction, classifier) on the
# training split.
pipeline.fit(X_train, y_train)
# Project the test split through the fitted transformer steps only.
# Pipeline.transform is not available when the final step is a classifier with
# no `transform` method (it raises AttributeError), so the classifier is sliced
# off with pipeline[:-1]; the slice reuses the already-fitted steps.
X_test_transformed = pipeline[:-1].transform(X_test)

In [22]:
# Score the fitted pipeline on the held-out split and report the result.
accuracy = pipeline.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9358683314415437
