In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset
def load_data(path="Netflix_dataset's.csv"):
    """Read the titles dataset from a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
            Defaults to ``"Netflix_dataset's.csv"``, resolved relative to
            the current working directory.

    Returns:
        pandas.DataFrame: The parsed contents of the file.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when ``path``
            does not exist.
    """
    # The data is read from disk (not from an in-memory CSV string).
    data = pd.read_csv(path)
    return data

In [4]:
# Preprocess the data
def preprocess_data(df):
    """Build the feature matrix and binary popularity target from raw titles.

    Rows missing any of ``title``, ``genres``, ``releaseYear``,
    ``imdbAverageRating`` or ``imdbNumVotes`` are dropped. A title is labelled
    popular (1) when its ``imdbAverageRating`` is strictly greater than the
    median rating of the retained rows, otherwise 0. The input frame is left
    untouched.

    Args:
        df: DataFrame containing at least the five columns listed above.
            ``genres`` is expected to hold comma-separated genre names;
            non-string values are treated as "no genres".

    Returns:
        tuple: ``(features, target, genre_classes)`` where

        * ``features`` is a DataFrame with ``releaseYear``, ``imdbNumVotes``
          and one 0/1 indicator column per genre, indexed like the retained
          rows of ``df``;
        * ``target`` is an integer Series named ``is_popular``;
        * ``genre_classes`` is ``MultiLabelBinarizer.classes_``, i.e. the
          genre labels in the same order as the indicator columns.
    """
    def split_genres(value):
        # Split on bare commas and strip, so "Drama,Comedy" and "Drama, Comedy"
        # produce the same two labels; empty tokens are discarded.
        if not isinstance(value, str):
            return []
        return [token.strip() for token in value.split(',') if token.strip()]

    # Remove rows with missing critical values
    required = ['title', 'genres', 'releaseYear', 'imdbAverageRating', 'imdbNumVotes']
    clean = df.dropna(subset=required)

    # Define popularity based on IMDb rating (above median = popular).
    # Computed as a standalone Series rather than assigned onto ``clean`` to
    # avoid chained-assignment on the result of ``dropna``.
    median_rating = clean['imdbAverageRating'].median()
    is_popular = (clean['imdbAverageRating'] > median_rating).astype(int).rename('is_popular')

    # Process genres: split and multi-hot encode
    genre_lists = clean['genres'].apply(split_genres)
    mlb = MultiLabelBinarizer()
    genres_encoded = pd.DataFrame(
        mlb.fit_transform(genre_lists),
        columns=mlb.classes_,
        index=clean.index,  # keep row alignment with the numeric columns
    )

    # Combine features
    features = pd.concat([clean[['releaseYear', 'imdbNumVotes']], genres_encoded], axis=1)

    # Defensive: both numeric columns are already NaN-free after the dropna
    # above, so this only matters if the column list changes later.
    features = features.fillna(0)

    return features, is_popular, mlb.classes_

In [5]:
# Train the model
def train_model(X, y, test_size=0.2, n_estimators=100, random_state=42):
    """Fit a random-forest classifier and print hold-out metrics.

    The data is split once into train and test partitions, a
    ``RandomForestClassifier`` is fitted on the train partition, and the
    accuracy plus a classification report for the test partition are printed.

    Args:
        X: Feature DataFrame (its ``columns`` attribute is returned).
        y: Binary target aligned with ``X``; 0 means not popular, 1 popular.
        test_size: Fraction of rows held out for evaluation.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the split and the forest, so runs
            are repeatable.

    Returns:
        tuple: ``(model, feature_columns)`` - the fitted classifier and
        ``X.columns``, the column labels the model was trained on.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize and train Random Forest Classifier
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    # ``labels`` is pinned so the two target names always line up with classes
    # 0 and 1; without it the report raises ValueError when the test fold
    # happens to contain only one class.
    print(classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['Not Popular', 'Popular'],
    ))

    return model, X.columns

In [6]:
# Predict popularity for a new show
def predict_popularity(model, features, genres_list, release_year, imdb_votes, genre_columns):
    """Classify a single title as 'Popular' or 'Not Popular'.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
            If it has a ``classes_`` attribute, that is used to locate the
            probability column for class 1.
        features: Column labels the model was trained on, in training order
            (for example the second value returned by ``train_model``). The
            input row is reindexed to exactly these labels, with 0 for any
            label not otherwise populated. Pass ``None`` to skip alignment.
        genres_list: Genres of the title. Entries that are not in
            ``genre_columns`` are ignored.
        release_year: Value for the ``releaseYear`` feature.
        imdb_votes: Value for the ``imdbNumVotes`` feature.
        genre_columns: Genre labels to encode; any iterable (list, Index or
            NumPy array) is accepted.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is ``'Popular'`` when
        the model predicts class 1 and ``'Not Popular'`` otherwise, and
        ``probability`` is the model's probability for class 1 (0.0 if the
        model has never seen class 1).
    """
    # Normalise to a list so membership tests behave the same for lists,
    # pandas Indexes and NumPy arrays.
    known_genres = list(genre_columns)
    requested = set(genres_list or [])

    # Create input data: numeric fields first, then one 0/1 flag per genre
    row = {'releaseYear': release_year, 'imdbNumVotes': imdb_votes}
    row.update({genre: int(genre in requested) for genre in known_genres})
    input_features = pd.DataFrame([row])

    # Guarantee the same columns, in the same order, as at training time
    if features is not None:
        input_features = input_features.reindex(columns=list(features), fill_value=0)

    # Predict
    prediction = model.predict(input_features)

    # Look the positive class up by label instead of assuming it is column 1
    classes = list(getattr(model, 'classes_', [0, 1]))
    proba_row = model.predict_proba(input_features)[0]
    probability = proba_row[classes.index(1)] if 1 in classes else 0.0

    return 'Popular' if prediction[0] == 1 else 'Not Popular', probability

In [7]:
# Main execution
if __name__ == '__main__':
    # Read the raw table and derive the model inputs from it
    df = load_data()
    X, y, genre_columns = preprocess_data(df)

    # Fit the classifier; train_model prints the evaluation metrics itself
    model, feature_columns = train_model(X, y)

    # Score one hypothetical title
    new_show_genres, new_show_year, new_show_votes = ['Drama', 'Romance'], 2024, 50000
    result, prob = predict_popularity(
        model,
        feature_columns,
        new_show_genres,
        new_show_year,
        new_show_votes,
        genre_columns,
    )
    print(f"\nPrediction for new show (Genres: {new_show_genres}, Year: {new_show_year}, Votes: {new_show_votes}):")
    print(f"Popularity: {result} (Probability of being popular: {prob:.2f})")

Model Accuracy: 0.66

Classification Report:
              precision    recall  f1-score   support

 Not Popular       0.67      0.70      0.68      2110
     Popular       0.65      0.61      0.63      1906

    accuracy                           0.66      4016
   macro avg       0.66      0.66      0.66      4016
weighted avg       0.66      0.66      0.66      4016


Prediction for new show (Genres: ['Drama', 'Romance'], Year: 2024, Votes: 50000):
Popularity: Not Popular (Probability of being popular: 0.15)
