In [11]:
import pandas as pd


# Load the Disney+ titles dataset from the local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on a machine with the same directory layout.
hotstar_data = pd.read_csv(
    r"C:\Users\Abinaya\OneDrive\Desktop\Netflix-Prime-Hotstar-Dashboard-Power-BI-main\dataset\disney_plus_titles.csv"
)

# Count the missing values in each column to see what needs cleaning.
missing_per_column = hotstar_data.isnull().sum()
print(missing_per_column)

show_id           0
type              0
title             0
director        473
cast            190
country         219
date_added        3
release_year      0
rating            3
duration          0
listed_in         0
description       0
dtype: int64


In [12]:
# 1-3. Free-text columns: a missing value is replaced by the sentinel 'Unknown'.
for column in ('director', 'cast', 'country'):
    hotstar_data[column] = hotstar_data[column].fillna('Unknown')

# 4. Fill missing 'date_added' with the most common date (mode).
#    mode() returns a Series of the most frequent values; [0] takes the first.
hotstar_data_date_added = hotstar_data['date_added'].mode()[0]
hotstar_data['date_added'] = hotstar_data['date_added'].fillna(hotstar_data_date_added)

# 5. Fill missing 'rating' with the most common rating (mode).
hotstar_data_rating = hotstar_data['rating'].mode()[0]
hotstar_data['rating'] = hotstar_data['rating'].fillna(hotstar_data_rating)

# Print statements to verify that the changes have been applied correctly:
# every count below should be 0 after the fills above.
for column in ('director', 'cast', 'country', 'date_added', 'rating'):
    print(f"Filled missing '{column}' values: ", hotstar_data[column].isnull().sum())


Filled missing 'director' values:  0
Filled missing 'cast' values:  0
Filled missing 'country' values:  0
Filled missing 'date_added' values:  0
Filled missing 'rating' values:  0


In [18]:
import pandas as pd
import numpy as np

# Function to convert duration to minutes
def convert_duration_to_minutes(duration, minutes_per_season=125):
    """Convert a duration label such as '91 min' or '2 Seasons' to minutes.

    Parameters
    ----------
    duration : str or missing value
        Text whose first space-separated token is an integer count, e.g.
        '23 min' or '1 Season'. Any missing value recognised by ``pd.isna``
        (NaN, None, pd.NA) is passed through as NaN.
    minutes_per_season : int, default 125
        Number of minutes each season is counted as. The default is the value
        this notebook has always used.

    Returns
    -------
    int or float
        The count in minutes, the season count multiplied by
        ``minutes_per_season`` when the text contains 'Season', or ``np.nan``
        for missing input.

    Raises
    ------
    ValueError
        If the first token of the text is not an integer.
    """
    # pd.isna also catches None and pd.NA, which the substring test below
    # would otherwise choke on with a TypeError.
    if pd.isna(duration):
        return np.nan

    # Tolerate surrounding whitespace so ' 91 min' parses like '91 min'.
    text = str(duration).strip()
    count = int(text.split(' ')[0])

    if 'Season' in text:
        return count * minutes_per_season
    return count

# Convert only while the raw 'duration' column is still present, so that
# re-running this cell after the column has been dropped is a no-op instead of
# a KeyError.
if 'duration' in hotstar_data.columns:
    # Apply the function to the duration column
    hotstar_data['duration_minutes'] = hotstar_data['duration'].apply(convert_duration_to_minutes)

    # Verify the result
    print(hotstar_data[['duration', 'duration_minutes']].head())

    # Drop the original 'duration' column now that 'duration_minutes' replaces
    # it; rebind the name rather than mutating with inplace=True.
    hotstar_data = hotstar_data.drop(columns='duration')

   duration  duration_minutes
0    23 min                23
1    91 min                91
2    23 min                23
3    41 min                41
4  1 Season               125


In [19]:
# Re-check the per-column missing-value counts after cleaning.
# NOTE(review): the recorded output of this cell lists a 'churn' column that is
# only created by a later cell, so the notebook was presumably executed out of
# order — confirm it survives Restart & Run All.
print(hotstar_data.isna().sum())

show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added          0
release_year        0
rating              0
listed_in           0
description         0
churn               0
duration_minutes    0
dtype: int64


In [20]:
def create_churn_column(data, churn_ratings=('TV-PG', 'TV-14', 'TV-13')):
    """Add a binary 'churn' column derived from the 'rating' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'rating' column. It is modified in place: the
        'churn' column is assigned on the object that was passed in.
    churn_ratings : iterable of str, default ('TV-PG', 'TV-14', 'TV-13')
        Ratings that are labelled as churn. The default reproduces the list
        this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, now with 'churn' set to 1 where the
        rating is in ``churn_ratings`` and 0 otherwise.
    """
    # NOTE(review): 'TV-13' in the default list is presumably a typo for
    # 'PG-13' — confirm against the ratings actually present in the data.
    churn_condition = data['rating'].isin(list(churn_ratings))
    data['churn'] = np.where(churn_condition, 1, 0)  # 1 for churn, 0 for no churn
    return data

# Apply the function to the Hotstar data
hotstar_data = create_churn_column(hotstar_data)

In [22]:
# Write the cleaned frame, including the 'churn' column, to an Excel workbook.
# NOTE(review): absolute, machine-specific path.
output_file_path = r"C:\Users\Abinaya\OneDrive\Documents\ABI\BI Lab\Netflix-Prime-Hotstar-Dashboard-Power-BI-main\dataset\updated_hotstar_data.xlsx"
hotstar_data.to_excel(excel_writer=output_file_path, index=False)

print("Churn column saved to Excel file successfully.")


Churn column saved to Excel file successfully.


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
# Feature matrix: 'rating' one-hot encoded (first level dropped) plus the
# numeric 'duration_minutes'. Add more relevant features if needed.
# NOTE(review): 'churn' is computed directly from 'rating' in
# create_churn_column, so using 'rating' as a feature leaks the label. The
# perfect scores recorded for this cell are presumably an artefact of that
# rather than evidence of a useful model — confirm the intended features.
features = pd.get_dummies(
    hotstar_data[['rating', 'duration_minutes']],
    columns=['rating'],
    drop_first=True,
)
labels = hotstar_data['churn']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the model on the training data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_rf = rf_model.predict(X_test)

# Report the confusion matrix followed by the per-class metrics.
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Confusion Matrix:
[[227   0]
 [  0  63]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       227
           1       1.00      1.00      1.00        63

    accuracy                           1.00       290
   macro avg       1.00      1.00      1.00       290
weighted avg       1.00      1.00      1.00       290



In [24]:
# Persist the fitted random forest to disk (path is relative to the working
# directory of the kernel).
model_filename = 'hotstar_rf_model.pkl'
joblib.dump(rf_model, model_filename)
print("Random Forest model saved successfully.")

Random Forest model saved successfully.


In [25]:
# Score every row of the dataset with the fitted model.
# `features` is the full feature matrix the train/test split was drawn from, so
# these predictions include the rows the model was trained on.
predicted_churn = rf_model.predict(features)
hotstar_data['predicted_churn'] = predicted_churn

# Save the updated DataFrame with the predicted churn to a new Excel file
# NOTE(review): absolute, machine-specific path.
output_file_path = r"C:\Users\Abinaya\OneDrive\Documents\ABI\BI Lab\Netflix-Prime-Hotstar-Dashboard-Power-BI-main\dataset\updated_hotstar_data_with_predictions.xlsx"
hotstar_data.to_excel(excel_writer=output_file_path, index=False)

print("Updated DataFrame with predicted churn saved to Excel file successfully.")

Updated DataFrame with predicted churn saved to Excel file successfully.
