#**Netflix Titles Analysis Dashboard**

##Objective
Develop an interactive data analysis and prediction system to explore and classify titles in the Netflix titles dataset as either Movies or TV Shows, using data preprocessing, visualization, and machine learning, with a focus on ensuring the model generalizes well without overfitting or underfitting.

## 1. Importing Libraries

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

###Uploading CSV file

In [2]:
# Step 1: Upload CSV file
from google.colab import files
uploaded = files.upload()

Saving netflix_titles.csv to netflix_titles.csv


### Read the CSV file

In [3]:
# Read the uploaded file
filename = list(uploaded.keys())[0]
df = pd.read_csv(filename)

## Data Cleaning

In [4]:

# Data cleaning
def clean_data(df):
    # Handling missing values
    df['director'].fillna('Unknown', inplace=True)
    df['cast'].fillna('Unknown', inplace=True)
    df['country'].fillna('Unknown', inplace=True)
    df['date_added'].fillna(df['date_added'].mode()[0], inplace=True)
    df['rating'].fillna(df['rating'].mode()[0], inplace=True)

    # Removing duplicates
    df.drop_duplicates(subset=['show_id'], inplace=True)

    # Cleaning string fields
    df['title'] = df['title'].str.strip()
    df['director'] = df['director'].str.strip()
    df['cast'] = df['cast'].str.strip()
    df['country'] = df['country'].str.strip()
    df['listed_in'] = df['listed_in'].str.strip()

    print("Data cleaned. Missing values handled and duplicates removed.")
    return df

In [5]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Feature Engineering

In [6]:

# Data handling and feature engineering
def feature_engineering(df):
    # Converting date_added to datetime
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

    # Extracting year and month from date_added
    df['year_added'] = df['date_added'].dt.year
    df['month_added'] = df['date_added'].dt.month

    # Calculating content age
    current_year = datetime.now().year
    df['content_age'] = current_year - df['release_year']

    # Converting duration to numeric (for movies: minutes, for TV shows: number of seasons)
    def parse_duration(duration):
        if isinstance(duration, str):
            if 'min' in duration:
                return int(duration.replace(' min', ''))
            elif 'Season' in duration:
                return int(duration.split()[0])
        return np.nan

    df['duration_numeric'] = df['duration'].apply(parse_duration)

    # Counting genres
    df['genre_count'] = df['listed_in'].apply(lambda x: len(x.split(',')))

    print("Feature engineering completed.")
    return df


## Data Visualisation

In [7]:
# Data visualization
def visualize_data(df):
    # Setting plot style
    plt.style.use('seaborn-v0_8')

    # Visualization 1: Distribution of content types
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x='type')
    plt.title('Distribution of Content Types (Movies vs TV Shows)')
    plt.savefig('content_type_distribution.png')
    plt.close()

    # Visualization 2: Top 10 genres
    genres = df['listed_in'].str.split(',', expand=True).stack().str.strip().value_counts().head(10)
    plt.figure(figsize=(10, 6))
    genres.plot(kind='bar')
    plt.title('Top 10 Genres on Netflix')
    plt.savefig('top_genres.png')
    plt.close()

    # Visualization 3: Release year distribution
    plt.figure(figsize=(12, 6))
    sns.histplot(df['release_year'], bins=30, kde=True)
    plt.title('Distribution of Release Years')
    plt.savefig('release_year_distribution.png')
    plt.close()

    print("Visualizations generated and saved.")

## Model Building and Training

In [8]:
# Model building and training
def build_and_train_model(df):
    # Preparing features for classification
    features = ['release_year', 'duration_numeric', 'genre_count', 'content_age']
    df_model = df[df['duration_numeric'].notna()][features + ['type']].copy()

    # Encoding target variable
    le = LabelEncoder()
    df_model['type'] = le.fit_transform(df_model['type'])  # Movie: 1, TV Show: 0

    # Splitting data
    X = df_model[features]
    y = df_model['type']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Training logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Evaluating model
    y_pred = model.predict(X_test)
    print("Model Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

    # Saving model and label encoder
    joblib.dump(model, 'logistic_model.pkl')
    joblib.dump(le, 'label_encoder.pkl')

    print("Model trained and saved.")
    return model, le

In [9]:
# Loading and preprocessing data
df = clean_data(df)
df = feature_engineering(df)

# Generating visualizations
visualize_data(df)

# Building and training model
model, le = build_and_train_model(df)

# Saving processed dataset
df.to_csv('processed_netflix_titles.csv', index=False)
print("Processed dataset saved as 'processed_netflix_titles.csv'.")

Data cleaned. Missing values handled and duplicates removed.
Feature engineering completed.
Visualizations generated and saved.
Model Accuracy: 0.9982964224872232
Classification Report:
               precision    recall  f1-score   support

       Movie       1.00      1.00      1.00      1195
     TV Show       0.99      1.00      1.00       566

    accuracy                           1.00      1761
   macro avg       1.00      1.00      1.00      1761
weighted avg       1.00      1.00      1.00      1761

Model trained and saved.
Processed dataset saved as 'processed_netflix_titles.csv'.
