# ML Task 2: Outlier Detection and Handling: Identifying Anomalies in Real-World Datasets

### CASTILLO, ANJELICA M.

#### Import Libraries

In [None]:
from pathlib import Path

import pandas as pd

# Vehicle counts per road, one row per road and year. Every row must follow the
# 13-field layout listed in `columns` below.

# Data extracted from the 2023 table (colored)
data_2023 = [
    ["C:1", "RECTO", 24747, 7269, 727, 1678, 593, 666, 115, 49406, 2190, 87391, 2023],
    ["C:2", "MENDOZA", 57755, 11, 81, 3102, 240, 6706, 1218, 74200, 1922, 145235, 2023],
    ["C:2", "PRES. QUIRINO AVE.", 73167, 727, 64, 3837, 364, 7606, 1251, 88445, 2420, 177881, 2023],
    ["C:3", "ARANETA AVE.", 72782, 45, 15, 2418, 248, 3329, 425, 51266, 1694, 132222, 2023],
    ["C:4", "EDSA", 216934, 1740, 2504, 15001, 3824, 2533, 110, 164673, 23, 407342, 2023],
    ["C:5", "C.P. GARCIA / KATIPUNAN AVE. / TANDANG SORA", 127042, 617, 64, 2674, 426, 9654, 702, 89534, 7982, 238696, 2023],
]

# Data extracted from the 2022 table (gray)
data_2022 = [
    # NOTE(review): this row was transcribed with only 12 of the 13 fields, so pandas
    # padded it on the right: MC, TRICYCLE and TOTAL received the wrong numbers and
    # Year was left empty. The missing MC count (42993) is reconstructed as TOTAL minus
    # the other eight vehicle counts -- confirm against the source table.
    ["C:1", "RECTO", 27587, 6749, 756, 3554, 90, 667, 1847, 42993, 1854, 86097, 2022],
    ["C:2", "MENDOZA", 83148, 53, 81, 5384, 155, 3788, 112, 80902, 1900, 175243, 2022],
    ["C:2", "PRES. QUIRINO AVE.", 68697, 818, 209, 3198, 153, 7037, 173, 70437, 1973, 147617, 2022],
    ["C:3", "ARANETA AVE.", 74142, 14, 16, 2502, 117, 3150, 507, 44177, 1570, 112390, 2022],
    ["C:4", "EDSA", 202768, 1803, 2475, 11868, 5888, 2933, 119, 151847, 20, 385319, 2022],
    ["C:5", "C.P. GARCIA / KATIPUNAN AVE. / TANDANG SORA", 130465, 2702, 74, 2980, 798, 9284, 719, 97363, 8325, 315916, 2022],
]

# Define column names
columns = ["Code", "Road Name", "CAR", "PUJ", "UV", "TAXI", "PUB", "TRUCK", "TRAILER", "MC", "TRICYCLE", "TOTAL", "Year"]

# pandas silently pads short rows instead of failing, so check the layout explicitly
# before building the frame.
traffic_rows = data_2023 + data_2022
malformed_rows = [row[:2] for row in traffic_rows if len(row) != len(columns)]
if malformed_rows:
    raise ValueError(
        f"Rows that do not have {len(columns)} fields: {malformed_rows}"
    )

# Create DataFrame
df_traffic = pd.DataFrame(traffic_rows, columns=columns)

# Save to CSV
# "/Downloads/..." pointed at a directory in the filesystem root, which normally does
# not exist; write into the current user's Downloads folder and create it if needed.
# NOTE(review): assumes the user's Downloads folder was the intended target.
csv_path = Path.home() / "Downloads" / "AADT_Traffic_2022_2023.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
df_traffic.to_csv(csv_path, index=False)

In [None]:
import pandas as pd  # For data manipulation
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns


from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans, DBSCAN
import plotly.express as px


from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PowerTransformer


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#### Load the dataset

In [None]:
# Load the Netflix titles file into `df`, the frame the later cells in this notebook use.
# NOTE(review): absolute path on the author's machine -- adjust before running elsewhere.
df = pd.read_csv(r"C:\Users\Anjel\Downloads\netflix_titles.csv")
# Print column names, non-null counts and dtypes to see which fields have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Numeric inputs for the detectors: release_year always, plus the first run of digits
# found in `duration` when that column is present.
# NOTE(review): the recorded output shows TV-show durations such as "1 Season", so the
# extracted number may mix units across title types -- confirm this is intended.
numeric_columns = ["release_year"]
if 'duration' in df.columns:
    # expand=False yields a Series holding the single capture group.
    df['duration_numeric'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    numeric_columns.extend(["duration_numeric"])

# 1. Detect outliers using statistical methods
def detect_outliers_stat(df, columns, z_threshold=3, iqr_multiplier=1.5, lower_q=0.25, upper_q=0.75):
    """Flag rows that are extreme under the z-score rule and under the IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``. Two boolean columns, ``zscore_outlier`` and
        ``iqr_outlier``, are written onto it in place.
    columns : list of str
        Numeric columns to test; a row is flagged when any of them is extreme.
    z_threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the inter-quantile range.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles that delimit the inter-quantile range.

    Returns
    -------
    dict
        ``{'zscore': DataFrame, 'iqr': DataFrame}`` holding copies of the flagged rows.
    """
    outliers = {}

    # Z-score method.
    # nan_policy='omit' keeps one missing value from turning a whole column's z-scores
    # into NaN (the default 'propagate' does), which would stop that column from ever
    # flagging a row. NaN entries themselves compare False and are never flagged.
    values = df[columns].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
    df['zscore_outlier'] = (z_scores > z_threshold).any(axis=1)
    # .copy() so callers can add columns to the result without pandas warning about
    # writing to a slice of df.
    outliers['zscore'] = df[df['zscore_outlier']].copy()

    # IQR method
    lower = df[columns].quantile(lower_q)
    upper = df[columns].quantile(upper_q)
    spread = upper - lower
    below_fence = df[columns] < (lower - iqr_multiplier * spread)
    above_fence = df[columns] > (upper + iqr_multiplier * spread)
    df['iqr_outlier'] = (below_fence | above_fence).any(axis=1)
    outliers['iqr'] = df[df['iqr_outlier']].copy()

    return outliers

# Run both statistical detectors; the call also writes the zscore_outlier and
# iqr_outlier flag columns onto df.
outliers_stat = detect_outliers_stat(df=df, columns=numeric_columns)

# 2. Detect outliers using machine learning methods
def detect_outliers_ml(df, columns):
    """Flag anomalous rows with Isolation Forest, Local Outlier Factor and XGBoost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``; it is not modified.
    columns : list of str
        Numeric feature columns. Rows with a missing value in any of them are dropped
        before fitting.

    Returns
    -------
    dict
        Keys ``'iso_forest'``, ``'lof'`` and ``'xgb'``. Each value is a copy of the rows
        flagged by that method, carrying the feature columns plus the flag columns
        computed up to that step (the ``'xgb'`` frame also carries the PCA components).
    """
    results = {}

    # Drop rows with a missing feature.
    features = df[columns].dropna()
    # Every model below is fitted on `features` only. Flags and components accumulate
    # in `scored`, so a detector's output is never fed back in as an input -- otherwise
    # the XGBoost classifier would be handed its own target as a feature.
    scored = features.copy()

    # Isolation Forest (-1 marks an outlier)
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scored['iso_forest_outlier'] = iso_forest.fit_predict(features)
    results['iso_forest'] = scored[scored['iso_forest_outlier'] == -1].copy()

    # Local Outlier Factor (-1 marks an outlier)
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    scored['lof_outlier'] = lof.fit_predict(features)
    results['lof'] = scored[scored['lof_outlier'] == -1].copy()

    # PCA components; capped so a single-feature input does not request 2 components.
    pca = PCA(n_components=min(2, features.shape[1]))
    pca_result = pca.fit_transform(features)
    for component in range(pca_result.shape[1]):
        scored[f'pca_component_{component + 1}'] = pca_result[:, component]

    # XGBoost classifier trained to reproduce the Isolation Forest flags from the raw
    # features, then applied to the same rows.
    iso_labels = (scored['iso_forest_outlier'] == -1).astype(int)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(features, iso_labels)
    scored['xgb_outlier'] = xgb_model.predict(features)
    results['xgb'] = scored[scored['xgb_outlier'] == 1].copy()

    return results

# Run the model-based detectors on the same numeric columns.
outliers_ml = detect_outliers_ml(df=df, columns=numeric_columns)

# One entry per detection method: (heading to print, chart label, flagged rows).
outlier_report = [
    ("Z-Score Outliers:", "Z-Score", outliers_stat['zscore']),
    ("\nIQR Outliers:", "IQR", outliers_stat['iqr']),
    ("\nIsolation Forest Outliers:", "Isolation Forest", outliers_ml['iso_forest']),
    ("\nLOF Outliers:", "LOF", outliers_ml['lof']),
    ("\nXGBoost Outliers:", "XGBoost", outliers_ml['xgb']),
]

# Print outliers only
for heading, _, flagged_rows in outlier_report:
    print(heading)
    print(flagged_rows)

# Visualization of Outlier Counts
outlier_counts = {label: len(flagged_rows) for _, label, flagged_rows in outlier_report}

fig = px.bar(
    x=outlier_counts.keys(),
    y=outlier_counts.values(),
    title="Outlier Counts per Detection Method",
    labels={'x': "Method", 'y': "Number of Outliers"},
)
fig.show()


Z-Score Outliers:
     show_id   type                                title  \
41       s42  Movie                                 Jaws   
42       s43  Movie                               Jaws 2   
43       s44  Movie                               Jaws 3   
44       s45  Movie                    Jaws: The Revenge   
131     s132  Movie          Blade Runner: The Final Cut   
...      ...    ...                                  ...   
8739   s8740  Movie   Why We Fight: The Battle of Russia   
8745   s8746  Movie  Willy Wonka & the Chocolate Factory   
8748   s8749  Movie                 Winter of Our Dreams   
8763   s8764  Movie      WWII: Report from the Aleutians   
8792   s8793  Movie                          Young Tiger   

                         director  \
41               Steven Spielberg   
42                 Jeannot Szwarc   
43                      Joe Alves   
44                 Joseph Sargent   
131                  Ridley Scott   
...                           ...   
87

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Numeric inputs for the detectors: release_year always, plus the first run of digits
# found in `duration` when that column is present.
# NOTE(review): the recorded output shows TV-show durations such as "1 Season", so the
# extracted number may mix units across title types -- confirm this is intended.
numeric_columns = ["release_year"]
if 'duration' in df.columns:
    # expand=False yields a Series holding the single capture group.
    df['duration_numeric'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    numeric_columns.extend(["duration_numeric"])

# 1. Detect outliers using statistical methods
def detect_outliers_stat(df, columns, z_threshold=3, iqr_multiplier=1.5, lower_q=0.25, upper_q=0.75):
    """Flag rows that are extreme under the z-score rule and under the IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``. Two boolean columns, ``zscore_outlier`` and
        ``iqr_outlier``, are written onto it in place.
    columns : list of str
        Numeric columns to test; a row is flagged when any of them is extreme.
    z_threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the inter-quantile range.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles that delimit the inter-quantile range.

    Returns
    -------
    dict
        ``{'zscore': DataFrame, 'iqr': DataFrame}`` holding copies of the flagged rows.
    """
    outliers = {}

    # Z-score method.
    # nan_policy='omit' keeps one missing value from turning a whole column's z-scores
    # into NaN (the default 'propagate' does), which would stop that column from ever
    # flagging a row. NaN entries themselves compare False and are never flagged.
    values = df[columns].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
    df['zscore_outlier'] = (z_scores > z_threshold).any(axis=1)
    # .copy() so callers can add columns to the result without pandas warning about
    # writing to a slice of df.
    outliers['zscore'] = df[df['zscore_outlier']].copy()

    # IQR method
    lower = df[columns].quantile(lower_q)
    upper = df[columns].quantile(upper_q)
    spread = upper - lower
    below_fence = df[columns] < (lower - iqr_multiplier * spread)
    above_fence = df[columns] > (upper + iqr_multiplier * spread)
    df['iqr_outlier'] = (below_fence | above_fence).any(axis=1)
    outliers['iqr'] = df[df['iqr_outlier']].copy()

    return outliers

# Run both statistical detectors; the call also writes the zscore_outlier and
# iqr_outlier flag columns onto df.
outliers_stat = detect_outliers_stat(df=df, columns=numeric_columns)

# 2. Detect outliers using machine learning methods
def detect_outliers_ml(df, columns):
    """Flag anomalous rows with Isolation Forest, Local Outlier Factor and XGBoost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``; it is not modified.
    columns : list of str
        Numeric feature columns. Rows with a missing value in any of them are dropped
        before fitting.

    Returns
    -------
    dict
        Keys ``'iso_forest'``, ``'lof'`` and ``'xgb'``. Each value is a copy of the rows
        flagged by that method, carrying the feature columns plus the flag columns
        computed up to that step (the ``'xgb'`` frame also carries the PCA components).
    """
    results = {}

    # Drop rows with a missing feature.
    features = df[columns].dropna()
    # Every model below is fitted on `features` only. Flags and components accumulate
    # in `scored`, so a detector's output is never fed back in as an input -- otherwise
    # the XGBoost classifier would be handed its own target as a feature.
    scored = features.copy()

    # Isolation Forest (-1 marks an outlier)
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scored['iso_forest_outlier'] = iso_forest.fit_predict(features)
    results['iso_forest'] = scored[scored['iso_forest_outlier'] == -1].copy()

    # Local Outlier Factor (-1 marks an outlier)
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    scored['lof_outlier'] = lof.fit_predict(features)
    results['lof'] = scored[scored['lof_outlier'] == -1].copy()

    # PCA components; capped so a single-feature input does not request 2 components.
    pca = PCA(n_components=min(2, features.shape[1]))
    pca_result = pca.fit_transform(features)
    for component in range(pca_result.shape[1]):
        scored[f'pca_component_{component + 1}'] = pca_result[:, component]

    # XGBoost classifier trained to reproduce the Isolation Forest flags from the raw
    # features, then applied to the same rows.
    iso_labels = (scored['iso_forest_outlier'] == -1).astype(int)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(features, iso_labels)
    scored['xgb_outlier'] = xgb_model.predict(features)
    results['xgb'] = scored[scored['xgb_outlier'] == 1].copy()

    return results

# Run the model-based detectors on the same numeric columns.
outliers_ml = detect_outliers_ml(df=df, columns=numeric_columns)

# One entry per detection method: (heading to print, chart label, flagged rows).
outlier_report = [
    ("Z-Score Outliers:", "Z-Score", outliers_stat['zscore']),
    ("\nIQR Outliers:", "IQR", outliers_stat['iqr']),
    ("\nIsolation Forest Outliers:", "Isolation Forest", outliers_ml['iso_forest']),
    ("\nLOF Outliers:", "LOF", outliers_ml['lof']),
    ("\nXGBoost Outliers:", "XGBoost", outliers_ml['xgb']),
]

# Print outliers only
for heading, _, flagged_rows in outlier_report:
    print(heading)
    print(flagged_rows)

# Visualization of Outlier Counts
outlier_counts = {label: len(flagged_rows) for _, label, flagged_rows in outlier_report}

fig = px.bar(
    x=outlier_counts.keys(),
    y=outlier_counts.values(),
    title="Outlier Counts per Detection Method",
    labels={'x': "Method", 'y': "Number of Outliers"},
)
fig.show()

# Visualization for each detection method: one scatter of the flagged rows per method,
# model-based detectors first, then the statistical ones.
for detector_results in (outliers_ml, outliers_stat):
    for method, outliers in detector_results.items():
        if outliers.empty:
            continue
        fig = px.scatter(
            outliers,
            x="release_year",
            y="duration_numeric",
            title=f"Outliers detected by {method}",
            labels={'release_year': "Release Year", 'duration_numeric': "Duration"},
        )
        fig.show()

Z-Score Outliers:
     show_id   type                                title  \
41       s42  Movie                                 Jaws   
42       s43  Movie                               Jaws 2   
43       s44  Movie                               Jaws 3   
44       s45  Movie                    Jaws: The Revenge   
131     s132  Movie          Blade Runner: The Final Cut   
...      ...    ...                                  ...   
8739   s8740  Movie   Why We Fight: The Battle of Russia   
8745   s8746  Movie  Willy Wonka & the Chocolate Factory   
8748   s8749  Movie                 Winter of Our Dreams   
8763   s8764  Movie      WWII: Report from the Aleutians   
8792   s8793  Movie                          Young Tiger   

                         director  \
41               Steven Spielberg   
42                 Jeannot Szwarc   
43                      Joe Alves   
44                 Joseph Sargent   
131                  Ridley Scott   
...                           ...   
87

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Reload the Netflix titles file so this cell starts from an unmodified frame.
# NOTE(review): absolute path on the author's machine -- adjust before running elsewhere.
df = pd.read_csv(r"C:\Users\Anjel\Downloads\netflix_titles.csv")

# Numeric inputs for the detectors: release_year always, plus the first run of digits
# found in `duration` when that column is present.
# NOTE(review): the recorded output shows TV-show durations such as "1 Season", so the
# extracted number may mix units across title types -- confirm this is intended.
numeric_columns = ["release_year"]
if 'duration' in df.columns:
    # expand=False yields a Series holding the single capture group.
    df['duration_numeric'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    numeric_columns.extend(["duration_numeric"])

# 1. Detect outliers using statistical methods
def detect_outliers_stat(df, columns, z_threshold=3, iqr_multiplier=1.5, lower_q=0.25, upper_q=0.75):
    """Flag rows that are extreme under the z-score rule and under the IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``. Two boolean columns, ``zscore_outlier`` and
        ``iqr_outlier``, are written onto it in place.
    columns : list of str
        Numeric columns to test; a row is flagged when any of them is extreme.
    z_threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the inter-quantile range.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles that delimit the inter-quantile range.

    Returns
    -------
    dict
        ``{'zscore': DataFrame, 'iqr': DataFrame}`` holding copies of the flagged rows.
    """
    outliers = {}

    # Z-score method.
    # nan_policy='omit' keeps one missing value from turning a whole column's z-scores
    # into NaN (the default 'propagate' does), which would stop that column from ever
    # flagging a row. NaN entries themselves compare False and are never flagged.
    values = df[columns].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
    df['zscore_outlier'] = (z_scores > z_threshold).any(axis=1)
    # .copy() so callers can add columns to the result without pandas warning about
    # writing to a slice of df.
    outliers['zscore'] = df[df['zscore_outlier']].copy()

    # IQR method
    lower = df[columns].quantile(lower_q)
    upper = df[columns].quantile(upper_q)
    spread = upper - lower
    below_fence = df[columns] < (lower - iqr_multiplier * spread)
    above_fence = df[columns] > (upper + iqr_multiplier * spread)
    df['iqr_outlier'] = (below_fence | above_fence).any(axis=1)
    outliers['iqr'] = df[df['iqr_outlier']].copy()

    return outliers

# Run both statistical detectors; the call also writes the zscore_outlier and
# iqr_outlier flag columns onto df.
outliers_stat = detect_outliers_stat(df=df, columns=numeric_columns)

# 2. Detect outliers using machine learning methods
def detect_outliers_ml(df, columns):
    """Flag anomalous rows with Isolation Forest, Local Outlier Factor and XGBoost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``; it is not modified.
    columns : list of str
        Numeric feature columns. Rows with a missing value in any of them are dropped
        before fitting.

    Returns
    -------
    dict
        Keys ``'iso_forest'``, ``'lof'`` and ``'xgb'``. Each value is a copy of the rows
        flagged by that method, carrying the feature columns plus the flag columns
        computed up to that step (the ``'xgb'`` frame also carries the PCA components).
    """
    results = {}

    # Drop rows with a missing feature.
    features = df[columns].dropna()
    # Every model below is fitted on `features` only. Flags and components accumulate
    # in `scored`, so a detector's output is never fed back in as an input -- otherwise
    # the XGBoost classifier would be handed its own target as a feature.
    scored = features.copy()

    # Isolation Forest (-1 marks an outlier)
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scored['iso_forest_outlier'] = iso_forest.fit_predict(features)
    results['iso_forest'] = scored[scored['iso_forest_outlier'] == -1].copy()

    # Local Outlier Factor (-1 marks an outlier)
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    scored['lof_outlier'] = lof.fit_predict(features)
    results['lof'] = scored[scored['lof_outlier'] == -1].copy()

    # PCA components; capped so a single-feature input does not request 2 components.
    pca = PCA(n_components=min(2, features.shape[1]))
    pca_result = pca.fit_transform(features)
    for component in range(pca_result.shape[1]):
        scored[f'pca_component_{component + 1}'] = pca_result[:, component]

    # XGBoost classifier trained to reproduce the Isolation Forest flags from the raw
    # features, then applied to the same rows.
    iso_labels = (scored['iso_forest_outlier'] == -1).astype(int)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(features, iso_labels)
    scored['xgb_outlier'] = xgb_model.predict(features)
    results['xgb'] = scored[scored['xgb_outlier'] == 1].copy()

    return results

# Run the model-based detectors on the same numeric columns.
outliers_ml = detect_outliers_ml(df=df, columns=numeric_columns)

# One entry per detection method: (heading to print, chart label, flagged rows).
outlier_report = [
    ("Z-Score Outliers:", "Z-Score", outliers_stat['zscore']),
    ("\nIQR Outliers:", "IQR", outliers_stat['iqr']),
    ("\nIsolation Forest Outliers:", "Isolation Forest", outliers_ml['iso_forest']),
    ("\nLOF Outliers:", "LOF", outliers_ml['lof']),
    ("\nXGBoost Outliers:", "XGBoost", outliers_ml['xgb']),
]

# Print outliers only
for heading, _, flagged_rows in outlier_report:
    print(heading)
    print(flagged_rows)

# Visualization of Outlier Counts
outlier_counts = {label: len(flagged_rows) for _, label, flagged_rows in outlier_report}

fig = px.bar(
    x=outlier_counts.keys(),
    y=outlier_counts.values(),
    title="Outlier Counts per Detection Method",
    labels={'x': "Method", 'y': "Number of Outliers"},
)
fig.show()

# Visualization for each detection method with open-circle markers: red for the
# model-based detectors (drawn first), blue for the statistical ones.
for detector_results, marker_color in ((outliers_ml, 'red'), (outliers_stat, 'blue')):
    for method, outliers in detector_results.items():
        if outliers.empty:
            continue
        fig = px.scatter(
            outliers,
            x="release_year",
            y="duration_numeric",
            title=f"Outliers detected by {method}",
            labels={'release_year': "Release Year", 'duration_numeric': "Duration"},
        )
        fig.update_traces(marker=dict(size=10, color=marker_color, symbol='circle-open'))
        fig.show()

Z-Score Outliers:
     show_id   type                                title  \
41       s42  Movie                                 Jaws   
42       s43  Movie                               Jaws 2   
43       s44  Movie                               Jaws 3   
44       s45  Movie                    Jaws: The Revenge   
131     s132  Movie          Blade Runner: The Final Cut   
...      ...    ...                                  ...   
8739   s8740  Movie   Why We Fight: The Battle of Russia   
8745   s8746  Movie  Willy Wonka & the Chocolate Factory   
8748   s8749  Movie                 Winter of Our Dreams   
8763   s8764  Movie      WWII: Report from the Aleutians   
8792   s8793  Movie                          Young Tiger   

                         director  \
41               Steven Spielberg   
42                 Jeannot Szwarc   
43                      Joe Alves   
44                 Joseph Sargent   
131                  Ridley Scott   
...                           ...   
87

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans, DBSCAN



# Numeric inputs for the detectors: release_year always, plus the first run of digits
# found in `duration` when that column is present.
# NOTE(review): the recorded output shows TV-show durations such as "1 Season", so the
# extracted number may mix units across title types -- confirm this is intended.
numeric_columns = ["release_year"]
if 'duration' in df.columns:
    # expand=False yields a Series holding the single capture group.
    df['duration_numeric'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    numeric_columns.extend(["duration_numeric"])

# 1. Detect outliers using statistical methods
def detect_outliers_stat(df, columns, z_threshold=3, iqr_multiplier=1.5, lower_q=0.25, upper_q=0.75):
    """Flag rows that are extreme under the z-score rule and under the IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``. Two boolean columns, ``zscore_outlier`` and
        ``iqr_outlier``, are written onto it in place.
    columns : list of str
        Numeric columns to test; a row is flagged when any of them is extreme.
    z_threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the inter-quantile range.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles that delimit the inter-quantile range.

    Returns
    -------
    dict
        ``{'zscore': DataFrame, 'iqr': DataFrame}`` holding copies of the flagged rows.
    """
    outliers = {}

    # Z-score method.
    # nan_policy='omit' keeps one missing value from turning a whole column's z-scores
    # into NaN (the default 'propagate' does), which would stop that column from ever
    # flagging a row. NaN entries themselves compare False and are never flagged.
    values = df[columns].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
    df['zscore_outlier'] = (z_scores > z_threshold).any(axis=1)
    # .copy() so callers can add columns (e.g. a cluster label) to the result without
    # pandas warning about writing to a slice of df.
    outliers['zscore'] = df[df['zscore_outlier']].copy()

    # IQR method
    lower = df[columns].quantile(lower_q)
    upper = df[columns].quantile(upper_q)
    spread = upper - lower
    below_fence = df[columns] < (lower - iqr_multiplier * spread)
    above_fence = df[columns] > (upper + iqr_multiplier * spread)
    df['iqr_outlier'] = (below_fence | above_fence).any(axis=1)
    outliers['iqr'] = df[df['iqr_outlier']].copy()

    return outliers

# Run both statistical detectors; the call also writes the zscore_outlier and
# iqr_outlier flag columns onto df.
outliers_stat = detect_outliers_stat(df=df, columns=numeric_columns)

# 2. Detect outliers using machine learning methods
def detect_outliers_ml(df, columns):
    """Flag anomalous rows with Isolation Forest, Local Outlier Factor and XGBoost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``; it is not modified.
    columns : list of str
        Numeric feature columns. Rows with a missing value in any of them are dropped
        before fitting.

    Returns
    -------
    dict
        Keys ``'iso_forest'``, ``'lof'`` and ``'xgb'``. Each value is a copy of the rows
        flagged by that method, carrying the feature columns plus the flag columns
        computed up to that step (the ``'xgb'`` frame also carries the PCA components).
    """
    results = {}

    # Drop rows with a missing feature.
    features = df[columns].dropna()
    # Every model below is fitted on `features` only. Flags and components accumulate
    # in `scored`, so a detector's output is never fed back in as an input -- otherwise
    # the XGBoost classifier would be handed its own target as a feature.
    scored = features.copy()

    # Isolation Forest (-1 marks an outlier)
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scored['iso_forest_outlier'] = iso_forest.fit_predict(features)
    results['iso_forest'] = scored[scored['iso_forest_outlier'] == -1].copy()

    # Local Outlier Factor (-1 marks an outlier)
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    scored['lof_outlier'] = lof.fit_predict(features)
    results['lof'] = scored[scored['lof_outlier'] == -1].copy()

    # PCA components; capped so a single-feature input does not request 2 components.
    pca = PCA(n_components=min(2, features.shape[1]))
    pca_result = pca.fit_transform(features)
    for component in range(pca_result.shape[1]):
        scored[f'pca_component_{component + 1}'] = pca_result[:, component]

    # XGBoost classifier trained to reproduce the Isolation Forest flags from the raw
    # features, then applied to the same rows.
    iso_labels = (scored['iso_forest_outlier'] == -1).astype(int)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(features, iso_labels)
    scored['xgb_outlier'] = xgb_model.predict(features)
    results['xgb'] = scored[scored['xgb_outlier'] == 1].copy()

    return results

# Run the model-based detectors on the same numeric columns.
outliers_ml = detect_outliers_ml(df, numeric_columns)

# Print outliers only
print("Z-Score Outliers:")
print(outliers_stat['zscore'])
print("\nIQR Outliers:")
print(outliers_stat['iqr'])
print("\nIsolation Forest Outliers:")
print(outliers_ml['iso_forest'])
print("\nLOF Outliers:")
print(outliers_ml['lof'])
print("\nXGBoost Outliers:")
print(outliers_ml['xgb'])

# Visualization of Outlier Counts
outlier_counts = {
    "Z-Score": len(outliers_stat['zscore']),
    "IQR": len(outliers_stat['iqr']),
    "Isolation Forest": len(outliers_ml['iso_forest']),
    "LOF": len(outliers_ml['lof']),
    "XGBoost": len(outliers_ml['xgb'])
}

fig = px.bar(x=outlier_counts.keys(), y=outlier_counts.values(), title="Outlier Counts per Detection Method", labels={'x': "Method", 'y': "Number of Outliers"})
fig.show()

# Visualization for each detection method with clustering: the flagged rows of every
# method are grouped with KMeans (at most 3 clusters) and plotted coloured by cluster,
# model-based detectors first, then the statistical ones.
for detector_results in (outliers_ml, outliers_stat):
    # list(...) because entries are replaced while looping.
    for method, outliers in list(detector_results.items()):
        if outliers.empty:
            continue

        # Work on an owned copy: adding a column to a frame that was sliced out of
        # another one triggers pandas' SettingWithCopyWarning. The copy is stored back
        # so the result dict still ends up carrying the 'cluster' column.
        outliers = outliers.copy()
        detector_results[method] = outliers

        # KMeans cannot handle NaN; statistical outliers may lack duration_numeric.
        clusterable = outliers[numeric_columns].dropna()
        if clusterable.empty:
            continue

        kmeans = KMeans(n_clusters=min(3, len(clusterable)), random_state=42)
        cluster_ids = pd.Series(kmeans.fit_predict(clusterable), index=clusterable.index)
        # Rows that could not be clustered get NaN and are left out of the plot.
        outliers['cluster'] = cluster_ids.reindex(outliers.index)

        fig = px.scatter(outliers.dropna(subset=['cluster']), x="release_year", y="duration_numeric", color='cluster', title=f"Clustered Outliers detected by {method}", labels={'release_year': "Release Year", 'duration_numeric': "Duration"})
        fig.show()


Z-Score Outliers:
     show_id   type                                title  \
41       s42  Movie                                 Jaws   
42       s43  Movie                               Jaws 2   
43       s44  Movie                               Jaws 3   
44       s45  Movie                    Jaws: The Revenge   
131     s132  Movie          Blade Runner: The Final Cut   
...      ...    ...                                  ...   
8739   s8740  Movie   Why We Fight: The Battle of Russia   
8745   s8746  Movie  Willy Wonka & the Chocolate Factory   
8748   s8749  Movie                 Winter of Our Dreams   
8763   s8764  Movie      WWII: Report from the Aleutians   
8792   s8793  Movie                          Young Tiger   

                         director  \
41               Steven Spielberg   
42                 Jeannot Szwarc   
43                      Joe Alves   
44                 Joseph Sargent   
131                  Ridley Scott   
...                           ...   
87




Could not find the number of physical cores for the following reason:
found 0 physical cores < 1

  File "c:\Users\Anjel\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans, DBSCAN


# Numeric inputs for the detectors: release_year always, plus the first run of digits
# found in `duration` when that column is present.
# NOTE(review): the recorded output shows TV-show durations such as "1 Season", so the
# extracted number may mix units across title types -- confirm this is intended.
numeric_columns = ["release_year"]
if 'duration' in df.columns:
    # expand=False yields a Series holding the single capture group.
    df['duration_numeric'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
    numeric_columns.extend(["duration_numeric"])

# 1. Detect outliers using statistical methods
def detect_outliers_stat(df, columns, z_threshold=10, iqr_multiplier=1.5, lower_q=0.10, upper_q=0.90):
    """Flag rows that are extreme under a strict z-score rule and a wide quantile-range rule.

    This variant is deliberately more conservative than the usual 3-sigma / quartile
    settings: the defaults are |z| > 10 and fences built on the 10th-90th percentile
    range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``. Two boolean columns, ``zscore_outlier`` and
        ``iqr_outlier``, are written onto it in place.
    columns : list of str
        Numeric columns to test; a row is flagged when any of them is extreme.
    z_threshold : float, default 10
        Absolute z-score above which a value counts as an outlier.
    iqr_multiplier : float, default 1.5
        Width of the fences, in multiples of the inter-quantile range.
    lower_q, upper_q : float, default 0.10 and 0.90
        Quantiles that delimit the inter-quantile range.

    Returns
    -------
    dict
        ``{'zscore': DataFrame, 'iqr': DataFrame}`` holding copies of the flagged rows.
    """
    outliers = {}

    # Z-score method.
    # nan_policy='omit' keeps one missing value from turning a whole column's z-scores
    # into NaN (the default 'propagate' does), which would stop that column from ever
    # flagging a row. NaN entries themselves compare False and are never flagged.
    values = df[columns].to_numpy(dtype=float)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
    df['zscore_outlier'] = (z_scores > z_threshold).any(axis=1)
    # .copy() so callers can add columns (e.g. a cluster label) to the result without
    # pandas warning about writing to a slice of df.
    outliers['zscore'] = df[df['zscore_outlier']].copy()

    # Quantile-range ("IQR") method
    lower = df[columns].quantile(lower_q)
    upper = df[columns].quantile(upper_q)
    spread = upper - lower
    below_fence = df[columns] < (lower - iqr_multiplier * spread)
    above_fence = df[columns] > (upper + iqr_multiplier * spread)
    df['iqr_outlier'] = (below_fence | above_fence).any(axis=1)
    outliers['iqr'] = df[df['iqr_outlier']].copy()

    return outliers

# Run both statistical detectors; the call also writes the zscore_outlier and
# iqr_outlier flag columns onto df.
outliers_stat = detect_outliers_stat(df=df, columns=numeric_columns)

# 2. Detect outliers using machine learning methods
def detect_outliers_ml(df, columns):
    """Flag anomalous rows with Isolation Forest, Local Outlier Factor and XGBoost.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``; it is not modified.
    columns : list of str
        Numeric feature columns. Rows with a missing value in any of them are dropped
        before fitting.

    Returns
    -------
    dict
        Keys ``'iso_forest'``, ``'lof'`` and ``'xgb'``. Each value is a copy of the rows
        flagged by that method, carrying the feature columns plus the flag columns
        computed up to that step (the ``'xgb'`` frame also carries the PCA components).
    """
    results = {}

    # Drop rows with a missing feature.
    features = df[columns].dropna()
    # Every model below is fitted on `features` only. Flags and components accumulate
    # in `scored`, so a detector's output is never fed back in as an input -- otherwise
    # the XGBoost classifier would be handed its own target as a feature.
    scored = features.copy()

    # Isolation Forest (-1 marks an outlier)
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    scored['iso_forest_outlier'] = iso_forest.fit_predict(features)
    results['iso_forest'] = scored[scored['iso_forest_outlier'] == -1].copy()

    # Local Outlier Factor (-1 marks an outlier)
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    scored['lof_outlier'] = lof.fit_predict(features)
    results['lof'] = scored[scored['lof_outlier'] == -1].copy()

    # PCA components; capped so a single-feature input does not request 2 components.
    pca = PCA(n_components=min(2, features.shape[1]))
    pca_result = pca.fit_transform(features)
    for component in range(pca_result.shape[1]):
        scored[f'pca_component_{component + 1}'] = pca_result[:, component]

    # XGBoost classifier trained to reproduce the Isolation Forest flags from the raw
    # features, then applied to the same rows.
    iso_labels = (scored['iso_forest_outlier'] == -1).astype(int)
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(features, iso_labels)
    scored['xgb_outlier'] = xgb_model.predict(features)
    results['xgb'] = scored[scored['xgb_outlier'] == 1].copy()

    return results

# Run the model-based detectors on the same numeric columns.
outliers_ml = detect_outliers_ml(df, numeric_columns)

# Print outliers only
print("Z-Score Outliers:")
print(outliers_stat['zscore'])
print("\nIQR Outliers:")
print(outliers_stat['iqr'])
print("\nIsolation Forest Outliers:")
print(outliers_ml['iso_forest'])
print("\nLOF Outliers:")
print(outliers_ml['lof'])
print("\nXGBoost Outliers:")
print(outliers_ml['xgb'])

# Visualization of Outlier Counts
outlier_counts = {
    "Z-Score": len(outliers_stat['zscore']),
    "IQR": len(outliers_stat['iqr']),
    "Isolation Forest": len(outliers_ml['iso_forest']),
    "LOF": len(outliers_ml['lof']),
    "XGBoost": len(outliers_ml['xgb'])
}

fig = px.bar(x=outlier_counts.keys(), y=outlier_counts.values(), title="Outlier Counts per Detection Method", labels={'x': "Method", 'y': "Number of Outliers"})
fig.show()

# Visualization for each detection method with clustering: every title in df is plotted
# and coloured by whether the method flagged it ("Outlier") or not ("Included"). The
# flagged rows are also grouped with KMeans (at most 3 clusters); the plot itself is
# coloured by outlier_label, not by cluster. Model-based detectors come first.
for detector_results in (outliers_ml, outliers_stat):
    # list(...) because entries are replaced while looping.
    for method, outliers in list(detector_results.items()):
        if outliers.empty:
            continue

        # Work on an owned copy: adding a column to a frame that was sliced out of
        # another one triggers pandas' SettingWithCopyWarning. The copy is stored back
        # so the result dict still ends up carrying the 'cluster' column.
        outliers = outliers.copy()
        detector_results[method] = outliers

        # KMeans cannot handle NaN; statistical outliers may lack duration_numeric.
        # Skipping the clustering in that case must not prevent the plot below.
        clusterable = outliers[numeric_columns].dropna()
        if not clusterable.empty:
            kmeans = KMeans(n_clusters=min(3, len(clusterable)), random_state=42)
            cluster_ids = pd.Series(kmeans.fit_predict(clusterable), index=clusterable.index)
            outliers['cluster'] = cluster_ids.reindex(outliers.index)

        # Relabel df for the current method; the column is overwritten on every pass,
        # so afterwards it reflects the last method plotted.
        df['outlier_label'] = "Included"
        df.loc[outliers.index, 'outlier_label'] = "Outlier"
        fig = px.scatter(df, x="release_year", y="duration_numeric", color='outlier_label', title=f"Clustered Outliers detected by {method}", labels={'release_year': "Release Year", 'duration_numeric': "Duration"})
        fig.show()

Z-Score Outliers:
     show_id     type                              title director cast  \
4250   s4251  TV Show  Pioneers: First Women Filmmakers*      NaN  NaN   

     country         date_added  release_year rating  duration listed_in  \
4250     NaN  December 30, 2018          1925  TV-14  1 Season  TV Shows   

                                            description  duration_numeric  \
4250  This collection restores films from women who ...               1.0   

      zscore_outlier  iqr_outlier  
4250            True         True  

IQR Outliers:
     show_id   type                                title  \
41       s42  Movie                                 Jaws   
42       s43  Movie                               Jaws 2   
43       s44  Movie                               Jaws 3   
131     s132  Movie          Blade Runner: The Final Cut   
166     s167  Movie          Once Upon a Time in America   
...      ...    ...                                  ...   
8739   s8740  Movi




Could not find the number of physical cores for the following reason:
found 0 physical cores < 1

  File "c:\Users\Anjel\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy






KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

