### Task 1: Introduction to Isolation Forest
**Description**: Install the necessary library and load a sample dataset.

**Steps**:
1. Install scikit-learn
2. Load a sample dataset using Python

In [1]:
# write your code from here
# Step 1: Install scikit-learn
# If scikit-learn is not installed, uncomment the line below (notebook only;
# %pip installs into the environment of the running kernel)
# %pip install scikit-learn

# Step 2: Load a sample dataset
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled iris sample data and build a DataFrame with one column
# per feature, then append the dataset's target values as a 'target' column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Preview the first rows of the assembled table
print(df.head())


### Task 2: Building an Isolation Forest
**Description**: Initialize an Isolation Forest model and fit it to the Boston dataset.

**Steps**:
1. Initialize Isolation Forest
2. Fit model

In [2]:
# write your code from here
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Step 1: Fetch the Boston housing data from OpenML and discard incomplete rows
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame.dropna()

# Optional: standardize every column except MEDV before modelling
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop(columns='MEDV'))

# Step 2: Build the Isolation Forest and train it on the scaled features
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(scaled_data)

# Score every row, then recode the model's labels into a 0/1 flag:
# a prediction of -1 becomes 1 (anomaly), a prediction of 1 becomes 0
raw_labels = pd.Series(iso_forest.predict(scaled_data), index=df.index)
df['anomaly'] = raw_labels.map({1: 0, -1: 1})

# Show MEDV next to the anomaly flag for the first rows
print(df[['MEDV', 'anomaly']].head())


### Task 3: Detecting Anomalies
**Description**: Use the fitted Isolation Forest model to predict anomalies.

**Steps**:
1. Predict anomalies
2. Display anomaly counts

In [3]:
# write your code from here
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Load and prepare the Boston dataset.
# The explicit .copy() is defensive: a column is added to this frame below,
# and working on an independent copy keeps that assignment from ever
# triggering pandas' SettingWithCopyWarning.
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame.dropna().copy()
features = df.drop('MEDV', axis=1)

# Standardize the feature columns (everything except MEDV)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Fit Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(scaled_data)

# Task 3 - Step 1: Predict anomalies
predictions = iso_forest.predict(scaled_data)

# Convert predictions to binary format (1 = anomaly, 0 = normal).
# A vectorized comparison replaces the element-by-element Python loop:
# rows predicted as -1 become 1, every other row becomes 0.
df['anomaly'] = (predictions == -1).astype(int)

# Task 3 - Step 2: Display anomaly counts
anomaly_counts = df['anomaly'].value_counts()
print("Anomaly Counts:")
print(anomaly_counts)


### Task 4: Visualizing Anomalies
**Description**: Visualize the results to see which samples are considered anomalies.

**Steps**:
1. Plot a scatter plot

In [4]:
# write your code from here
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Fetch the Boston housing data and separate the columns used for modelling
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame.dropna()
features = df.drop(columns='MEDV')

# Standardize the feature columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Train the Isolation Forest on the standardized features
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(scaled_data)

# Label every row and recode: prediction -1 -> 1 (anomaly), prediction 1 -> 0
anomaly_flags = pd.Series(iso_forest.predict(scaled_data), index=df.index)
df['anomaly'] = anomaly_flags.map({1: 0, -1: 1})

# Step 1: Scatter plot of RM against MEDV, coloured by the anomaly flag
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='RM',
    y='MEDV',
    hue='anomaly',
    palette={0: 'blue', 1: 'red'},
    ax=ax,
)
ax.set_title("Anomaly Detection in Boston Housing Data (Red = Anomaly)")
ax.set_xlabel("Average Number of Rooms (RM)")
ax.set_ylabel("Median Value (MEDV)")
ax.legend(title="Anomaly")
plt.show()


### Task 5: Interpret Contamination Parameter
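**Description**: Experiment with different contamination levels. The `contamination` parameter sets the proportion of samples the model is expected to flag as anomalies, so raising it should increase the number of detected anomalies.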

In [5]:
# write your code from here
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Load and prepare the Boston dataset
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame.dropna()
features = df.drop('MEDV', axis=1)

# Standardize the feature columns (everything except MEDV)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

# Try different contamination levels
contamination_values = [0.01, 0.03, 0.05, 0.1, 0.2]
anomaly_counts = []

for c in contamination_values:
    # A fresh model per level; the fixed random_state keeps the runs
    # comparable so only the contamination setting differs between them.
    model = IsolationForest(contamination=c, random_state=42)
    # fit_predict trains on the data and labels the same rows in one call
    preds = model.fit_predict(scaled_data)
    # Rows labelled -1 are the anomalies; count them with a vectorized
    # comparison instead of building an intermediate Python list.
    anomaly_counts.append(int((preds == -1).sum()))

# Display results: one row per contamination level tried
results = pd.DataFrame({
    'Contamination': contamination_values,
    'Anomalies_Detected': anomaly_counts
})

print(results)

# Optional: Visualize the relationship
plt.figure(figsize=(8, 5))
plt.plot(results['Contamination'], results['Anomalies_Detected'], marker='o')
plt.title('Effect of Contamination Parameter on Anomaly Detection')
plt.xlabel('Contamination Level')
plt.ylabel('Number of Anomalies Detected')
plt.grid(True)
plt.show()
