In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import seaborn as sns

# Baseline anomaly detection: fit an Isolation Forest on the raw monthly
# consumption columns, then summarise, plot and export the flagged rows.

# Load your dataset
file_path = 'dataset.csv'
data = pd.read_csv(file_path)

# Consumption columns are recognised purely by name: any column containing
# 'post_' is treated as a postpaid month, any column containing 'pre_' as a
# prepaid month.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Coerce the consumption columns to numeric; unparseable entries become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Keep only rows where at least half of the *consumption* readings are present.
# `subset` restricts the count to those columns (without it, populated ID or
# metadata columns would count towards the threshold), and `.copy()` makes the
# result an independent frame so the column assignments below are safe.
threshold = int(len(consumption_columns) * 0.5)
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Feature matrix for the model: remaining gaps are filled with each column's median.
consumption_data = data_cleaned[consumption_columns]
consumption_data = consumption_data.fillna(consumption_data.median())

# Fit the Isolation Forest; contamination=0.05 asks it to flag roughly 5% of rows.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(consumption_data)

# predict: -1 indicates anomaly, 1 indicates normal.
anomaly_labels = iso_forest.predict(consumption_data)

# decision_function: lower scores are more likely to be anomalies.
anomaly_scores = iso_forest.decision_function(consumption_data)

# Attach the results to the cleaned dataset (same row order as consumption_data).
data_cleaned['anomaly_label'] = anomaly_labels
data_cleaned['anomaly_score'] = anomaly_scores

# Summarize the number of anomalies detected
num_anomalies = int((anomaly_labels == -1).sum())
print(f"Number of anomalies detected: {num_anomalies}")

# Distribution of anomaly scores
print(data_cleaned['anomaly_score'].describe())

# Count flagged rows per customer and rank customers by that count.
anomalous_data = data_cleaned[data_cleaned['anomaly_label'] == -1]
anomaly_frequencies = anomalous_data.groupby('Customer No')['anomaly_label'].count()
frequent_anomalies = anomaly_frequencies.sort_values(ascending=False)

# Extract top 5 anomalous customers
top_customers = frequent_anomalies.head(5).index
top_customers_data = data_cleaned[data_cleaned['Customer No'].isin(top_customers)]

# Plot consumption trends for the top 5 anomalous customers.
# A customer may own several rows, so each row is drawn as its own line;
# flattening all rows into one vector would not match the x-axis length.
fig, ax = plt.subplots(figsize=(12, 8))
for customer in top_customers:
    customer_data = top_customers_data[top_customers_data['Customer No'] == customer]
    post_values = customer_data[postpaid_columns].to_numpy()
    pre_values = customer_data[prepaid_columns].to_numpy()
    has_multiple_rows = len(customer_data) > 1
    for row_no, (post_row, pre_row) in enumerate(zip(post_values, pre_values), start=1):
        tag = f' [row {row_no}]' if has_multiple_rows else ''
        ax.plot(postpaid_columns, post_row, label=f'Customer {customer} (Postpaid){tag}', marker='o')
        ax.plot(prepaid_columns, pre_row, label=f'Customer {customer} (Prepaid){tag}', marker='x')

ax.set_title('Consumption Trends for Top Anomalous Customers (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Consumption (units)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend()
fig.tight_layout()
plt.show()

# Per month: how many of the flagged rows actually have a reading in that month.
anomalies_per_month_postpaid = anomalous_data[postpaid_columns].notna().sum()
anomalies_per_month_prepaid = anomalous_data[prepaid_columns].notna().sum()

# Plotting the number of anomalies per month
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(postpaid_columns, anomalies_per_month_postpaid, label='Postpaid Anomalies', alpha=0.7, color='blue')
ax.bar(prepaid_columns, anomalies_per_month_prepaid, label='Prepaid Anomalies', alpha=0.7, color='orange')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Number of Anomalies Detected Per Month (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Anomalies')
ax.legend()
fig.tight_layout()
plt.show()

# Save anomalies to a CSV file (optional)
anomalous_data.to_csv('anomalies_detected.csv', index=False)


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import seaborn as sns

# Baseline anomaly detection on the raw monthly consumption columns:
# clean -> fit Isolation Forest -> summarise -> plot -> export flagged rows.

# Load your dataset (raw or pre-processed data)
file_path = 'dataset.csv'  # Update this with the correct path to your dataset
data = pd.read_csv(file_path)

# Columns whose name contains 'post_' / 'pre_' are the postpaid / prepaid months.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Convert the consumption columns to numeric; values that cannot be parsed become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows missing more than half of their consumption readings.
# The non-null count is limited to the consumption columns via `subset`
# (otherwise non-consumption columns would count towards the threshold),
# and the result is copied so new columns can be added without touching `data`.
threshold = int(len(consumption_columns) * 0.5)
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Model input: consumption columns with remaining NaNs replaced by the column median.
consumption_data = data_cleaned[consumption_columns]
consumption_data = consumption_data.fillna(consumption_data.median())

# Initialize and fit the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(consumption_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels = iso_forest.predict(consumption_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores = iso_forest.decision_function(consumption_data)

# Add anomaly labels and scores to the cleaned dataset
data_cleaned['anomaly_label'] = anomaly_labels
data_cleaned['anomaly_score'] = anomaly_scores

# Summarize the number of anomalies detected
num_anomalies = int((anomaly_labels == -1).sum())
print(f"Number of anomalies detected: {num_anomalies}")

# Distribution of anomaly scores
print(data_cleaned['anomaly_score'].describe())

# Rank customers by how many of their rows were flagged.
anomalous_data = data_cleaned[data_cleaned['anomaly_label'] == -1]
anomaly_frequencies = anomalous_data.groupby('Customer No')['anomaly_label'].count()
frequent_anomalies = anomaly_frequencies.sort_values(ascending=False)

# Extract top 5 anomalous customers
top_customers = frequent_anomalies.head(5).index
top_customers_data = data_cleaned[data_cleaned['Customer No'].isin(top_customers)]

# Plot consumption trends for the top 5 anomalous customers.
# Every row belonging to a customer gets its own line: concatenating several
# rows into one array would no longer match the number of month columns.
fig, ax = plt.subplots(figsize=(12, 8))
for customer in top_customers:
    customer_data = top_customers_data[top_customers_data['Customer No'] == customer]
    post_values = customer_data[postpaid_columns].to_numpy()
    pre_values = customer_data[prepaid_columns].to_numpy()
    has_multiple_rows = len(customer_data) > 1
    for row_no, (post_row, pre_row) in enumerate(zip(post_values, pre_values), start=1):
        tag = f' [row {row_no}]' if has_multiple_rows else ''
        ax.plot(postpaid_columns, post_row, label=f'Customer {customer} (Postpaid){tag}', marker='o')
        ax.plot(prepaid_columns, pre_row, label=f'Customer {customer} (Prepaid){tag}', marker='x')

ax.set_title('Consumption Trends for Top Anomalous Customers (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Consumption (units)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend()
fig.tight_layout()
plt.show()

# For each month, count the flagged rows that have a (non-missing) reading.
anomalies_per_month_postpaid = anomalous_data[postpaid_columns].notna().sum()
anomalies_per_month_prepaid = anomalous_data[prepaid_columns].notna().sum()

# Plotting the number of anomalies per month
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(postpaid_columns, anomalies_per_month_postpaid, label='Postpaid Anomalies', alpha=0.7, color='blue')
ax.bar(prepaid_columns, anomalies_per_month_prepaid, label='Prepaid Anomalies', alpha=0.7, color='orange')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Number of Anomalies Detected Per Month (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Anomalies')
ax.legend()
fig.tight_layout()
plt.show()

# Save anomalies to a CSV file (optional)
anomalous_data.to_csv('anomalies_detected.csv', index=False)


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import seaborn as sns

# EMA-smoothed anomaly detection: smooth each row's monthly consumption with an
# exponential moving average, fit an Isolation Forest on the smoothed values,
# then summarise, plot and export the flagged rows.

# Load your dataset (raw or pre-processed data)
file_path = 'dataset.csv'  # Update this with the correct path to your dataset
data = pd.read_csv(file_path)

# Columns whose name contains 'post_' / 'pre_' are the postpaid / prepaid months.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Convert the consumption columns to numeric; values that cannot be parsed become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Keep rows with at least half of their consumption readings present.
# `subset` limits the non-null count to the consumption columns, and `.copy()`
# yields an independent frame so result columns can be added safely below.
threshold = int(len(consumption_columns) * 0.5)
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Model input: consumption columns with remaining NaNs replaced by the column median.
consumption_data = data_cleaned[consumption_columns]
consumption_data = consumption_data.fillna(consumption_data.median())

# Calculate Exponential Moving Average (EMA) for smoothing.
# The months are the columns, so the EMA must run along each row. ewm()
# operates down the index, hence the transpose before and after; a column-wise
# apply would instead smooth one month across unrelated rows in file order.
# NOTE(review): assumes postpaid_columns + prepaid_columns is in chronological
# order -- confirm against the dataset's column layout.
ema_window = 3  # Choose the EMA window size (e.g., 3 for a short-term smoothing)
smoothed_data = consumption_data.T.ewm(span=ema_window, adjust=False).mean().T

# Initialize Isolation Forest model and fit it to the EMA-smoothed consumption data
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(smoothed_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels = iso_forest.predict(smoothed_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores = iso_forest.decision_function(smoothed_data)

# Add anomaly labels and scores to the cleaned dataset
data_cleaned['anomaly_label'] = anomaly_labels
data_cleaned['anomaly_score'] = anomaly_scores

# Summarize the number of anomalies detected
num_anomalies = int((anomaly_labels == -1).sum())
print(f"Number of anomalies detected: {num_anomalies}")

# Distribution of anomaly scores
print(data_cleaned['anomaly_score'].describe())

# Rank customers by how many of their rows were flagged.
anomalous_data = data_cleaned[data_cleaned['anomaly_label'] == -1]
anomaly_frequencies = anomalous_data.groupby('Customer No')['anomaly_label'].count()
frequent_anomalies = anomaly_frequencies.sort_values(ascending=False)

# Extract top 5 anomalous customers
top_customers = frequent_anomalies.head(5).index
top_customers_data = data_cleaned[data_cleaned['Customer No'].isin(top_customers)]

# Plot consumption trends for the top 5 anomalous customers.
# Each row of a customer is drawn separately so the y-values always match the
# number of month columns, even when a customer has more than one row.
fig, ax = plt.subplots(figsize=(12, 8))
for customer in top_customers:
    customer_data = top_customers_data[top_customers_data['Customer No'] == customer]
    post_values = customer_data[postpaid_columns].to_numpy()
    pre_values = customer_data[prepaid_columns].to_numpy()
    has_multiple_rows = len(customer_data) > 1
    for row_no, (post_row, pre_row) in enumerate(zip(post_values, pre_values), start=1):
        tag = f' [row {row_no}]' if has_multiple_rows else ''
        ax.plot(postpaid_columns, post_row, label=f'Customer {customer} (Postpaid){tag}', marker='o')
        ax.plot(prepaid_columns, pre_row, label=f'Customer {customer} (Prepaid){tag}', marker='x')

ax.set_title('Consumption Trends for Top Anomalous Customers (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Consumption (units)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend()
fig.tight_layout()
plt.show()

# For each month, count the flagged rows that have a (non-missing) reading.
anomalies_per_month_postpaid = anomalous_data[postpaid_columns].notna().sum()
anomalies_per_month_prepaid = anomalous_data[prepaid_columns].notna().sum()

# Plotting the number of anomalies per month
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(postpaid_columns, anomalies_per_month_postpaid, label='Postpaid Anomalies', alpha=0.7, color='blue')
ax.bar(prepaid_columns, anomalies_per_month_prepaid, label='Prepaid Anomalies', alpha=0.7, color='orange')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Number of Anomalies Detected Per Month (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Anomalies')
ax.legend()
fig.tight_layout()
plt.show()

# Save anomalies to a CSV file (optional)
anomalous_data.to_csv('anomalies_detected.csv', index=False)


In [None]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import seaborn as sns

# Side-by-side comparison: Isolation Forest on raw consumption ("normal") versus
# on EMA-smoothed consumption, followed by customer-level and per-month views of
# the EMA results. The names num_anomalies_normal / num_anomalies_ema and
# anomaly_scores_normal / anomaly_scores_ema are left in the kernel for later cells.

# Load your dataset (raw or pre-processed data)
file_path = 'dataset.csv'  # Update this with the correct path to your dataset
data = pd.read_csv(file_path, low_memory=False)

# Columns whose name contains 'post_' / 'pre_' are the postpaid / prepaid months.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Convert the consumption columns to numeric; values that cannot be parsed become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Keep rows with at least half of their consumption readings present.
# `subset` limits the non-null count to the consumption columns, and `.copy()`
# yields an independent frame so result columns can be added safely below.
threshold = int(len(consumption_columns) * 0.5)
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Model input: consumption columns with remaining NaNs replaced by the column median.
consumption_data = data_cleaned[consumption_columns]
consumption_data = consumption_data.fillna(consumption_data.median())

# --- Step 1: Normal Isolation Forest ---
# Fit on the consumption data without smoothing.
iso_forest_normal = IsolationForest(contamination=0.05, random_state=42)
iso_forest_normal.fit(consumption_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels_normal = iso_forest_normal.predict(consumption_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores_normal = iso_forest_normal.decision_function(consumption_data)

# Add anomaly labels and scores to the cleaned dataset
data_cleaned['anomaly_label_normal'] = anomaly_labels_normal
data_cleaned['anomaly_score_normal'] = anomaly_scores_normal

# Summarize the number of anomalies detected
num_anomalies_normal = int((anomaly_labels_normal == -1).sum())
print(f"Number of anomalies detected (normal): {num_anomalies_normal}")

# Distribution of anomaly scores
print(data_cleaned['anomaly_score_normal'].describe())

# --- Step 2: EMA + Isolation Forest ---
# Apply Exponential Moving Average (EMA) for smoothing.
# The months are the columns, so the EMA must run along each row. ewm()
# operates down the index, hence the transpose before and after; a column-wise
# apply would instead smooth one month across unrelated rows in file order.
# NOTE(review): assumes postpaid_columns + prepaid_columns is in chronological
# order -- confirm against the dataset's column layout.
ema_window = 7  # Increase the window size for a more aggressive smoothing
smoothed_data = consumption_data.T.ewm(span=ema_window, adjust=False).mean().T

# Fit a second model on the EMA-smoothed consumption data
iso_forest_ema = IsolationForest(contamination=0.05, random_state=42)
iso_forest_ema.fit(smoothed_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels_ema = iso_forest_ema.predict(smoothed_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores_ema = iso_forest_ema.decision_function(smoothed_data)

# Add anomaly labels and scores to the cleaned dataset
data_cleaned['anomaly_label_ema'] = anomaly_labels_ema
data_cleaned['anomaly_score_ema'] = anomaly_scores_ema

# Summarize the number of anomalies detected
num_anomalies_ema = int((anomaly_labels_ema == -1).sum())
print(f"Number of anomalies detected (EMA): {num_anomalies_ema}")

# Distribution of anomaly scores
print(data_cleaned['anomaly_score_ema'].describe())

# --- Step 3: Comparison of Results ---
# Compare the number of anomalies detected in both approaches
print(f"Anomalies detected (normal): {num_anomalies_normal}, Anomalies detected (EMA): {num_anomalies_ema}")

# --- Step 4: Customer-Level Analysis ---
# Rank customers by how many of their rows the EMA model flagged.
anomalous_data_ema = data_cleaned[data_cleaned['anomaly_label_ema'] == -1]
anomaly_frequencies_ema = anomalous_data_ema.groupby('Customer No')['anomaly_label_ema'].count()
frequent_anomalies_ema = anomaly_frequencies_ema.sort_values(ascending=False)

# Extract top 5 anomalous customers (EMA)
top_customers_ema = frequent_anomalies_ema.head(5).index
top_customers_data_ema = data_cleaned[data_cleaned['Customer No'].isin(top_customers_ema)]

# Plot consumption trends for the top 5 anomalous customers (EMA).
# Each row of a customer is drawn separately so the y-values always match the
# number of month columns, even when a customer has more than one row.
fig, ax = plt.subplots(figsize=(12, 8))
for customer in top_customers_ema:
    customer_data = top_customers_data_ema[top_customers_data_ema['Customer No'] == customer]
    post_values = customer_data[postpaid_columns].to_numpy()
    pre_values = customer_data[prepaid_columns].to_numpy()
    has_multiple_rows = len(customer_data) > 1
    for row_no, (post_row, pre_row) in enumerate(zip(post_values, pre_values), start=1):
        tag = f' [row {row_no}]' if has_multiple_rows else ''
        ax.plot(postpaid_columns, post_row, label=f'Customer {customer} (Postpaid){tag}', marker='o')
        ax.plot(prepaid_columns, pre_row, label=f'Customer {customer} (Prepaid){tag}', marker='x')

ax.set_title('Consumption Trends for Top Anomalous Customers (Postpaid vs Prepaid)')
ax.set_xlabel('Month')
ax.set_ylabel('Consumption (units)')
ax.tick_params(axis='x', labelrotation=45)
ax.legend()
fig.tight_layout()
plt.show()

# --- Step 5: Monthly Anomalies (EMA) ---
# For each month, count the EMA-flagged rows that have a (non-missing) reading.
anomalies_per_month_postpaid_ema = anomalous_data_ema[postpaid_columns].notna().sum()
anomalies_per_month_prepaid_ema = anomalous_data_ema[prepaid_columns].notna().sum()

# Plotting the number of anomalies per month (EMA)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(postpaid_columns, anomalies_per_month_postpaid_ema, label='Postpaid Anomalies (EMA)', alpha=0.7, color='blue')
ax.bar(prepaid_columns, anomalies_per_month_prepaid_ema, label='Prepaid Anomalies (EMA)', alpha=0.7, color='orange')
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Number of Anomalies Detected Per Month (Postpaid vs Prepaid, EMA)')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Anomalies')
ax.legend()
fig.tight_layout()
plt.show()

# --- Step 6: Save Anomalies to CSV (Optional) ---
anomalous_data_ema.to_csv('anomalies_detected_ema.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Differencing + z-score + Isolation Forest, then a comparison against the
# normal and EMA runs. The comparison section reads num_anomalies_normal,
# num_anomalies_ema, anomaly_scores_normal and anomaly_scores_ema from the
# kernel, so the normal-vs-EMA comparison cell must have been run first.

# Load your dataset
file_path = 'dataset.csv'  # same file the other cells load; update if yours lives elsewhere
data = pd.read_csv(file_path, low_memory=False)

# Columns whose name contains 'post_' / 'pre_' are the postpaid / prepaid months.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Convert the consumption columns to numeric; values that cannot be parsed become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Keep rows with at least half of their consumption readings present (and never
# fewer than one, so the row-wise fill below always has a value to propagate).
# `subset` limits the non-null count to the consumption columns; `.copy()` gives
# an independent frame so columns can be overwritten and added safely.
threshold = max(1, int(len(consumption_columns) * 0.5))
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Fill gaps along each row's own month sequence (forward, then backward for
# leading gaps). Only consumption columns are filled: filling down the rows
# would copy one customer's values, identifiers included, into another's.
data_cleaned[consumption_columns] = data_cleaned[consumption_columns].ffill(axis=1).bfill(axis=1)

# --- Step 1: First-Order Differencing ---
# Month-over-month change within each row (axis=1). The first month has no
# predecessor, so its all-NaN difference column is dropped; no rows are lost.
# NOTE(review): assumes postpaid_columns + prepaid_columns is in chronological
# order, and that the last-postpaid -> first-prepaid step is a meaningful
# difference -- confirm against the dataset.
diff_data = data_cleaned[consumption_columns].diff(axis=1).iloc[:, 1:]

# --- Step 2: Z-Score Normalization ---
# Standardise each difference column. A zero (or undefined, for a single row)
# standard deviation is replaced by 1 so constant columns become 0 instead of
# NaN/inf, which the Isolation Forest would reject.
mean_values = diff_data.mean()
std_values = diff_data.std().replace(0, 1).fillna(1)

z_score_data = (diff_data - mean_values) / std_values

# --- Step 3: Apply Isolation Forest on Z-Score Normalized Data ---
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(z_score_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels = iso_forest.predict(z_score_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores = iso_forest.decision_function(z_score_data)

# z_score_data has exactly one row per data_cleaned row, in the same order,
# so the results can be attached directly.
data_cleaned['anomaly_label'] = anomaly_labels
data_cleaned['anomaly_score'] = anomaly_scores

# --- Step 4: Print Summary Statistics ---
num_anomalies_zscore = int((anomaly_labels == -1).sum())
print("Z-Score Normalization + Isolation Forest:")
print(f"Number of anomalies detected (Z-score): {num_anomalies_zscore}")
print(data_cleaned['anomaly_score'].describe())

# Save results (all cleaned rows, with their label and score columns)
data_cleaned.to_csv('anomalies_detected_zscore.csv', index=False)

# --- Compare with the Normal and EMA results ---
print("\nComparing Anomaly Detection Methods:")
print(f"Number of anomalies detected (normal): {num_anomalies_normal}")
print(f"Number of anomalies detected (EMA): {num_anomalies_ema}")
print(f"Number of anomalies detected (Z-score): {num_anomalies_zscore}")

# Summary statistics for comparison. data_cleaned was rebuilt from the CSV in
# this cell and therefore has no 'anomaly_score_normal' / 'anomaly_score_ema'
# columns; the score arrays kept by the comparison cell are summarised instead.
print("\nSummary Statistics for Normal:")
print(pd.Series(anomaly_scores_normal, name='anomaly_score_normal').describe())
print("\nSummary Statistics for EMA:")
print(pd.Series(anomaly_scores_ema, name='anomaly_score_ema').describe())
print("\nSummary Statistics for Z-Score Normalization:")
print(data_cleaned['anomaly_score'].describe())


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Differencing + z-score + Isolation Forest, with a plot of the resulting
# anomaly scores and an export of the labelled rows.

# Load your dataset
file_path = 'dataset.csv'
data = pd.read_csv(file_path, low_memory=False)

# Columns whose name contains 'post_' / 'pre_' are the postpaid / prepaid months.
postpaid_columns = [col for col in data.columns if 'post_' in col]
prepaid_columns = [col for col in data.columns if 'pre_' in col]
consumption_columns = postpaid_columns + prepaid_columns

# Convert the consumption columns to numeric; values that cannot be parsed become NaN.
data[consumption_columns] = data[consumption_columns].apply(pd.to_numeric, errors='coerce')

# Keep rows with at least half of their consumption readings present (and never
# fewer than one, so the row-wise fill below always has a value to propagate).
# `subset` limits the non-null count to the consumption columns; `.copy()` gives
# an independent frame so columns can be overwritten and added safely.
threshold = max(1, int(len(consumption_columns) * 0.5))
data_cleaned = data.dropna(subset=consumption_columns, thresh=threshold).copy()

# Fill gaps along each row's own month sequence (forward, then backward for
# leading gaps). Only consumption columns are filled: filling down the rows
# would copy one customer's values, identifiers included, into another's.
data_cleaned[consumption_columns] = data_cleaned[consumption_columns].ffill(axis=1).bfill(axis=1)

# --- Step 1: First-Order Differencing ---
# Month-over-month change within each row (axis=1). The first month has no
# predecessor, so its all-NaN difference column is dropped; no rows are lost.
# NOTE(review): assumes postpaid_columns + prepaid_columns is in chronological
# order, and that the last-postpaid -> first-prepaid step is a meaningful
# difference -- confirm against the dataset.
diff_data = data_cleaned[consumption_columns].diff(axis=1).iloc[:, 1:]

# --- Step 2: Z-Score Normalization ---
# Standardise each difference column. A zero (or undefined, for a single row)
# standard deviation is replaced by 1 so constant columns become 0 instead of
# NaN/inf, which the Isolation Forest would reject.
mean_values = diff_data.mean()
std_values = diff_data.std().replace(0, 1).fillna(1)

z_score_data = (diff_data - mean_values) / std_values

# --- Step 3: Apply Isolation Forest on Z-Score Normalized Data ---
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(z_score_data)

# Predict anomalies (-1 indicates anomaly, 1 indicates normal)
anomaly_labels = iso_forest.predict(z_score_data)

# Calculate anomaly scores (lower scores are more likely to be anomalies)
anomaly_scores = iso_forest.decision_function(z_score_data)

# z_score_data has exactly one row per data_cleaned row, in the same order,
# so the results can be attached directly.
data_cleaned['anomaly_label'] = anomaly_labels
data_cleaned['anomaly_score'] = anomaly_scores

# --- Step 4: Visualize Anomaly Scores ---
# The x-axis is the DataFrame index of the cleaned rows.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data_cleaned['anomaly_score'])
ax.set_title("Anomaly Scores After Z-Score Normalization + Isolation Forest")
ax.set_xlabel('Row index')
ax.set_ylabel('Anomaly score')
plt.show()

# Summarize the number of anomalies detected
num_anomalies = int((anomaly_labels == -1).sum())
print(f"Number of anomalies detected: {num_anomalies}")

# Save results (all cleaned rows, with their label and score columns)
data_cleaned.to_csv('anomalies_detected_zscore.csv', index=False)
