In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from matplotlib.backends.backend_pdf import PdfPages
import os

# ---------------------------------------------------------------------------
# Configuration: where the report, the metrics and the figures are written.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable project root so the notebook runs elsewhere.
# ---------------------------------------------------------------------------
output_path = r'C:\Users\mbpd1\downloads\upgrad\capstone\anom\reports'
vis_path = r"C:\Users\mbpd1\downloads\upgrad\capstone\anom\src\visualization"
os.makedirs(output_path, exist_ok=True)
# savefig does not create missing directories, so make sure this one exists too.
os.makedirs(vis_path, exist_ok=True)
pdf_file = os.path.join(output_path, 'EDA_and_Model_Evaluation.pdf')
results_file = os.path.join(output_path, 'model_results.txt')

# Upper bound on the rows fed to the pairplot (it is slow on large frames).
PAIRPLOT_MAX_ROWS = 500

# ---------------------------------------------------------------------------
# Load the preprocessed data and validate the target column.
# ---------------------------------------------------------------------------
processed_data = pd.read_csv(r'C:\Users\mbpd1\downloads\upgrad\capstone\anom\data\processed\preprocessed_data.csv')

if 'y' not in processed_data.columns:
    raise ValueError("'y' (target) column not found in the dataset")

# The EDA plots describe the data exactly as loaded, i.e. before any rows with
# a missing target are removed for modelling. dropna() below returns a new
# frame, so this reference keeps pointing at the unfiltered data.
eda_data = processed_data

# ---------------------------------------------------------------------------
# Build features (X) and target (y), dropping rows whose target is missing.
# ---------------------------------------------------------------------------
missing_targets = processed_data['y'].isnull().sum()
if missing_targets > 0:
    print(f"Warning: There are {missing_targets} missing values in the target 'y'. These rows will be dropped.")
    processed_data = processed_data.dropna(subset=['y'])

X = processed_data.drop('y', axis=1)
y = processed_data['y']

# ---------------------------------------------------------------------------
# Train / evaluate the model.
# NOTE(review): the recorded run shows a heavily imbalanced test set (5491
# negatives vs 29 positives), so accuracy alone is misleading -- read the
# per-class recall in the classification report. Consider passing
# `stratify=y` to train_test_split and/or `class_weight='balanced'` to the
# classifier; left unchanged here so existing results stay comparable.
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

# Persist the metrics as plain text next to the PDF report.
with open(results_file, 'w', encoding='utf-8') as f:
    f.write(f"Model Accuracy: {accuracy}\n")
    f.write("Classification Report:\n")
    f.write(f"{classification_rep}\n")
    f.write("Confusion Matrix:\n")
    f.write(f"{cm}\n")

# ---------------------------------------------------------------------------
# Write ALL plots into one PDF.
# PdfPages always creates a fresh file, so the PDF must be opened exactly
# once: opening it a second time (as the previous version did for the
# confusion matrix) silently discards the pages written earlier.
# ---------------------------------------------------------------------------
with PdfPages(pdf_file) as pdf:

    # Page 1: correlation heatmap. Restrict to numeric/bool columns so that a
    # stray string column cannot make corr() raise on recent pandas versions.
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        eda_data.select_dtypes(include=['number', 'bool']).corr(),
        annot=True,
        cmap='coolwarm',
        ax=heatmap_ax,
    )
    heatmap_ax.set_title('Correlation Heatmap')
    pdf.savefig(heatmap_fig)
    plt.close(heatmap_fig)

    # Page 2: pairplot on a reproducible sample. The sample size is capped at
    # the number of available rows because DataFrame.sample raises when asked
    # for more rows than exist (without replacement).
    pairplot_sample = eda_data.sample(
        n=min(PAIRPLOT_MAX_ROWS, len(eda_data)),
        random_state=42,
    )
    pairplot_grid = sns.pairplot(pairplot_sample)
    # pairplot returns a grid of axes; a figure-level suptitle is needed,
    # plt.title would only label the last subplot.
    pairplot_grid.figure.suptitle('Pairplot of Features (Sampled)', y=1.02)
    pdf.savefig(pairplot_grid.figure, bbox_inches='tight')
    plt.close(pairplot_grid.figure)

    # Page 3: confusion matrix, also exported as a standalone PNG.
    cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', ax=cm_ax)
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')
    cm_fig.savefig(os.path.join(vis_path, '1.png'))
    pdf.savefig(cm_fig)
    plt.close(cm_fig)

print(f"Plots saved to {pdf_file}")
print(f"Model results saved to {results_file}")


Model Accuracy: 0.9972826086956522
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5491
           1       0.85      0.59      0.69        29

    accuracy                           1.00      5520
   macro avg       0.92      0.79      0.85      5520
weighted avg       1.00      1.00      1.00      5520

Plots saved to C:\Users\mbpd1\downloads\upgrad\capstone\anom\reports\EDA_and_Model_Evaluation.pdf
Model results saved to C:\Users\mbpd1\downloads\upgrad\capstone\anom\reports\model_results.txt
