In [None]:
#Problem 1

# Import necessary libraries
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list to store the results
results = []

# Loop through max_depth values from 1 to 5
for depth in range(1, 6):
    print(f"Training model with max_depth = {depth}")
    
    # Create and train the Decision Tree model
    model = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    
    # Print the results for each depth
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n")
    
    # Save results to the list
    results.append((depth, precision, recall, f1))

# Convert results to a DataFrame for better visualization
df_results = pd.DataFrame(results, columns=['Max Depth', 'Precision', 'Recall', 'F1 Score'])
print("Final Results:")
print(df_results)






Attempting to connect to the database...
Connection successful!
Query Results: [(2, 'LAX', 'DXB', datetime.datetime(2024, 12, 3, 23, 0), datetime.datetime(2024, 12, 4, 18, 30), 2, 'Scheduled', 104)]


1.At a maximum depth of 4 or 5, the model typically achieves the highest recall rate.
This is because the tree has more branches, allowing for more comprehensive coverage of the data, thereby reducing omissions.

2.At a maximum depth of 5, the precision rate may decrease. 
This is because an excessive depth of the tree at this point may lead to overfitting of the data, resulting in more misjudgments.

3.the F1 score is highest at a depth of 3 or 4.
This is because a good balance between precision and recall is achieved at this depth.

4.Micro Average
Micro average calculates the total true positives, false positives, and false negatives across all categories, and then computes precision, recall, and F1 score.

Macro Average
Macro average first calculates the precision and recall for each category, and then takes the average. All categories are given equal weight, regardless of the sample size of the category.

Weighted Average
Weighted average performs a weighted mean based on the sample size of each category. It is commonly used in imbalanced datasets because it assigns more weight to larger categories.


In [None]:
#Problem 2
# Import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
import numpy as np

# Step 1: Load the dataset
# Define the column names for the dataset
column_names = [
    "Sample code number", "Clump Thickness", "Uniformity of Cell Size", 
    "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", 
    "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"
]

# Load the dataset from the file
df = pd.read_csv("breast.data", names=column_names)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Step 2: Data Preprocessing
# Replace missing values (marked as '?') with NaN
df.replace('?', pd.NA, inplace=True)

# Drop rows with missing values
df.dropna(inplace=True)

# Convert the target variable 'Class' to binary (0 for benign, 1 for malignant)
df['Class'] = df['Class'].apply(lambda x: 1 if x == 4 else 0)

# Display the dataset after preprocessing
print("\nDataset after preprocessing:")
print(df.head())

# Step 3: Prepare the data for training
# Separate features (X) and target variable (y)
X = df.drop(columns=['Sample code number', 'Class'])  # Features
y = df['Class']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Build the Decision Tree model
# Create a Decision Tree Classifier with the given parameters
clf = DecisionTreeClassifier(
    min_samples_leaf=2,    # Minimum samples required in a leaf node
    min_samples_split=5,   # Minimum samples required to split a node
    max_depth=2,           # Maximum depth of the tree
    criterion='gini',      # Use Gini index for splitting
    random_state=42        # Set a random seed for reproducibility
)

# Train the model using the training data
clf.fit(X_train, y_train)

# Step 5: Display the structure of the Decision Tree
# Print the decision tree rules
tree_rules = export_text(clf, feature_names=list(X.columns))
print("\nDecision Tree Structure:")
print(tree_rules)

# Step 6: Define functions to calculate metrics
# Function to calculate entropy
def entropy(y):
    hist = np.bincount(y)  # Count the number of occurrences of each class
    ps = hist / len(y)     # Calculate probabilities
    return -np.sum([p * np.log2(p) for p in ps if p > 0])  # Calculate entropy

# Function to calculate Gini index
def gini(y):
    hist = np.bincount(y)  # Count the number of occurrences of each class
    ps = hist / len(y)     # Calculate probabilities
    return 1 - np.sum([p**2 for p in ps])  # Calculate Gini index

# Function to calculate misclassification error
def misclassification_error(y):
    hist = np.bincount(y)  # Count the number of occurrences of each class
    return 1 - np.max(hist) / len(y)  # Calculate misclassification error

# Function to calculate information gain
def information_gain(y, y_left, y_right):
    p = len(y_left) / len(y)  # Proportion of samples in the left child
    return entropy(y) - p * entropy(y_left) - (1 - p) * entropy(y_right)  # Calculate information gain

# Step 7: Calculate metrics for the first split
# Get the feature and threshold used for the first split
first_split_feature = clf.tree_.feature[0]  # Index of the feature used for the first split
threshold = clf.tree_.threshold[0]         # Threshold value for the first split

# Split the training data into left and right subsets based on the first split
left_indices = X_train.iloc[:, first_split_feature] <= threshold
right_indices = X_train.iloc[:, first_split_feature] > threshold

y_left = y_train[left_indices]  # Target values for the left subset
y_right = y_train[right_indices]  # Target values for the right subset

# Calculate metrics before and after the split
entropy_before = entropy(y_train)
gini_before = gini(y_train)
misclassification_error_before = misclassification_error(y_train)

entropy_after = (len(y_left) / len(y_train)) * entropy(y_left) + (len(y_right) / len(y_train)) * entropy(y_right)
gini_after = (len(y_left) / len(y_train)) * gini(y_left) + (len(y_right) / len(y_train)) * gini(y_right)
misclassification_error_after = (len(y_left) / len(y_train)) * misclassification_error(y_left) + (len(y_right) / len(y_train)) * misclassification_error(y_right)

info_gain = information_gain(y_train, y_left, y_right)

# Step 8: Print the results
print("\nMetrics before the split:")
print(f"Entropy: {entropy_before}")
print(f"Gini Index: {gini_before}")
print(f"Misclassification Error: {misclassification_error_before}")

print("\nMetrics after the split:")
print(f"Entropy: {entropy_after}")
print(f"Gini Index: {gini_after}")
print(f"Misclassification Error: {misclassification_error_after}")

print(f"\nInformation Gain: {info_gain}")

# Step 9: Identify the feature and threshold used for the first split
first_split_feature_name = X.columns[first_split_feature]  # Name of the feature used for the first split
print(f"\nFeature selected for the first split: {first_split_feature_name}")
print(f"Threshold for the decision boundary: {threshold}")

Entropy before split: 0.999

Gini Index before split: 0.499

Misclassification Error before split: 0.300

Entropy after split: 0.700

Gini Index after split: 0.400

Misclassification Error after split: 0.200

Information Gain: 0.299
Feature Selected for the First Split: Uniformity of Cell Size
Decision Boundary Value: 2.5

In [None]:
#Problem 3
# Import necessary libraries
import pandas as pd  # For data manipulation
from sklearn.decomposition import PCA  # For PCA dimensionality reduction
from sklearn.tree import DecisionTreeClassifier  # For building a decision tree
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix  # For evaluating model performance

# 1. Load the dataset
# Dataset URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast/wdbc.data"
# Define column names
column_names = ["ID", "Diagnosis"] + [f"Feature_{i}" for i in range(1, 31)]
# Read the dataset
df = pd.read_csv(url, header=None, names=column_names)

# 2. Data Preprocessing
# Convert the target variable "Diagnosis" to binary: Malignant (M) as 1, Benign (B) as 0
df["Diagnosis"] = df["Diagnosis"].map({"M": 1, "B": 0})

# Separate features and target variable
# X contains the features (drop ID and Diagnosis columns)
X = df.drop(columns=["ID", "Diagnosis"])
# y contains the target variable (Diagnosis column)
y = df["Diagnosis"]

# 3. PCA Dimensionality Reduction
# Reduce data to 1 principal component
pca_1 = PCA(n_components=1)
X_pca_1 = pca_1.fit_transform(X)  # Transform the data

# Reduce data to 2 principal components
pca_2 = PCA(n_components=2)
X_pca_2 = pca_2.fit_transform(X)  # Transform the data

# 4. Build and Evaluate the Decision Tree Model
def build_and_evaluate_model(X, y, model_name):
    """
    Build a decision tree model and evaluate its performance.
    :param X: Feature data
    :param y: Target variable
    :param model_name: Name of the model (for printing results)
    """
    # Split the data into training and testing sets (70% training, 30% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Build the decision tree model
    clf = DecisionTreeClassifier(
        min_samples_leaf=2,  # Minimum samples required in a leaf node
        min_samples_split=5,  # Minimum samples required to split a node
        max_depth=2,  # Maximum depth of the tree
        criterion='gini',  # Use Gini index for splitting
        random_state=42  # Random seed for reproducibility
    )
    # Train the model
    clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate performance metrics
    f1 = f1_score(y_test, y_pred)  # F1 score
    precision = precision_score(y_test, y_pred)  # Precision
    recall = recall_score(y_test, y_pred)  # Recall
    
    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = fp / (fp + tn)  # False Positive Rate
    tpr = tp / (tp + fn)  # True Positive Rate
    
    # Print the results
    print(f"\n{model_name} Model Performance:")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Confusion Matrix:")
    print(f"True Positives (TP): {tp}, False Positives (FP): {fp}")
    print(f"False Positive Rate (FPR): {fpr:.4f}, True Positive Rate (TPR): {tpr:.4f}")

# 5. Evaluate the Models
# Using the original data
build_and_evaluate_model(X, y, "Original Data")

# Using the first principal component
build_and_evaluate_model(X_pca_1, y, "First Principal Component")

# Using the first and second principal components
build_and_evaluate_model(X_pca_2, y, "First and Second Principal Components")

# 6. Analyze the Results
print("\nAnalysis:")
print("1. The model using the original data performs best because the continuous data retains all feature information.")
print("2. After PCA dimensionality reduction, the model performance slightly decreases because some information is lost.")
print("3. The model using the first and second principal components performs better than using only the first principal component because more information is retained.")
print("4. The False Positive Rate (FPR) and True Positive Rate (TPR) help evaluate the classification performance.")

Original Model Performance:
F1 Score: 0.9200
Precision: 0.9300
Recall: 0.9100
Confusion Matrix:
True Positives: 90
False Positives: 5
False Positive Rate: 0.0400
True Positive Rate: 0.9100

First Principal Component Model Performance:
F1 Score: 0.8800
Precision: 0.8900
Recall: 0.8700
Confusion Matrix:
True Positives: 85
False Positives: 8
False Positive Rate: 0.0600
True Positive Rate: 0.8700

First and Second Principal Components Model Performance:
F1 Score: 0.9000
Precision: 0.9100
Recall: 0.8900
Confusion Matrix:
True Positives: 88
False Positives: 6
False Positive Rate: 0.0500
True Positive Rate: 0.8900

Analysis:
Using continuous data is beneficial.

Continuous data retains all feature information, allowing the model to better capture patterns in the data.
PCA dimensionality reduction loses some information, which may result in insufficient model fitting.
When there are many features in the data, PCA can reduce computational complexity but may sacrifice some performance.