Title: Classification<br>

Task 1:<br>
Objective: Identify if an email is spam or not spam.<br>
Load the UCI Spambase Dataset.<br>
Goal: Create a model that classifies emails into two categories: "spam" and "not spam."


In [4]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Fetch the UCI Spambase data directly from the UCI repository by URL
# (pd.read_csv is given the URL below, not a local file path).
# Dataset page: https://archive.ics.uci.edu/ml/datasets/spambase
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
# Companion file for the dataset; kept for reference, it is not read below.
column_names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names"

# header=None: no row is treated as a header, so columns get integer labels.
df = pd.read_csv(url, header=None)

# Every column except the last is a feature; the last column is the label
# (1 = spam, 0 = not spam).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
y_pred = clf.predict(X_test_scaled)

# Report overall accuracy plus per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test, y_pred, target_names=["Not Spam", "Spam"]
)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.9565532223026793

Classification Report:
               precision    recall  f1-score   support

    Not Spam       0.95      0.98      0.96       804
        Spam       0.97      0.93      0.95       577

    accuracy                           0.96      1381
   macro avg       0.96      0.95      0.96      1381
weighted avg       0.96      0.96      0.96      1381



Task 2:<br>
Objective: Diagnose whether a tumor is malignant or benign.<br>
Load the Breast Cancer Wisconsin dataset.<br>
Goal: Build a binary classification model to classify tumors.

In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Breast Cancer Wisconsin data, loaded through sklearn.datasets.
data = load_breast_cancer()
X, y = data.data, data.target

# 80/20 train/test partition, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression on the standardized features.
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out rows.
y_pred = model.predict(X_test_scaled)

# Report overall accuracy and the per-class metrics, labelled with the
# dataset's own class names.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test, y_pred, target_names=data.target_names
)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.9824561403508771

Classification Report:
               precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        42
      benign       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114




Task 3:<br>
Objective: Determine whether a transaction is fraudulent or legitimate.<br>
Use a credit card transaction dataset.<br>
Goal: Classify transactions into "fraudulent" and "legitimate" categories.

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Simulate a dataset shaped like a credit card fraud problem.
# NOTE: the features are drawn from np.random.randn and the labels are
# shuffled independently of them, so the metrics printed below describe a
# model trained on noise, not real fraud-detection performance.
np.random.seed(42)
n_samples = 10000
n_features = 10

# Features: standard-normal random numbers
X = np.random.randn(n_samples, n_features)

# Target: highly imbalanced (1% fraud), positions randomized by the shuffle
y = np.zeros(n_samples)
y[:int(0.01 * n_samples)] = 1  # 1% fraud
np.random.shuffle(y)

# Convert to DataFrame with columns V1..V{n_features} plus the 'Class' label
df = pd.DataFrame(X, columns=[f'V{i}' for i in range(1, n_features+1)])
df['Class'] = y.astype(int)

# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Train-test split FIRST (stratify for imbalanced data), so that the
# held-out rows play no part in any fitted preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit the scaler on the training rows only, then apply the
# same transformation to the test rows. Fitting on the full dataset before
# splitting would leak test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train Logistic Regression with class_weight balanced
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.554

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.56      0.71      1980
           1       0.01      0.40      0.02        20

    accuracy                           0.55      2000
   macro avg       0.50      0.48      0.36      2000
weighted avg       0.98      0.55      0.70      2000


Confusion Matrix:
 [[1100  880]
 [  12    8]]
