## Question No: 1

# In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
np.random.seed(42)

# Load MNIST dataset as plain arrays; the labels are cast to uint8 so they
# can be used as integer class ids.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
X, y = mnist["data"], mnist["target"].astype(np.uint8)
# First 60000 rows are used for training, the remainder for testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Scale pixel values to [0, 1]
X_train_scaled = X_train / 255.0
X_test_scaled = X_test / 255.0

# Define hyperparameter grid (3 x 2 = 6 candidate configurations)
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
}

# Initialize KNN classifier
knn_clf = KNeighborsClassifier()

# Perform grid search: 3-fold cross-validation, scored by accuracy, using all
# available cores.
grid_search = GridSearchCV(knn_clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refit on the whole training set passed to fit(); calling
# fit() on it again would only repeat that work.
best_knn = grid_search.best_estimator_

# Evaluate on test set
y_pred_knn = best_knn.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Test Accuracy:", accuracy_knn)

## Question No: 2

# In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.ndimage import shift
import matplotlib.pyplot as plt

# Seed NumPy's global random generator.
np.random.seed(42)

# Fetch MNIST as arrays and cast the labels to uint8.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

# Rows before index 60000 form the training set, the rest the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Divide by 255 to rescale the pixel values.
X_train_scaled, X_test_scaled = X_train / 255.0, X_test / 255.0

# Translate a flattened 28x28 image by (x, y) pixels
def shift_image(image, x, y):
    """Return a shifted copy of a flattened 28x28 image.

    Args:
        image: array with 784 elements, reshaped to 28 rows by 28 columns.
        x: shift along the column axis (positive moves content right).
        y: shift along the row axis (positive moves content down).

    Returns:
        A new flat array of 784 elements holding the shifted image.
        Interpolation and border handling are scipy.ndimage.shift defaults.
    """
    grid = image.reshape(28, 28)
    # scipy expects the offsets in axis order: rows first, then columns.
    moved = shift(grid, [y, x])
    return moved.flatten()

# Augment training set: the original images come first, followed by one
# shifted copy of every image per direction (right, left, down, up).
shifts = [(1, 0), (-1, 0), (0, 1), (0, -1)]
n_train = len(X_train_scaled)

# Allocate the final array once and fill it in place. Growing the array with
# np.vstack / np.append for every image copies everything accumulated so far
# on each step, which is quadratic in the number of images.
X_train_augmented = np.empty(
    (n_train * (len(shifts) + 1), X_train_scaled.shape[1]),
    dtype=X_train_scaled.dtype,
)
X_train_augmented[:n_train] = X_train_scaled

# dx/dy are used instead of x/y so the label array `y` is not overwritten.
for block, (dx, dy) in enumerate(shifts, start=1):
    start = block * n_train
    for offset, image in enumerate(X_train_scaled):
        X_train_augmented[start + offset] = shift_image(image, dx, dy)

# Every shifted copy keeps the label of its source image, so the labels are
# simply the training labels repeated once per block.
y_train_augmented = np.tile(y_train, len(shifts) + 1)

print("Augmented Training Set Shape:", X_train_augmented.shape)  # (300000, 784)

# Train best KNN model (from Exercise 1: n_neighbors=3, weights='distance')
knn_clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn_clf.fit(X_train_augmented, y_train_augmented)

# Evaluate on test set
y_pred_aug_knn = knn_clf.predict(X_test_scaled)
accuracy_aug_knn = accuracy_score(y_test, y_pred_aug_knn)
print("Augmented KNN Test Accuracy:", accuracy_aug_knn)

# Visualize a few augmented images. The first n_train rows are the unshifted
# originals, so start at n_train to show images that were actually shifted.
for i in range(5):
    plt.imshow(X_train_augmented[n_train + i].reshape(28, 28), cmap='gray')
    plt.title(f"Augmented Digit: {y_train_augmented[n_train + i]}")
    plt.axis('off')
    plt.show()

## Question No: 3

# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Note: Download Titanic dataset from https://www.kaggle.com/c/titanic
# Or use Kaggle API: kaggle competitions download -c titanic
# Place train.csv and test.csv in working directory

# Load dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Preprocessing
def preprocess_titanic(df, imputer=None):
    """Build the model feature matrix from a raw Titanic DataFrame.

    Selects the feature columns, encodes 'Sex' as 0/1, one-hot encodes
    'Embarked' (dropping the first level) and mean-imputes 'Age' and 'Fare'.

    Args:
        df: DataFrame containing the columns 'Pclass', 'Sex', 'Age',
            'SibSp', 'Parch', 'Fare' and 'Embarked'.
        imputer: optional imputer that has ALREADY been fitted on the
            ['Age', 'Fare'] columns of the training data. When given, it is
            only applied (transform), so held-out data is filled with
            training statistics. When None, a new mean imputer is fitted on
            `df` itself.

    Returns:
        A new DataFrame of features; `df` is not modified.
    """
    # Select features
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    X = df[features].copy()

    # Encode categorical variables
    X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
    X = pd.get_dummies(X, columns=['Embarked'], drop_first=True)

    # Impute missing values
    impute_cols = ['Age', 'Fare']
    if imputer is None:
        imputer = SimpleImputer(strategy='mean')
        X[impute_cols] = imputer.fit_transform(X[impute_cols])
    else:
        X[impute_cols] = imputer.transform(X[impute_cols])

    return X

# Prepare training data. The raw rows are split BEFORE preprocessing so the
# imputation means are learned from the training part only; fitting them on
# the full frame would leak validation information into training.
y = train_df['Survived']
train_part, val_part, y_train, y_val = train_test_split(
    train_df, y, test_size=0.2, random_state=42
)
imputer = SimpleImputer(strategy='mean').fit(train_part[['Age', 'Fare']])
X_train = preprocess_titanic(train_part, imputer)
X_val = preprocess_titanic(val_part, imputer)

# Scale features (statistics come from the training part only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_val_scaled)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, y_pred_rf))

# Train Logistic Regression
lr_clf = LogisticRegression(random_state=42)
lr_clf.fit(X_train_scaled, y_train)
y_pred_lr = lr_clf.predict(X_val_scaled)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_lr))

# Prepare test set predictions (for Kaggle submission). The test set reuses
# the imputer and scaler fitted on the training part instead of learning its
# own statistics.
X_test = preprocess_titanic(test_df, imputer)
X_test_scaled = scaler.transform(X_test)
y_test_pred = rf_clf.predict(X_test_scaled)

# Save submission
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_test_pred})
submission.to_csv('titanic_submission.csv', index=False)
print("Submission file saved as 'titanic_submission.csv'")

## Question No: 4

# In [None]:
import os
import email
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import re

# Note: Download SpamAssassin datasets from https://spamassassin.apache.org/old/publiccorpus/
# Place in 'spam' and 'ham' folders in working directory

# Helper: decode one message part to text
def _decode_text_part(part):
    """Return the decoded text of a single message part ('' if it has none)."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    # Honour the charset declared by the part; decoding everything as latin1
    # turns UTF-8 (and other multi-byte) text into mojibake.
    charset = part.get_content_charset() or 'latin1'
    try:
        return payload.decode(charset, errors='ignore')
    except (LookupError, ValueError):
        # Unknown or unusable charset name: fall back to latin1, which can
        # decode any byte sequence.
        return payload.decode('latin1', errors='ignore')


# Function to extract email body
def extract_email_body(file_path):
    """Read an email file and return its cleaned plain-text body.

    For multipart messages the text of every 'text/plain' part is
    concatenated; otherwise the single payload is used. The text is then
    lowercased and stripped of punctuation and digits.

    Args:
        file_path: path of the email file (read as latin1 text).

    Returns:
        The cleaned body as a string (may be empty).
    """
    with open(file_path, 'r', encoding='latin1') as f:
        msg = email.message_from_file(f)

    if msg.is_multipart():
        parts = [p for p in msg.walk() if p.get_content_type() == 'text/plain']
    else:
        parts = [msg]
    body = "".join(_decode_text_part(p) for p in parts)

    # Clean text: lowercase, remove punctuation, numbers
    body = re.sub(r'[^\w\s]', '', body.lower())
    return re.sub(r'\d+', '', body)

# Load dataset. os.listdir returns names in arbitrary order, so the listings
# are sorted to make the 500-file subsets below the same on every run.
# NOTE(review): confirm the corpus files really carry a '.txt' extension;
# if they do not, this filter yields an empty dataset.
spam_files = sorted(
    os.path.join('spam', f) for f in os.listdir('spam') if f.endswith('.txt')
)
ham_files = sorted(
    os.path.join('ham', f) for f in os.listdir('ham') if f.endswith('.txt')
)

# Extract email bodies
selected_spam = spam_files[:500]  # Limit for speed
selected_ham = ham_files[:500]
emails = [extract_email_body(path) for path in selected_spam + selected_ham]
# Label by the list a file came from (1 = spam, 0 = ham). A substring test on
# the path would mislabel any ham file whose name happens to contain "spam".
labels = [1] * len(selected_spam) + [0] * len(selected_ham)

# Convert to DataFrame
df = pd.DataFrame({'email': emails, 'label': labels})

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['email'], df['label'], test_size=0.2, random_state=42)

# Create feature vectors; the vocabulary is learned from the training split
# only and then applied to the test split.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Classifiers to compare
nb_clf = MultinomialNB()
svm_clf = SVC(kernel='linear', random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train each model and report precision, recall and F1 on the test split
predictions = {}
for name, clf in [("Naive Bayes", nb_clf), ("SVM", svm_clf), ("Random Forest", rf_clf)]:
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    predictions[name] = y_pred
    print(f"{name} - Precision:", precision_score(y_test, y_pred))
    print(f"{name} - Recall:", recall_score(y_test, y_pred))
    print(f"{name} - F1 Score:", f1_score(y_test, y_pred))

# Keep the per-model prediction names available for later inspection
y_pred_nb = predictions["Naive Bayes"]
y_pred_svm = predictions["SVM"]
y_pred_rf = predictions["Random Forest"]