# Machine Learning Lab Exam — Universal Template

Paste your dataset, set the target column, and run all cells sequentially for a complete ML workflow.

## Environment Setup (Run Once)

Run these commands in your terminal **before starting the notebook** to set up the required environment.

```bash
conda create -n ml_env python=3.10 -y
conda activate ml_env
pip install numpy
pip install pandas
pip install matplotlib
pip install seaborn
pip install scikit-learn
pip install scipy
pip install pillow
pip install tensorflow==2.12
pip install keras
pip install opencv-python
pip install statsmodels
```

## Section 1 — Common Imports & Utils

In [ ]:
# Universal Imports
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import (
    confusion_matrix, classification_report, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc,
    silhouette_score,
)
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')


# Helper: Automatic train/test split
def auto_train_test_split(X, y, test_size=0.2, stratify=None, random_state=42):
    """Split X and y into train and test parts.

    Args:
        X: Feature table.
        y: Target values aligned with X.
        test_size: Fraction (or count) of rows held out for testing.
        stratify: Labels to stratify on, or None for a plain random split.
            ``True`` is accepted as shorthand for "stratify on y".
        random_state: Seed that makes the split reproducible.

    Returns:
        X_train, X_test, y_train, y_test, as returned by train_test_split.
    """
    if stratify is True:
        stratify = y
    # train_test_split treats stratify=None as "no stratification", so the
    # caller's labels can be forwarded directly instead of always using y.
    return train_test_split(
        X, y, test_size=test_size, stratify=stratify, random_state=random_state
    )


# Helper: Scaling
def scale_features(X_train, X_test):
    """Standardise features using statistics from the training data only.

    Returns:
        (X_train_scaled, X_test_scaled, scaler); the fitted scaler is
        returned so the same transform can be reused later.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler


# Helper: Encoding categoricals
def encode_categoricals(df, exclude=None):
    """Label-encode every object/category column that is not excluded.

    Args:
        df: Input DataFrame; it is copied, never modified in place.
        exclude: Column names to leave untouched (default: none).

    Returns:
        (encoded_df, label_encoders) where label_encoders maps each encoded
        column name to its fitted LabelEncoder.
    """
    # None instead of a mutable [] default, which would be shared across calls.
    exclude = () if exclude is None else exclude
    df = df.copy()
    label_encoders = {}
    for col in df.select_dtypes(include=['object', 'category']).columns:
        if col not in exclude:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le
    return df, label_encoders


# Helper: Plot confusion matrix
def plot_confusion(y_true, y_pred, labels=None, title='Confusion Matrix'):
    """Draw a confusion-matrix heatmap with class names on both axes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Class labels, in display order. When omitted, the sorted
            union of the labels found in y_true and y_pred is used.
        title: Plot title.
    """
    if labels is None:
        labels = np.unique(
            np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
        )
    # Passing the same labels to confusion_matrix keeps matrix rows/columns
    # aligned with the tick labels even if a class is absent from this split.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()


# Helper: EDA summary
def eda_summary(df):
    """Print the shape, column info and descriptive statistics of df."""
    print('Shape:', df.shape)
    print('\nInfo:')
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print().
    df.info()
    print('\nDescribe:')
    print(df.describe(include='all'))


# Helper: Print metrics
def print_classification_metrics(y_true, y_pred):
    """Print accuracy and weighted precision, recall and F1."""
    print('Accuracy:', accuracy_score(y_true, y_pred))
    print('Precision:', precision_score(y_true, y_pred, average='weighted', zero_division=0))
    print('Recall:', recall_score(y_true, y_pred, average='weighted', zero_division=0))
    print('F1 Score:', f1_score(y_true, y_pred, average='weighted', zero_division=0))


def print_regression_metrics(y_true, y_pred):
    """Print MSE, RMSE and R^2 to four decimal places."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'R^2: {r2:.4f}')

## Section 2 — Data Loading Template

In [ ]:
# Load your dataset here
df = pd.read_csv('your_dataset.csv')  # Change filename as needed
TARGET_COLUMN = 'target'  # Change to your target column name

if TARGET_COLUMN not in df.columns:
    raise KeyError(f"Target column '{TARGET_COLUMN}' not found. Available columns: {list(df.columns)}")

# Rows without a target value cannot be used for supervised learning; left
# in place they would be turned into a bogus 'nan' class by astype(str) below.
df = df.dropna(subset=[TARGET_COLUMN]).reset_index(drop=True)

# Detect feature types
feature_cols = [col for col in df.columns if col != TARGET_COLUMN]
numeric_features = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()

# Handle missing values
for col in numeric_features:
    median = df[col].median()
    # An all-NaN column has a NaN median; fall back to 0 so no NaN survives.
    df[col] = df[col].fillna(0 if pd.isna(median) else median)
for col in categorical_features:
    modes = df[col].mode()
    if modes.empty:
        # mode() is empty for an all-NaN column; cast to object first so the
        # placeholder is accepted even for 'category' dtype columns.
        df[col] = df[col].astype(object).fillna('missing')
    else:
        df[col] = df[col].fillna(modes.iloc[0])

# Encode categoricals (excluding target)
df_encoded, label_encoders = encode_categoricals(df, exclude=[TARGET_COLUMN])

# Encode target if categorical
if df[TARGET_COLUMN].dtype == 'object' or str(df[TARGET_COLUMN].dtype) == 'category':
    target_le = LabelEncoder()
    df_encoded[TARGET_COLUMN] = target_le.fit_transform(df[TARGET_COLUMN].astype(str))
else:
    target_le = None

# Scale numeric features
# NOTE: the scaler is fitted on all rows, before any train/test split done by
# later cells, so test-row statistics influence the scaling. Use
# scale_features(X_train, X_test) after splitting when that matters.
X = df_encoded[feature_cols]
y = df_encoded[TARGET_COLUMN]
X_scaled = X.copy()
if len(numeric_features) > 0:
    scaler = StandardScaler()
    X_scaled[numeric_features] = scaler.fit_transform(X[numeric_features])

print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)
print('Target:', TARGET_COLUMN)

## Section 3 — Exploratory Data Analysis (EDA)

In [ ]:
# EDA Summary
eda_summary(df)

# Missing value heatmap
# NOTE: the data-loading cell fills missing feature values before this cell
# runs, so only gaps that remain after that step show up here.
plt.figure(figsize=(8, 4))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Value Heatmap')
plt.show()

# Correlation heatmap (numeric only)
if len(numeric_features) > 1:
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[numeric_features].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

# Pairplot (numeric only); colour by target only when it has few distinct values
if len(numeric_features) > 1:
    sns.pairplot(df, vars=numeric_features, hue=TARGET_COLUMN if len(df[TARGET_COLUMN].unique()) < 10 else None)
    plt.show()

# Target distribution: bar chart for few distinct values, histogram otherwise
plt.figure(figsize=(6, 4))
if df[TARGET_COLUMN].nunique() < 20:
    sns.countplot(x=TARGET_COLUMN, data=df)
else:
    df[TARGET_COLUMN].hist(bins=20)
plt.title('Target Distribution')
plt.show()

# Feature histograms
# DataFrame.hist raises ValueError when the frame has no numeric columns,
# so skip the plot for all-categorical datasets.
if numeric_features:
    df[numeric_features].hist(figsize=(12, 8), bins=20)
    plt.tight_layout()
    plt.show()
else:
    print('No numeric features to plot histograms for.')

### Section 4 — Linear Regression

Performs simple or multiple linear regression automatically. Evaluates with MSE, RMSE, R², and plots actual vs predicted and residuals.

In [ ]:
from sklearn.linear_model import LinearRegression

# Only use numeric target for regression.
# pandas' dtype helpers are used instead of np.issubdtype because the latter
# raises TypeError for pandas extension dtypes such as 'category'. Booleans
# are excluded explicitly so they keep being treated as non-numeric.
target_dtype = df[TARGET_COLUMN].dtype
target_is_numeric = (
    pd.api.types.is_numeric_dtype(target_dtype)
    and not pd.api.types.is_bool_dtype(target_dtype)
)

if target_is_numeric:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print_regression_metrics(y_test, y_pred)

    # Actual vs Predicted; the dashed diagonal marks perfect predictions
    plt.figure(figsize=(6, 4))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs Predicted')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.show()

    # Residual plot
    residuals = y_test - y_pred
    plt.figure(figsize=(6, 4))
    plt.scatter(y_pred, residuals, alpha=0.7)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Predicted')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.show()
else:
    print('Target is not numeric. Skipping Linear Regression.')

### Section 5 — Logistic Regression

Performs binary or multiclass logistic regression. Evaluates with accuracy, precision, recall, F1, confusion matrix, and ROC curve (binary).

In [ ]:
from sklearn.linear_model import LogisticRegression

# Only use categorical/discrete target for classification
n_target_classes = df[TARGET_COLUMN].nunique()
if 2 <= n_target_classes < 100:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)
    # multi_class is left at its default: 'auto' has long been the default,
    # and passing the argument explicitly is deprecated in recent scikit-learn.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_classification_metrics(y_test, y_pred)
    plot_confusion(y_test, y_pred, labels=np.unique(y))

    # ROC Curve (binary only)
    if len(clf.classes_) == 2:
        # Column 1 of predict_proba belongs to clf.classes_[1]; naming it as
        # pos_label keeps roc_curve working for label pairs other than
        # {0, 1} / {-1, 1} (e.g. {1, 2}).
        y_score = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1])
        roc_auc = auc(fpr, tpr)
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.show()
else:
    print('Target is not suitable for Logistic Regression.')

### Section 6 — Decision Trees

Trains decision trees with both the entropy (ID3-style) and Gini (CART) split criteria. Supports depth control, accuracy, and tree visualization (if small).

In [ ]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Classification is attempted only for targets with 2 to 99 distinct values.
n_target_classes = df[TARGET_COLUMN].nunique()
if not (2 <= n_target_classes < 100):
    print('Target is not suitable for Decision Trees.')
else:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)
    # One depth-limited tree per split criterion, evaluated on the same split.
    for crit in ('entropy', 'gini'):
        print(f'\nDecision Tree ({crit.upper()})')
        dt = DecisionTreeClassifier(criterion=crit, max_depth=5, random_state=42).fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        print_classification_metrics(y_test, y_pred)
        plot_confusion(y_test, y_pred, labels=np.unique(y), title=f'Decision Tree ({crit}) Confusion')

        # The tree is drawn (top two levels only) when there are at most 10 features.
        if X_train.shape[1] > 10:
            continue
        class_name_strings = list(map(str, np.unique(y)))
        plt.figure(figsize=(12, 6))
        plot_tree(dt, feature_names=feature_cols, class_names=class_name_strings, filled=True, max_depth=2)
        plt.title(f'Decision Tree ({crit}) Visualization')
        plt.show()

### Section 7 — Random Forest

Trains a random forest classifier, shows feature importances, confusion matrix, and accuracy.

In [ ]:
from sklearn.ensemble import RandomForestClassifier

# Classification is attempted only for targets with 2 to 99 distinct values.
n_target_classes = df[TARGET_COLUMN].nunique()
if not (2 <= n_target_classes < 100):
    print('Target is not suitable for Random Forest.')
else:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)
    rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print_classification_metrics(y_test, y_pred)
    plot_confusion(y_test, y_pred, labels=np.unique(y), title='Random Forest Confusion')

    # Feature importance, bars ordered from most to least important
    importances = rf.feature_importances_
    indices = np.argsort(importances)[::-1]
    bar_positions = range(X.shape[1])
    ranked_names = [feature_cols[idx] for idx in indices]
    plt.figure(figsize=(8, 4))
    plt.title('Feature Importances')
    plt.bar(bar_positions, importances[indices], align='center')
    plt.xticks(bar_positions, ranked_names, rotation=90)
    plt.tight_layout()
    plt.show()

### Section 8 — K-Nearest Neighbors (KNN)

Performs KNN classification with automatic K selection, accuracy vs K plot, and confusion matrix.

In [ ]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

n_target_classes = df[TARGET_COLUMN].nunique()
if 2 <= n_target_classes < 100:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)

    # K is chosen by cross-validation on the training split only. Picking K
    # by test-set accuracy and then reporting that same test set would give
    # an optimistically biased score.
    n_folds = 5
    # Each CV training fold holds about (n_folds - 1) / n_folds of the
    # training rows, and K cannot exceed the number of fitted samples.
    max_k = min(20, (len(X_train) * (n_folds - 1)) // n_folds)
    k_range = range(1, max_k + 1)
    scores = [
        cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=n_folds).mean()
        for k in k_range
    ]

    # Plot accuracy vs K
    plt.figure(figsize=(8, 4))
    plt.plot(k_range, scores, marker='o')
    plt.xlabel('K')
    plt.ylabel('Accuracy (cross-validated)')
    plt.title('KNN Accuracy vs K')
    plt.show()

    # Best K: refit on the full training split, evaluate once on the test split
    best_k = k_range[int(np.argmax(scores))]
    print(f'Best K: {best_k}')
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print_classification_metrics(y_test, y_pred)
    plot_confusion(y_test, y_pred, labels=np.unique(y), title='KNN Confusion')
else:
    print('Target is not suitable for KNN.')

### Section 9 — Support Vector Machines (SVM)

Trains SVM with linear, polynomial, and RBF kernels. Compares accuracies.

In [ ]:
from sklearn.svm import SVC

n_target_classes = df[TARGET_COLUMN].nunique()
if 2 <= n_target_classes < 100:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)
    kernels = ['linear', 'poly', 'rbf']
    accs = []
    for kernel in kernels:
        print(f'\nSVM ({kernel} kernel)')
        # probability=True is not requested: this cell only calls predict(),
        # and enabling probability estimates makes fit() run an extra
        # internal cross-validation for no benefit here.
        clf = SVC(kernel=kernel, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accs.append(acc)
        print_classification_metrics(y_test, y_pred)
        plot_confusion(y_test, y_pred, labels=np.unique(y), title=f'SVM ({kernel}) Confusion')

    # Compare accuracies
    plt.figure(figsize=(6, 4))
    plt.bar(kernels, accs)
    plt.ylabel('Accuracy')
    plt.title('SVM Kernel Comparison')
    plt.show()
else:
    print('Target is not suitable for SVM.')

### Section 10 — Artificial Neural Network (ANN)

Trains a single-layer perceptron and a multilayer ANN using Keras. Plots accuracy and loss curves.

In [ ]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.utils import to_categorical

n_target_classes = df[TARGET_COLUMN].nunique()
if 2 <= n_target_classes < 100:
    # Re-index the class labels to 0..n_classes-1. Numeric targets are not
    # label-encoded by the loading cell, so they may be e.g. {1, 2, 3}; the
    # one-hot width and the binary loss both assume zero-based class ids.
    class_values, class_codes = np.unique(np.asarray(y), return_inverse=True)
    n_classes = len(class_values)
    y_codes = pd.Series(class_codes.ravel(), index=y.index, name=TARGET_COLUMN)

    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y_codes, test_size=0.2, stratify=y_codes)

    # Hand Keras plain float32 arrays rather than DataFrames of mixed dtypes.
    X_train_arr = np.asarray(X_train, dtype='float32')
    X_test_arr = np.asarray(X_test, dtype='float32')
    n_inputs = X_train_arr.shape[1]

    # Prepare targets for Keras
    if n_classes > 2:
        # num_classes is fixed so train and test one-hot matrices always have
        # the same width, even if a class is missing from one of the splits.
        y_train_cat = to_categorical(y_train, num_classes=n_classes)
        y_test_cat = to_categorical(y_test, num_classes=n_classes)
        out_units = n_classes
        loss_fn = 'categorical_crossentropy'
        activation = 'softmax'
    else:
        y_train_cat = np.asarray(y_train, dtype='float32')
        y_test_cat = np.asarray(y_test, dtype='float32')
        out_units = 1
        loss_fn = 'binary_crossentropy'
        activation = 'sigmoid'

    # Single-layer perceptron
    model1 = Sequential([
        InputLayer(input_shape=(n_inputs,)),
        Dense(out_units, activation=activation)
    ])
    model1.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
    hist1 = model1.fit(X_train_arr, y_train_cat, epochs=20, batch_size=32,
                       validation_data=(X_test_arr, y_test_cat), verbose=0)

    # Multilayer ANN
    model2 = Sequential([
        InputLayer(input_shape=(n_inputs,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(out_units, activation=activation)
    ])
    model2.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
    hist2 = model2.fit(X_train_arr, y_train_cat, epochs=30, batch_size=32,
                       validation_data=(X_test_arr, y_test_cat), verbose=0)

    # Plot accuracy & loss (training curves of both models)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(hist1.history['accuracy'], label='Single-layer')
    plt.plot(hist2.history['accuracy'], label='Multilayer')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(hist1.history['loss'], label='Single-layer')
    plt.plot(hist2.history['loss'], label='Multilayer')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()
    plt.show()
else:
    print('Target is not suitable for ANN.')

### Section 11 — K-Means Clustering

Performs K-Means clustering with K selection via the elbow method, cluster visualization, and silhouette score.

In [ ]:
from sklearn.cluster import KMeans

# Use only numeric features for clustering
if len(numeric_features) <= 1:
    print('Not enough numeric features for K-Means.')
elif len(X_scaled) < 3:
    # The silhouette score needs between 2 and n_samples - 1 clusters.
    print('Not enough samples for K-Means.')
else:
    X_clust = X_scaled[numeric_features]
    max_valid_k = len(X_clust) - 1

    # Elbow method; the K range is capped so small datasets do not fail
    sse = []
    K_range = range(2, min(10, max_valid_k) + 1)
    for k in K_range:
        # n_init is pinned because its default differs between scikit-learn
        # releases, which would otherwise change results across installs.
        km = KMeans(n_clusters=k, n_init=10, random_state=42)
        km.fit(X_clust)
        sse.append(km.inertia_)
    plt.figure(figsize=(6, 4))
    plt.plot(K_range, sse, marker='o')
    plt.xlabel('K')
    plt.ylabel('SSE')
    plt.title('Elbow Method for K')
    plt.show()

    # Automatic suggestion: the K where the SSE curve bends most sharply
    # (largest second difference). Used when the typed value is unusable.
    if len(sse) >= 3:
        suggested_k = K_range[int(np.argmax(np.diff(sse, n=2))) + 1]
    else:
        suggested_k = K_range[0]
    print(f'Suggested K from elbow heuristic: {suggested_k}')

    # Choose best K (elbow visually or set)
    try:
        best_k = int(input('Enter optimal K (see elbow plot): '))
    except (ValueError, EOFError):
        # Empty or non-numeric input, or no stdin available.
        best_k = suggested_k
        print(f'No valid K entered; using suggested K = {best_k}.')
    if not 2 <= best_k <= max_valid_k:
        print(f'K must be between 2 and {max_valid_k}; using suggested K = {suggested_k}.')
        best_k = suggested_k

    km = KMeans(n_clusters=best_k, n_init=10, random_state=42)
    clusters = km.fit_predict(X_clust)

    # Silhouette score
    sil = silhouette_score(X_clust, clusters)
    print(f'Silhouette Score: {sil:.3f}')

    # Cluster visualization (first 2 features); red X marks are the centroids
    plt.figure(figsize=(6, 4))
    plt.scatter(X_clust.iloc[:, 0], X_clust.iloc[:, 1], c=clusters, cmap='tab10', alpha=0.7)
    plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], c='red', marker='X', s=100)
    plt.xlabel(numeric_features[0])
    plt.ylabel(numeric_features[1])
    plt.title('K-Means Clusters')
    plt.show()

### Section 12 — K-Medoids Clustering

Performs K-Medoids clustering (PAM), shows distance matrix and cluster plots.

In [ ]:
# K-Medoids via scikit-learn-extra when it is installed; otherwise the
# section is skipped with a message.
try:
    from sklearn_extra.cluster import KMedoids
except ImportError:
    kmedoids_available = False
else:
    kmedoids_available = True

if len(numeric_features) <= 1:
    print('Not enough numeric features for K-Medoids.')
else:
    X_clust = X_scaled[numeric_features]
    if not kmedoids_available:
        print('scikit-learn-extra not installed. K-Medoids unavailable.')
    else:
        best_k = int(input('Enter optimal K for K-Medoids: '))
        kmed = KMedoids(n_clusters=best_k, random_state=42)
        clusters = kmed.fit_predict(X_clust)
        medoids = kmed.cluster_centers_

        # Distance matrix: one row per sample, one column per medoid
        from scipy.spatial.distance import cdist
        dist_matrix = cdist(X_clust, medoids)
        plt.figure(figsize=(6, 4))
        sns.heatmap(dist_matrix, cmap='viridis')
        plt.title('Distance Matrix to Medoids')
        plt.show()

        # Cluster plot on the first two numeric features; red X marks are the medoids
        first_axis = X_clust.iloc[:, 0]
        second_axis = X_clust.iloc[:, 1]
        plt.figure(figsize=(6, 4))
        plt.scatter(first_axis, second_axis, c=clusters, cmap='tab10', alpha=0.7)
        plt.scatter(medoids[:, 0], medoids[:, 1], c='red', marker='X', s=100)
        plt.xlabel(numeric_features[0])
        plt.ylabel(numeric_features[1])
        plt.title('K-Medoids Clusters')
        plt.show()

### Section 13 — Hierarchical Clustering

Performs hierarchical clustering (single, complete, average linkage), shows dendrogram and cluster visualization.

In [ ]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Clustering runs on the scaled numeric features and needs at least two of them.
if len(numeric_features) <= 1:
    print('Not enough numeric features for Hierarchical Clustering.')
else:
    X_clust = X_scaled[numeric_features]
    first_axis = X_clust.iloc[:, 0]
    second_axis = X_clust.iloc[:, 1]
    methods = ['single', 'complete', 'average']
    for method in methods:
        print(f'\nLinkage: {method}')
        Z = linkage(X_clust, method=method)

        # Dendrogram truncated to the last 20 merged clusters
        plt.figure(figsize=(10, 4))
        dendrogram(Z, truncate_mode='lastp', p=20)
        plt.title(f'Dendrogram ({method})')
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

        # The user picks the cluster count after inspecting the dendrogram
        prompt = f'Enter number of clusters for {method} linkage: '
        k = int(input(prompt))
        clusters = fcluster(Z, k, criterion='maxclust')

        # Scatter of the first two numeric features, coloured by cluster
        plt.figure(figsize=(6, 4))
        plt.scatter(first_axis, second_axis, c=clusters, cmap='tab10', alpha=0.7)
        plt.xlabel(numeric_features[0])
        plt.ylabel(numeric_features[1])
        plt.title(f'Hierarchical Clusters ({method})')
        plt.show()

### Section 14 — Bayesian Networks (Naive Bayes)

Trains Gaussian and Multinomial Naive Bayes classifiers, shows accuracy and confusion matrix.

In [ ]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler

n_target_classes = df[TARGET_COLUMN].nunique()
if 2 <= n_target_classes < 100:
    X_train, X_test, y_train, y_test = auto_train_test_split(X_scaled, y, test_size=0.2, stratify=y)

    print('Gaussian Naive Bayes:')
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    print_classification_metrics(y_test, y_pred)
    plot_confusion(y_test, y_pred, labels=np.unique(y), title='GaussianNB Confusion')

    print('\nMultinomial Naive Bayes:')
    # MultinomialNB requires non-negative features. Taking np.abs of
    # standardised values would make values equally far above and below the
    # mean indistinguishable, so the features are min-max scaled to [0, 1]
    # instead, using ranges learned from the training split only.
    nonneg_scaler = MinMaxScaler()
    X_train_nonneg = nonneg_scaler.fit_transform(X_train)
    # Test values below the training minimum would map below 0; clip them.
    X_test_nonneg = np.clip(nonneg_scaler.transform(X_test), 0, None)
    mnb = MultinomialNB()
    mnb.fit(X_train_nonneg, y_train)
    y_pred = mnb.predict(X_test_nonneg)
    print_classification_metrics(y_test, y_pred)
    plot_confusion(y_test, y_pred, labels=np.unique(y), title='MultinomialNB Confusion')
else:
    print('Target is not suitable for Naive Bayes.')

### Section 15 — Convolutional Neural Network (CNN)

Generic CNN template for image folders using Keras ImageDataGenerator. Plots accuracy and loss curves.

In [ ]:
import os

# Sequential and Dense are imported here as well, so this cell also runs
# when the ANN cell (which imports them too) has not been executed.
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense

# Set these paths to your image folders
train_dir = 'path_to_train_folder'  # e.g., './train'
val_dir = 'path_to_val_folder'      # e.g., './val'

# Parameters
img_height, img_width = 64, 64
batch_size = 32

if not (os.path.isdir(train_dir) and os.path.isdir(val_dir)):
    # Same skip-with-message behaviour as the other sections, so running all
    # cells on a tabular dataset does not stop at the placeholder paths.
    print('Image folders not found. Set train_dir and val_dir to run the CNN section.')
else:
    # Pixel values are rescaled from 0-255 to 0-1
    datagen = ImageDataGenerator(rescale=1./255)
    train_gen = datagen.flow_from_directory(
        train_dir, target_size=(img_height, img_width), batch_size=batch_size, class_mode='categorical')
    val_gen = datagen.flow_from_directory(
        val_dir, target_size=(img_height, img_width), batch_size=batch_size, class_mode='categorical')
    num_classes = train_gen.num_classes

    # Two conv/pool stages followed by a dense classifier head
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(img_height, img_width, 3)),
        MaxPooling2D(2, 2),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(
        train_gen,
        epochs=10,
        validation_data=val_gen
    )

    # Plot accuracy & loss
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train')
    plt.plot(history.history['val_accuracy'], label='Val')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('CNN Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train')
    plt.plot(history.history['val_loss'], label='Val')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('CNN Loss')
    plt.legend()
    plt.show()

## Final Section — Exam Notes

### When to Use Which Algorithm

- **Linear Regression**: Predict continuous values (regression tasks).
- **Logistic Regression**: Binary/multiclass classification.
- **Decision Trees/Random Forest**: Classification/regression, interpretable, handles nonlinearity.
- **KNN**: Simple, non-parametric, small datasets.
- **SVM**: High-dimensional, clear margin, small/medium datasets.
- **ANN/CNN**: Complex patterns, large data, images (CNN).
- **K-Means/Medoids/Hierarchical**: Unsupervised clustering.
- **Naive Bayes**: Text, categorical, independence assumption.

### Time Complexity (Short)

- **Linear/Logistic Regression**: $O(nd^2)$
- **Decision Tree**: $O(n \log n)$
- **Random Forest**: $O(t n \log n)$ (t = trees)
- **KNN**: $O(n d)$ per query
- **SVM**: $O(n^2 d)$ to $O(n^3)$
- **ANN**: Depends on layers/epochs, usually high
- **K-Means**: $O(n k d t)$ (t = iterations)
- **Naive Bayes**: $O(n d)$

### Common Exam Mistakes

- Not scaling features for distance-based models (KNN, SVM, K-Means)
- Not encoding categoricals
- Data leakage (using test data in training)
- Not stratifying splits for classification
- Ignoring class imbalance
- Forgetting to check for missing values

### Quick Tuning Tips

- Use cross-validation for hyperparameter tuning
- Try different K in KNN/K-Means
- Adjust tree depth in Decision Trees/Random Forest
- Use regularization in regression/SVM
- Monitor overfitting with validation curves

---

Good luck! Paste your dataset, set the target, and run all cells for a full ML workflow.