In [None]:

# Program 10: K-Means clustering of the breast cancer dataset, visualized in 2-D with PCA
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the breast cancer dataset: X holds the features, y the true diagnosis
# codes (indexes into data.target_names).
data = load_breast_cancer()
X = data.data
y = data.target

# K-Means is distance based, so put every feature on the same scale first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_init is set explicitly so the result does not depend on the scikit-learn
# version's default (and to avoid the FutureWarning some releases emit).
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_scaled)

clusters = kmeans.predict(X_scaled)

# Cluster ids are arbitrary: K-Means never sees y, so "cluster 0" is not tied
# to any diagnosis. Name each cluster after the majority true class among its
# members instead of hard-coding Benign/Malignant.
cluster_names = {}
for cluster_id in range(kmeans.n_clusters):
    member_targets = y[clusters == cluster_id]
    majority_class = np.bincount(member_targets, minlength=len(data.target_names)).argmax()
    cluster_names[cluster_id] = str(data.target_names[majority_class]).capitalize()

# Project to two principal components purely for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
for cluster_id, colour in zip(range(kmeans.n_clusters), ('red', 'blue')):
    in_cluster = clusters == cluster_id
    plt.scatter(X_pca[in_cluster, 0], X_pca[in_cluster, 1], s=50, c=colour,
                label=f'Cluster {cluster_id} ({cluster_names[cluster_id]})')

# Centroids live in the scaled feature space; map them through the same PCA.
centroids = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='black', marker='X', label='Centroids')

plt.title('K-Means Clustering on Breast Cancer Dataset')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()



In [None]:
# Program 9: Gaussian Naive Bayes classification of the Olivetti faces dataset
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Load the Olivetti faces: each row of X is a flattened image (reshaped to
# 64x64 for display below) and y is the subject id.
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target

# Stratify the hold-out split so every subject is represented proportionally
# in both train and test sets; with many classes and few images per class an
# unstratified split can leave some subjects almost absent from one side.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# zero_division=1 reports precision/recall as 1 for classes with no
# predicted/true samples instead of emitting a warning.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 5-fold cross-validation on the full dataset (cross_val_score refits clones
# of gnb, so the model fitted above is not modified).
cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')

# Show the first 15 test images with their true and predicted subject ids.
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
    ax.imshow(image.reshape(64, 64), cmap=plt.cm.gray)
    ax.set_title(f"True: {label}, Pred: {prediction}")
    ax.axis('off')

plt.show()

In [None]:
# Program 8: Decision tree classifier on the breast cancer dataset, with tree visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

# Load the breast cancer dataset and hold out 20% of it for testing.
data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Decision Tree on Test Set: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# feature_names / target_names are numpy arrays; some scikit-learn releases
# validate these plot_tree arguments strictly as lists, so convert them to
# plain lists of str to work across versions.
plt.figure(figsize=(12, 8))
plot_tree(
    clf,
    filled=True,
    feature_names=list(data.feature_names),
    class_names=list(data.target_names),
    rounded=True,
)
plt.title("Decision Tree Visualized")
plt.show()

# A single hand-written sample with one value per feature (30 values), shaped
# (1, 30) because predict expects a 2-D array.
new_sample = np.array([[
    15.2, 19.5, 103.2, 800.2, 0.07, 0.3, 1.2, 4.2, 0.03, 0.1,
    0.08, 0.05, 0.03, 0.03, 0.06, 0.09, 0.02, 0.07, 0.03, 0.08,
    0.04, 0.02, 0.01, 0.04, 0.06, 0.03, 0.02, 0.01, 0.03, 0.02,
]])

new_prediction = clf.predict(new_sample)
print("\nNew sample classified as:", data.target_names[new_prediction][0])

In [None]:
# Program 6: Locally weighted regression on noisy sine data for several bandwidths (tau)
import numpy as np
import matplotlib.pyplot as plt

def locally_weighted_regression(X_train, Y_train, X_query, tau=0.1):
    """Predict targets at the query points with locally weighted linear regression.

    For every query point a separate weighted least-squares line (with
    intercept) is fitted, where each training sample is weighted by a Gaussian
    kernel of its squared Euclidean distance to the query.

    Parameters
    ----------
    X_train : array-like of shape (m, n)
        Training features.
    Y_train : array-like with m elements
        Training targets; flattened internally.
    X_query : array-like of shape (k, n)
        Points at which to predict.
    tau : float, default 0.1
        Kernel bandwidth; must be positive. Smaller values make the fit more local.

    Returns
    -------
    numpy.ndarray of shape (k,)
        Predicted value for each query point.

    Raises
    ------
    ValueError
        If ``tau`` is not strictly positive.
    """
    if tau <= 0:
        raise ValueError("tau must be positive")

    X_train = np.asarray(X_train, dtype=float)
    X_query = np.asarray(X_query, dtype=float)
    targets = np.asarray(Y_train, dtype=float).reshape(-1)

    m, n = X_train.shape

    # Prepend a column of ones so theta[0] acts as the intercept.
    design = np.concatenate([np.ones((m, 1)), X_train], axis=1)
    queries = np.concatenate([np.ones((X_query.shape[0], 1)), X_query], axis=1)

    Y_pred = np.zeros(queries.shape[0])

    for i, query in enumerate(queries):
        # The intercept column differs by 0, so this is the distance in feature space.
        sq_dist = np.sum((design - query) ** 2, axis=1)
        weights = np.exp(-sq_dist / (2 * tau ** 2))

        # Row-scaling is equivalent to W @ design without building an m x m diagonal matrix.
        weighted_design = design * weights[:, None]
        gram = design.T @ weighted_design        # X^T W X
        moment = weighted_design.T @ targets     # X^T W y

        # lstsq returns the minimum-norm solution when X^T W X is singular
        # (e.g. too few effectively-weighted points), where inv() would raise.
        theta, *_ = np.linalg.lstsq(gram, moment, rcond=None)

        # theta is 1-D, so this is a true scalar (no array-to-scalar conversion).
        Y_pred[i] = query @ theta

    return Y_pred

# Synthetic data: 100 sorted x-values in [0, 1) and a noisy sine wave over them.
np.random.seed(42)
X = np.sort(np.random.rand(100, 1), axis=0)
noise = 0.1 * np.random.randn(100, 1)
Y = np.sin(2 * np.pi * X) + noise

# Evenly spaced query grid over [0, 1], as a column vector.
X_query = np.linspace(0, 1, 100).reshape(-1, 1)

# Bandwidths to compare; one fitted curve is drawn per value.
tau_values = [0.1, 0.3, 0.5, 1.0, 2.0]

plt.figure(figsize=(10, 6))
for tau in tau_values:
    Y_pred = locally_weighted_regression(X, Y, X_query, tau)
    plt.plot(X_query, Y_pred, label=f'tau={tau}')

# Overlay the raw samples, then label and display the figure.
plt.scatter(X, Y, color='red', label='Data points')
plt.title('Locally Weighted Regression')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()

In [None]:
# Program 7: Linear regression (California housing) and polynomial regression (Auto MPG)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

def linear_regression_california():
    """Fit, plot and score a single-feature linear regression on California housing.

    Uses only the ``AveRooms`` column as predictor, holds out 20% of the rows
    for testing, shows actual vs. predicted values and prints the test MSE and
    R^2. Returns nothing.
    """
    dataset = fetch_california_housing(as_frame=True)
    features = dataset.data[["AveRooms"]]
    target = dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Actual test targets as points, model output as a line over the same x-values.
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_test, y_pred, color="red", label="Predicted")
    plt.title("Linear Regression - California Housing Dataset")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.legend()
    plt.show()

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error:", mse)
    print("R^2 Score:", r2)


def polynomial_regression_auto_mpg():
    """Fit, plot and score a degree-2 polynomial regression of mpg on displacement.

    Downloads the UCI Auto MPG data file, drops rows with missing values,
    holds out 20% of the rows for testing, shows actual vs. predicted values
    and prints the test MSE and R^2. Returns nothing.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    # The file has nine whitespace-separated fields per row; the last one is
    # the quoted car name. All nine must be named: if fewer names than fields
    # are supplied, pandas uses the leading field as the index and every
    # remaining column is shifted by one (so "displacement" would silently
    # contain horsepower and "mpg" would contain cylinders).
    column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration",
                    "model_year", "origin", "car_name"]
    # Raw string: '\s' is an invalid escape sequence in a normal string literal.
    data = pd.read_csv(url, sep=r'\s+', names=column_names, na_values="?")
    data = data.dropna()

    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Expand to polynomial terms, standardize them, then fit ordinary least squares.
    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)

    y_pred = poly_model.predict(X_test)

    # Both series are scatter plots because the test x-values are not sorted.
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.scatter(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title("Polynomial Regression - Auto MPG Dataset")
    plt.legend()
    plt.show()

    print("Polynomial Regression - Auto MPG Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))


if __name__ == "__main__":
    # Run both regression demos in order: linear first, then polynomial.
    print("Demonstrating Linear Regression and Polynomial Regression\n")
    for run_demo in (linear_regression_california, polynomial_regression_auto_mpg):
        run_demo()

In [None]:
# Program 5: k-NN classification of randomly generated 1-D points for several values of k
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# 100 random points in [0, 1): the first 50 are used for training, the rest for testing.
np.random.seed(42)
X = np.random.rand(100, 1)

# Ground-truth class for every point: 1 if x <= 0.5, otherwise 2.
# All 100 points are labelled (not just the training half) so that the test
# accuracy below is measured against real labels instead of the zero
# placeholders left in an untouched np.zeros array.
y = np.where(X[:, 0] <= 0.5, 1.0, 2.0)

X_train = X[:50]
y_train = y[:50]
X_test = X[50:]
y_test = y[50:]

k_values = [1, 2, 3, 4, 5, 20, 30]
accuracies = []
test_predictions = {}  # k -> predicted classes for X_test, reused for plotting

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    test_predictions[k] = y_pred
    print(f'Accuracy for k={k}: {accuracy:.4f}')

plt.figure(figsize=(12, 10))
plt.scatter(X_train, y_train, color='blue', label='Training Data')

# One series per k showing the class predicted for each test point, so the
# series actually differ between k values.
for k, accuracy in zip(k_values, accuracies):
    plt.scatter(X_test, test_predictions[k], alpha=0.5,
                label=f'Test Data (k={k}, Accuracy={accuracy:.4f})')

plt.xlabel('X values')
plt.ylabel('Classes')
plt.title('k-NN Classification for Randomly Generated Data')
plt.legend()
plt.show()

# Dense grid over [0, 1] used to trace where each model switches class.
X_full = np.linspace(0, 1, 500).reshape(-1, 1)
plt.figure(figsize=(12, 10))

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred_full = knn.predict(X_full)
    plt.plot(X_full, y_pred_full, label=f'Decision Boundary for k={k}')

plt.scatter(X_train, y_train, color='blue', label='Training Data')
plt.scatter(X_test, y_test, color='red', label='Test Data')

plt.xlabel('X values')
plt.ylabel('Classes')
plt.title('Decision Boundaries for Different k in k-NN')
plt.legend()
plt.show()

In [None]:
# Program 4: Find-S algorithm applied to training data read from a CSV file
import pandas as pd

def find_s_algorithm(file_path, positive_label='Yes'):
    """Run the Find-S algorithm on a CSV of training examples.

    The last column of the CSV is taken as the class label and every other
    column as an attribute. Only rows whose class equals ``positive_label``
    are used: the hypothesis starts as the first positive example and each
    later positive example generalizes any attribute that disagrees to '?'.
    Once an attribute has been generalized to '?' it stays '?'.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    positive_label : default 'Yes'
        Value of the class column that marks a positive example.

    Returns
    -------
    list
        One entry per attribute: the common value across all positive
        examples, or '?' where they disagree. If there are no positive
        examples, a list of '?' is returned (same placeholder result as
        before this fix).
    """
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)

    attributes = data.columns[:-1]
    class_label = data.columns[-1]

    positives = data.loc[data[class_label] == positive_label, attributes]

    # None means "no positive example seen yet". A dedicated sentinel is
    # needed because '?' already means "any value": reusing '?' for both made
    # a generalized attribute get overwritten by the next positive example.
    hypothesis = None
    for values in positives.itertuples(index=False, name=None):
        if hypothesis is None:
            hypothesis = list(values)
        else:
            hypothesis = [
                current if current == value else '?'
                for current, value in zip(hypothesis, values)
            ]

    if hypothesis is None:
        return ['?' for _ in attributes]
    return hypothesis

# Relative path to the training CSV that find_s_algorithm reads with pandas.
file_path = 'training_data.csv'
# Run Find-S on that file and print the hypothesis it returns.
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)


In [None]:
# Program 3: PCA on the Iris dataset, reducing the features to 2 dimensions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

# Load Iris: X is the feature matrix, y the species codes used for colouring.
iris = load_iris()
X, y = iris.data, iris.target

# Standardize before PCA so no feature dominates because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep the two leading principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the projected samples, coloured by species code.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
ax.set_title('PCA - Iris Dataset (Reduced to 2 Dimensions)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(points, ax=ax, label='Iris Species')
plt.show()

print('Explained variance ratio for each component:', pca.explained_variance_ratio_)

In [None]:
# Program 2: Correlation heatmap and pair plot for the California housing dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

# Load California housing as a single DataFrame (features plus target column).
california_housing = fetch_california_housing(as_frame=True)
df = california_housing.frame

# Pairwise correlations between all columns of the frame.
correlation_matrix = df.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix of Features')
plt.show()

# Pair plot restricted to the first five columns to keep the grid readable.
subset_features = df.columns[:5]
pairplot_data = df[subset_features]
sns.pairplot(pairplot_data, hue=None, plot_kws={'alpha': 0.7})
# y > 1 lifts the title above the top row of panels.
plt.suptitle('Pair Plot of Selected Features', y=1.02)
plt.show()

In [None]:
# Program 1: Histograms, box plots and IQR-based outlier counts for the California housing dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

# Load California housing as a single DataFrame (features plus target column).
california_housing = fetch_california_housing(as_frame=True)
df = california_housing.frame

# Names of every numeric column; each one gets a histogram, a box plot and an outlier count.
numerical_features = list(df.select_dtypes(include=[np.number]).columns)

# Histograms with a KDE overlay, laid out on a 3x3 grid.
plt.figure(figsize=(15, 12))
for i, feature in enumerate(numerical_features, start=1):
    ax = plt.subplot(3, 3, i)
    sns.histplot(df[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Box plots on the same 3x3 layout.
plt.figure(figsize=(15, 12))
for i, feature in enumerate(numerical_features, start=1):
    ax = plt.subplot(3, 3, i)
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Box Plot of {feature}')
    ax.set_xlabel(feature)
plt.tight_layout()
plt.show()

# IQR rule: a value is an outlier if it lies more than 1.5 * IQR outside [Q1, Q3].
for feature in numerical_features:
    column = df[feature]
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    below = column < lower_bound
    above = column > upper_bound
    outliers = df[below | above]
    print(f'Feature: {feature}')
    print(f'  Q1: {Q1}, Q3: {Q3}, IQR: {IQR}')
    print(f'  Lower Bound: {lower_bound}, Upper Bound: {upper_bound}')
    print(f'  Number of outliers: {outliers.shape[0]}')
    print('---')