
# Session 8: Introduction to Machine Learning with scikit-learn

## Objective
Explore basic machine learning concepts and applications using scikit-learn, with visualizations and a real-world dataset.


In [None]:

# Install required packages
pip install scikit-learn matplotlib seaborn pandas --quiet


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import load_diabetes


## Using a Real-World Dataset: Diabetes

In [None]:

# Load the diabetes dataset bundled with scikit-learn
diabetes = load_diabetes()

# Build a DataFrame with one column per feature, then append the
# response values as a 'target' column
feature_columns = list(diabetes.feature_names)
df = pd.DataFrame(data=diabetes.data, columns=feature_columns)
df['target'] = diabetes.target

# Preview the first rows
df.head()


## Data Visualization

In [None]:

# Heatmap of the pairwise correlations between all columns of df;
# annot=True writes each coefficient into its cell
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


## Supervised Learning: Linear Regression

In [None]:

# Single-feature regression: predict the target from BMI (Body Mass Index)
X = df[['bmi']]  # double brackets keep X as a one-column DataFrame
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the error on the test set
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Actual test points (blue) against the model's predictions (red)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', label='Predicted')
ax.set_title("Linear Regression: BMI vs Target")
ax.set_xlabel("BMI")
ax.set_ylabel("Disease Progression")
ax.legend()
plt.show()


## Unsupervised Learning: K-Means Clustering

In [None]:

# Cluster on two features only, so the result can be drawn as a 2-D scatter plot
X = df[['bmi', 'bp']]

# Fit K-Means with 3 clusters.
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (and emits a FutureWarning about it in 1.2-1.3). Pinning the
# value keeps the clustering reproducible across library versions and silences
# the warning. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)
df['cluster'] = clusters  # adds (or overwrites on re-run) the label column on df

# Visualizing clusters: one point per row, coloured by assigned cluster
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='bmi', y='bp', hue='cluster', palette='Set2')
plt.title("K-Means Clustering (BMI vs BP)")
plt.xlabel("BMI")
plt.ylabel("Blood Pressure")
plt.show()



## 🔍 Bonus Challenge

Use any real-world dataset (from Kaggle, UCI, or `sklearn.datasets`) and do the following:

1. Load and explore the dataset.
2. Visualize the data using heatmaps, scatter plots, etc.
3. Train a linear regression model using one or more features.
4. Evaluate the model using Mean Squared Error or R² score.
5. Optional: Apply KMeans clustering and visualize the clusters.

You can try this with datasets like:
- California Housing (`from sklearn.datasets import fetch_california_housing`)
- Iris (`from sklearn.datasets import load_iris`)
- Wine (`from sklearn.datasets import load_wine`)

Happy experimenting! 🎯
