See the README file for the Colab link.

# ML ALGORITHMS





# ML ALGORITHMS



### LINEAR REGRESSION

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.datasets import load_diabetes

In [None]:


# ========== LINEAR REGRESSION: House Price Prediction ==========
print("\n LINEAR REGRESSION: House Price Prediction\n")

# Toy training set: house size (sqft) against price (lakh rupees).
size_col = "Size (sqft)"
price_col = "Price (Lakh ₹)"
data = {
    size_col: [500, 700, 1000, 1200, 1500, 1800, 2000, 2300],
    price_col: [20, 28, 40, 48, 60, 72, 80, 92],
}
df = pd.DataFrame(data)
X = df[[size_col]]  # double brackets keep a 2-D frame with one feature column
y = df[price_col]

# Fit a straight line through the eight points.
model = LinearRegression()
model.fit(X, y)

# Read the size to price. float() accepts whole numbers as before and also
# fractional sizes such as "1250.5", which int() would reject.
user_size = float(input("Enter house size in sqft: "))
# Predict from a frame that carries the training column name; a bare nested
# list makes scikit-learn warn that the input lacks the fitted feature names.
user_features = pd.DataFrame({size_col: [user_size]})
predicted_price = model.predict(user_features)[0]
print(f"✅ Estimated price: ₹{predicted_price:.2f} Lakhs")

# Plot the training points, the fitted line, and the user's query point.
plt.figure(figsize=(6, 4))
plt.scatter(X, y, color='blue', label='Training Data')
plt.plot(X, model.predict(X), color='red', label='Regression Line')
plt.scatter(user_size, predicted_price, color='green', label='Your Input', s=100)
plt.xlabel("Size (sqft)")
plt.ylabel("Price (Lakh ₹)")
plt.title("Linear Regression: House Size vs Price")
plt.legend()
plt.grid(True)
plt.show()

### LOGISTIC REGRESSION


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.datasets import load_diabetes

# ========== LOGISTIC REGRESSION: Diabetes Prediction ==========
print("\n LOGISTIC REGRESSION: Diabetes Risk Prediction\n")

# Load scikit-learn's diabetes table and derive a binary label from it.
# The 'Outcome' label is this demo's own proxy: 1 when the numeric 'target'
# column exceeds 140, otherwise 0.
diabetes = load_diabetes(as_frame=True)
df2 = diabetes.frame
df2['Outcome'] = (df2['target'] > 140).astype(int)
feature_cols = ['bmi', 'bp']
X2 = df2[feature_cols]
y2 = df2['Outcome']

# Train logistic model on the two selected features.
# NOTE(review): the features are very small in magnitude, so the default
# regularisation strength may dominate the fit — confirm whether C should be raised.
log_model = LogisticRegression()
log_model.fit(X2, y2)

# Show the ranges actually present in the data instead of hard-coded guesses,
# so the prompt cannot drift from the dataset.
bmi_lo, bmi_hi = X2['bmi'].min(), X2['bmi'].max()
bp_lo, bp_hi = X2['bp'].min(), X2['bp'].max()
user_bmi = float(input(f"Enter normalized BMI (between {bmi_lo:.2f} and {bmi_hi:.2f}): "))
user_bp = float(input(f"Enter normalized Blood Pressure (between {bp_lo:.2f} and {bp_hi:.2f}): "))

# Build the query as a frame with the training column names; a bare nested
# list makes scikit-learn warn that the input lacks the fitted feature names.
user_features = pd.DataFrame([[user_bmi, user_bp]], columns=feature_cols)
log_pred = log_model.predict(user_features)[0]
log_prob = log_model.predict_proba(user_features)[0]

status = "Diabetic" if log_pred == 1 else "Non-Diabetic"
print(f" Prediction: {status}")
# Index 1 of predict_proba is the probability of the positive (Outcome == 1) class.
print(f" Probability: {log_prob[1]*100:.2f}% chance of being diabetic")

# Plot every sample coloured by label, with the user's point on top.
plt.figure(figsize=(6, 4))
plt.scatter(X2['bmi'], X2['bp'], c=y2, cmap='bwr', edgecolors='k', alpha=0.6)
plt.scatter(user_bmi, user_bp, color='black', s=100, label='Your Input')
plt.xlabel("BMI (normalized)")
plt.ylabel("Blood Pressure (normalized)")
plt.title("Logistic Regression: Diabetes Risk")
plt.legend()
plt.grid(True)
plt.show()

### DECISION TREE

In [None]:
#--Decision Trees-- Titanic Survival Predictions

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

In [None]:

# Fetch the Titanic passenger table through seaborn's dataset loader.
df = sns.load_dataset('titanic')
# Preview the label and the four features used by the decision-tree cells below.
preview_columns = ['survived', 'pclass', 'sex', 'age', 'fare']
df[preview_columns].head()

In [None]:
# Narrow the table to the label plus four predictors and discard incomplete rows.
kept_columns = ['survived', 'pclass', 'sex', 'age', 'fare']
df = df[kept_columns].dropna()
# Replace the 'sex' strings with numeric codes: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
df = df.assign(sex=df['sex'].map(sex_codes))
# Separate the predictors (X) from the survival label (y).
y = df['survived']
X = df[['pclass', 'sex', 'age', 'fare']]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# Fit a depth-3 decision tree on the training portion.
tree_params = {'max_depth': 3, 'random_state': 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)


In [None]:
# Score the fitted tree on the held-out rows.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
# Per-class precision/recall/F1, labelled with readable class names.
class_labels = ['Not Survived', 'Survived']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print("\nClassification Report:\n")
print(report_text)


In [None]:
# Draw the fitted decision tree with readable feature and class names.
plt.figure(figsize=(14, 8))
plot_tree(
    model,
    # Pass a plain list of names rather than the pandas Index itself:
    # some scikit-learn releases accept only a list for feature_names.
    feature_names=list(X.columns),
    class_names=['Not Survived', 'Survived'],
    filled=True,
)
plt.title("Titanic Decision Tree")
plt.show()


In [None]:
# Format: [pclass, sex, age, fare]
# Example: 1st class (pclass=1), female (sex=1), 22 yrs, fare $155
# Built as a frame with the training column names so the fitted model sees
# the same feature names it was trained on (a bare list triggers a warning).
sample = pd.DataFrame([[1, 1, 22, 155]], columns=['pclass', 'sex', 'age', 'fare'])

prediction = model.predict(sample)
print("Prediction:", "Survived" if prediction[0] == 1 else "Not Survived")

In [None]:
# Pair each feature name with the fitted model's importance score,
# then chart the scores as horizontal bars in descending order.
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
ranked_importances = feature_importances.sort_values(ascending=False)
ranked_importances.plot(kind='barh', title='Feature Importance')
plt.show()

In [None]:
#@title Gradient Boosting (XG Boost)
# 📦 Import libraries
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt


# 📥 Load and preprocess Titanic dataset
df = sns.load_dataset("titanic")

# Keep the label plus five predictors and drop incomplete rows.
# dropna() returns a fresh frame, so nothing is mutated in place on a column slice.
df = df[["survived", "pclass", "sex", "age", "fare", "embarked"]].dropna()

# Encode the two string-valued features as integer codes.
for categorical_col in ("sex", "embarked"):
    df[categorical_col] = LabelEncoder().fit_transform(df[categorical_col])

# Split into features and target
X = df.drop("survived", axis=1)
y = df["survived"]

# 🔀 Train/test split: 20% held out, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 🚀 Train XGBoost classifier
# `use_label_encoder` is intentionally not passed: it is obsolete in current
# XGBoost releases, which only warn that the parameter is unused.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
)

model.fit(X_train, y_train)


# 📊 Evaluate model on the held-out rows.
y_pred = model.predict(X_test)

print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))


# 🔍 Feature Importance as reported by the fitted booster.
xgb.plot_importance(model)
plt.title("Feature Importance - Titanic")
plt.show()


### K NEAREST NEIGHBORS (KNN)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import r2_score,accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


#### KNN REGRESSOR

In [None]:


# Load the sample regression dataset from CSV
sample_df = pd.read_csv("https://raw.githubusercontent.com/BARATHVISHNU-J/Core_ML_Algorithms/main/sample_regression.csv")
print(sample_df.head())  # Display the first few rows of the dataset

# Features: feature1 and feature2, Target: target
X = sample_df[['feature1', 'feature2']].values
y = sample_df['target'].values

# Split the dataset into training and testing sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Create a KNN regressor with k=3 and fit it on the training data.
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict each split exactly once. Both historical names for each result are
# kept so any code that reads either of them continues to work.
y_pred = test_preds = knn.predict(X_test)
y_train_pred = train_preds = knn.predict(X_train)
print("\nPredicted target values:")
print("Train set predictions:")
print(train_preds)
print("Test set predictions:")
print(test_preds)

# R^2 on the test set, reported as a percentage. Scaling before rounding
# avoids float artefacts such as 85.39999999999999.
r2 = r2_score(y_test, y_pred)
print("R^2 Score (Accuracy):", round(r2 * 100, 1), "%")

# Visualization: plot feature1 vs feature2 for train and test sets (no user input, solid colors, with target values)
plt.figure(figsize=(8, 5))
plt.scatter(X_train[:, 0], X_train[:, 1], color='blue', label='Train', alpha=0.6, edgecolor='k')
plt.scatter(X_test[:, 0], X_test[:, 1], color='red', label='Test', alpha=0.8, edgecolor='k')
# Annotate train points with their target values
for (px, py), target_value in zip(X_train, y_train):
    plt.text(px, py, f'{target_value:.2f}', fontsize=8, color='navy', ha='right', va='bottom')
# Annotate test points with their target values
for (px, py), target_value in zip(X_test, y_test):
    plt.text(px, py, f'{target_value:.2f}', fontsize=8, color='darkred', ha='left', va='top')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('KNN Regression: Feature Space (Train vs Test)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()



##### USER INPUT


In [None]:
# --- User input and prediction ---
f1 = float(input("Enter value for feature1: "))
f2 = float(input("Enter value for feature2: "))
user_point = np.array([[f1, f2]])  # shape (1, 2): one sample, two features
# Prediction from the fitted regressor (it averages the neighbours' targets).
mean_pred = knn.predict(user_point)[0]
# Median of the same neighbours. Asking the already-fitted regressor for its
# neighbours guarantees the same k and training rows as the mean prediction,
# instead of refitting a separate index with a hard-coded neighbour count.
distances, indices = knn.kneighbors(user_point)
neighbor_targets = y_train[indices[0]]
median_pred = np.median(neighbor_targets)
print(f"Predicted target (mean of neighbors): {mean_pred:.2f}")
print(f"Predicted target (median of neighbors): {median_pred:.2f}")

# Visualization: plot feature1 vs feature2 for train, test, and user input (solid colors, with target values)
plt.figure(figsize=(8, 5))
plt.scatter(X_train[:, 0], X_train[:, 1], color='blue', label='Train', alpha=0.6, edgecolor='k')
plt.scatter(X_test[:, 0], X_test[:, 1], color='red', label='Test', alpha=0.8, edgecolor='k')
plt.scatter(f1, f2, color='magenta', s=200, marker='*', label=f'User Input (pred: {mean_pred:.2f})')
# Annotate train points with their target values
for (px, py), target_value in zip(X_train, y_train):
    plt.text(px, py, f'{target_value:.2f}', fontsize=8, color='navy', ha='right', va='bottom')
# Annotate test points with their target values
for (px, py), target_value in zip(X_test, y_test):
    plt.text(px, py, f'{target_value:.2f}', fontsize=8, color='darkred', ha='left', va='top')
# Annotate user input with its predicted value
plt.text(f1, f2, f'{mean_pred:.2f}', fontsize=10, color='magenta', ha='center', va='bottom', fontweight='bold')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('KNN Regression: Feature Space (Train, Test, User Input)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

#### KNN CLASSIFIER

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import r2_score,accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the movies dataset from CSV
movies_df = pd.read_csv("https://raw.githubusercontent.com/BARATHVISHNU-J/Core_ML_Algorithms/main/movies.csv")

# Encode movie genres as integers for KNN classification
le_genre = LabelEncoder()
movies_df['genre_encoded'] = le_genre.fit_transform(movies_df['genre'])

print(movies_df)
# Features: imdb_rating and duration, Target: genre_encoded
X = movies_df[['imdb_rating', 'duration']].values
y = movies_df['genre_encoded'].values

# Split row positions first, so the same positions index the raw features,
# the labels, and (later) the movie names.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size=0.4, random_state=4)
y_train, y_test = y[train_idx], y[test_idx]

# Original (unscaled) data, used for plotting and reporting.
X_train_orig, X_test_orig = X[train_idx], X[test_idx]

# Feature scaling: fit the scaler on the training rows only, then apply it to
# the test rows, so test-set statistics do not leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_orig)
X_test = scaler.transform(X_test_orig)
X_scaled = scaler.transform(X)  # full scaled matrix, kept for code that reads this name

# Find the best k for KNN (first k wins on ties).
# NOTE(review): k is selected on the test set, so the accuracy printed below
# is optimistic — consider cross-validating on the training rows instead.
best_k = 1
best_acc = 0
for k in range(1, min(11, len(X_train))):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = k
print(f"Best k: {best_k} with accuracy: {round(best_acc*100, 2)}%")

# Use the best k for final model
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Calculate and print the accuracy of the classifier on the test set
print("Final Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")

# Visualization: plot IMDb rating vs duration, color by genre
# One tab10 colour per genre, indexed by the genre's encoded value.
unique_genres = len(le_genre.classes_)
color_list = plt.cm.tab10(np.linspace(0, 1, unique_genres))

plt.figure(figsize=(10, 6))
# Plot all movies with genre-specific colors
for idx, genre in enumerate(le_genre.classes_):
    mask = movies_df['genre'] == genre
    plt.scatter(movies_df.loc[mask, 'imdb_rating'], movies_df.loc[mask, 'duration'],
                color=color_list[idx], label=genre, alpha=0.7, edgecolor='k')
# Highlight test set with a different color (e.g., yellow)
plt.scatter(X_test_orig[:, 0], X_test_orig[:, 1], c='yellow', edgecolor='black', s=120, label='Test Movies')
plt.xlabel('IMDb Rating')
plt.ylabel('Duration (min)')
plt.title('KNN Classification: Predicting Movie Genre from IMDb Rating and Duration')
plt.legend()
plt.tight_layout()
plt.show()


# Print test set details: IMDb rating, duration, true genre, predicted genre, and movie name
print("\nTest Set Details:")
# Look each movie up by its row position from the split. Matching on
# (rating, duration) values would return the wrong title whenever two movies
# share both values.
movie_names = movies_df['name'].values
for row_pos, (imdb_val, duration_val), true_code, pred_code in zip(test_idx, X_test_orig, y_test, y_pred):
    movie_name = movie_names[row_pos]
    true_genre = le_genre.inverse_transform([true_code])[0]
    pred_genre = le_genre.inverse_transform([pred_code])[0]
    print(f"Movie: {movie_name}, IMDb: {imdb_val}, Duration: {duration_val}, True Genre: {true_genre}, Predicted Genre: {pred_genre}")




#### user input

In [None]:
# --- User input and prediction ---
# Best-effort cell: any failure (bad input, missing stdin, plotting error)
# is reported and the cell is skipped rather than raising.
try:
    user_rating = float(input("Enter IMDb rating: "))
    user_duration = float(input("Enter duration (min): "))
    # Scale the raw query with the already-fitted scaler before classifying it.
    raw_query = np.array([[user_rating, user_duration]])
    user_point = scaler.transform(raw_query)
    user_pred = knn.predict(user_point)[0]
    user_genre = le_genre.inverse_transform([user_pred])[0]
    print(f"Predicted genre for input (IMDb: {user_rating}, Duration: {user_duration}): {user_genre}")
    # Plot training data and user input
    plt.figure(figsize=(8, 5))
    # A class's position in classes_ is its encoded value, so enumerate
    # yields the same code that transform() would return for each genre.
    for genre_code, genre in enumerate(le_genre.classes_):
        in_genre = y_train == genre_code
        if not np.any(in_genre):
            # No training rows of this genre: add an empty series so the
            # genre still appears in the legend.
            plt.scatter([], [], color=color_list[genre_code], label=genre)
            continue
        genre_points = X_train_orig[in_genre]
        plt.scatter(genre_points[:, 0], genre_points[:, 1], color=color_list[genre_code], label=genre, alpha=0.7, edgecolor='k')
    plt.scatter(user_rating, user_duration, color='yellow', edgecolor='black', s=200, marker='*', label='User Input')
    plt.xlabel('IMDb Rating')
    plt.ylabel('Duration (min)')
    plt.title('Training Samples and User Input')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"User input prediction skipped: {e}")

### SUPPORT VECTOR MACHINE (SVM) CLASSIFIER

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load simple dataset
df = pd.read_csv("https://raw.githubusercontent.com/BARATHVISHNU-J/Core_ML_Algorithms/main/dataset.csv")
X = df[['feature1', 'feature2']].values
y = df['label'].values

# Train/test split (30% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a linear-kernel SVM.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict and evaluate. Scaling before rounding avoids float artefacts
# such as 56.99999999999999 and matches the KNN cells' percentage format.
y_pred = svm.predict(X_test)
print('Accuracy:', round(accuracy_score(y_test, y_pred) * 100, 2), '%')

# Plot data and decision boundary
plt.figure(figsize=(7,5))
plt.scatter(X_train[y_train==0,0], X_train[y_train==0,1], color='blue', label='Class 0 (train)')
plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='red', label='Class 1 (train)')
plt.scatter(X_test[y_test==0,0], X_test[y_test==0,1], color='cyan', marker='x', label='Class 0 (test)')
plt.scatter(X_test[y_test==1,0], X_test[y_test==1,1], color='orange', marker='x', label='Class 1 (test)')

# Evaluate the decision function on a 30x30 grid spanning the current axes.
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = svm.decision_function(xy).reshape(XX.shape)
# Level 0 is the decision boundary; levels -1 and +1 are the margins.
plt.contour(XX, YY, Z, colors='k', levels=[0], alpha=0.8, linestyles=['--'], linewidths=2)
plt.contour(XX, YY, Z, colors='grey', levels=[-1, 1], alpha=0.5, linestyles=[':'], linewidths=2)
# contour() ignores a `label` keyword, so add an empty proxy line with the
# same style to give the boundary a legend entry.
plt.plot([], [], color='k', linestyle='--', linewidth=2, alpha=0.8, label='Decision boundary')
# Plot support vectors
plt.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1], s=150, linewidth=2, facecolors='none', edgecolors='k', label='Support Vectors')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Simple SVM Example')
plt.legend()
plt.tight_layout()
plt.show()

#### user input

In [None]:
# --- User input and prediction ---
# Best-effort cell: any failure is reported and the cell is skipped.
try:
    f1 = float(input('Enter value for feature1: '))
    f2 = float(input('Enter value for feature2: '))
    user_point = np.array([[f1, f2]])
    user_pred = svm.predict(user_point)[0]
    print(f'Predicted class for input ({f1}, {f2}): {user_pred}')
    # Plot again with user point
    plt.figure(figsize=(7,5))
    # Training points: filled markers, one colour per class.
    for cls, colour in ((0, 'blue'), (1, 'red')):
        members = X_train[y_train == cls]
        plt.scatter(members[:, 0], members[:, 1], color=colour, label=f'Class {cls} (train)')
    # Test points: 'x' markers, one colour per class.
    for cls, colour in ((0, 'cyan'), (1, 'orange')):
        members = X_test[y_test == cls]
        plt.scatter(members[:, 0], members[:, 1], color=colour, marker='x', label=f'Class {cls} (test)')
    # Decision boundary and margins, evaluated on a 30x30 grid over the current axes.
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    XX, YY = np.meshgrid(xx, yy, indexing='ij')
    xy = np.column_stack((XX.ravel(), YY.ravel()))
    Z = svm.decision_function(xy).reshape(XX.shape)
    contour_specs = (
        dict(colors='k', levels=[0], alpha=0.8, linestyles=['--']),       # boundary
        dict(colors='grey', levels=[-1, 1], alpha=0.5, linestyles=[':']),  # margins
    )
    for spec in contour_specs:
        plt.contour(XX, YY, Z, linewidths=2, **spec)
    support = svm.support_vectors_
    plt.scatter(support[:, 0], support[:, 1], s=150, linewidth=2, facecolors='none', edgecolors='k', label='Support Vectors')
    # Plot user input
    plt.scatter(f1, f2, color='magenta', s=200, marker='*', label='User Input')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('SVM with User Input')
    plt.legend()
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f'User input prediction skipped: {e}')

## PRINCIPAL COMPONENT ANALYSIS (PCA)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Seven 2-D points used as the demo dataset.
X = np.array([[2, 8], [3, 7], [4, 6], [5, 5], [6, 4], [7, 3], [8, 2]])

# Project onto a single principal component.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X)

print("Original Data:\n", X)
print("Transformed Data (1D):\n", X_pca)

# Map the 1-D projection back into the original 2-D space.
X_inv = pca.inverse_transform(X_pca)
print("Reconstructed Data (from 1D):\n", X_inv)

# Draw the original and reconstructed points in two side-by-side panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (X, 'Original Data',
     dict(color='blue', label='Original Data')),
    (X_inv, 'Reconstructed Data (from 1D PCA)',
     dict(color='red', marker='x', label='Reconstructed (from 1D)')),
)
for panel, (points, panel_title, marker_style) in zip(axes, panels):
    panel.scatter(points[:, 0], points[:, 1], **marker_style)
    panel.set_title(panel_title)
    panel.set_xlabel('Feature 1')
    panel.set_ylabel('Feature 2')
    panel.legend()

plt.suptitle('PCA Demonstration: Original vs Reconstructed Data')
# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Naive Bayes


In [None]:
# 📥 Load dataset from OpenML
from sklearn.datasets import fetch_openml
import pandas as pd

# Download version 2 of the "adult" dataset and keep its combined
# features-plus-target frame.
adult_bunch = fetch_openml("adult", version=2, as_frame=True)
df = adult_bunch.frame
df.head()

In [None]:
# ✅ Turn '?' placeholders into real missing values, then discard every row
# that has at least one missing entry.
df = df.replace('?', pd.NA)
df = df.dropna()

In [None]:
# 🎯 Encode categorical features
from sklearn.preprocessing import LabelEncoder

# Integer-encode every category-typed column, each with its own encoder.
categorical_columns = df.select_dtypes(include="category").columns
for column_name in categorical_columns:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

# Separate the "class" label (y) from the remaining predictor columns (X).
y = df["class"]
X = df.drop(columns="class")

In [None]:
# 🔀 Train/test split
from sklearn.model_selection import train_test_split

# Reserve 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# 🤖 Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a Gaussian Naive Bayes classifier on the training rows
# (fit() returns the estimator itself, so it can be chained).
model = GaussianNB().fit(X_train, y_train)

# 📊 Predict labels for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# 📈 Evaluation: overall accuracy followed by the per-class report.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)
print("✅ Accuracy:", nb_accuracy)
print("\n📋 Classification Report:\n", nb_report)

In [None]:
# Put the true and predicted labels side by side for inspection.
comparison_columns = {"Actual": y_test.values, "Predicted": y_pred}
output_df = pd.DataFrame(comparison_columns)

# Print only the first ten rows.
print("\n🔮 Sample Predictions:\n")
print(output_df.iloc[:10])


# Random Forest Classifier

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, roc_curve, auc
)

#  Load dataset (semicolon-separated CSV).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(url, sep=';')

#  Binary classification: Good (>=6) -> 1, Bad (<6) -> 0.
#  A vectorised comparison replaces the per-row lambda.
df['quality_label'] = (df['quality'] >= 6).astype(int)
X = df.drop(['quality', 'quality_label'], axis=1)
y = df['quality_label']

#  Train/test split, stratified so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#  Random Forest with class balancing
model = RandomForestClassifier(n_estimators=150, max_depth=12, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of the positive class (label 1)

#  Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["Bad Wine", "Good Wine"]))

#  Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.title("Confusion Matrix - Wine Quality (Binary)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

#  ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})', color='darkorange')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='navy')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Wine Quality Classifier")
plt.legend()
plt.tight_layout()
plt.show()

#  Feature Importance, sorted ascending by importance.
feat_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=True)

plt.figure(figsize=(8, 5))
#  seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
#  variable and hiding the redundant legend keeps the per-bar colours.
#  NOTE(review): the `legend` keyword needs seaborn >= 0.13 — confirm the
#  environment's version.
sns.barplot(x='Importance', y='Feature', hue='Feature', data=feat_importance,
            palette='viridis', legend=False)
plt.title("Feature Importance (Random Forest)")
plt.tight_layout()
plt.show()

#  Class Balance Check (same hue/legend treatment as the bar plot above).
sns.countplot(x=y, hue=y, palette='Set2', legend=False)
plt.title("Good vs Bad Wine Class Distribution")
plt.xlabel("Wine Class (0 = Bad, 1 = Good)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


# K Means Clustering

In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Iris dataset
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(url)


print(df.head())

# Cluster on the measurements only; the species label is held back for comparison.
X = df.drop("species", axis=1)


# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia for K = 1..6.
# n_init is set explicitly because its default differs between scikit-learn
# versions; pinning it keeps results stable and avoids the FutureWarning.
inertia = []
K_range = range(1, 7)
for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.figure(figsize=(6,4))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia (Sum of Squared Distances)")
plt.title("Elbow Method for Iris Clustering")
plt.grid(True)
plt.show()

# KMeans with K=3
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df['Cluster'] = clusters

# Visualize clusters with PCA in 2D
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)
df['PC1'], df['PC2'] = components[:,0], components[:,1]

plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, alpha=0.8)
plt.title("K-Means Clusters on Iris (PCA projection)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()

# Compare clusters with true species
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='species', palette='Set2', s=100, alpha=0.8)
plt.title("Actual Iris Species (PCA projection)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend(title='Species')
plt.grid(True)
plt.tight_layout()
plt.show()

# Cluster centers in original feature space: undo the standardisation so the
# centres read in the original units.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
center_df = pd.DataFrame(centers, columns=X.columns).round(2)
print("Cluster Centers (unscaled feature averages):")
print(center_df)

# Cluster composition summary: count of each species within each cluster.
print("\nCluster Composition:")
print(df.groupby(['Cluster', 'species']).size().unstack(fill_value=0))
