<a href="https://colab.research.google.com/github/Ashmit990/AI-Worksheet-0/blob/main/Worksheet9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [4]:
from google.colab import files
uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [6]:
# Load dataset
df = pd.read_csv("IMDB Dataset.csv")   # columns: review, sentiment
df["sentiment"] = df["sentiment"].map({"positive":1, "negative":0})

# Clean text
df["review"] = df["review"].str.lower().apply(lambda x: re.sub("[^a-z ]","",x))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

# Vectorization
cv = CountVectorizer(stop_words="english")
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

# Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8571
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



Confusion Matrix and ROC-AUC

> Add blockquote



In [7]:
from sklearn.metrics import confusion_matrix, roc_auc_score

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))


Confusion Matrix:
 [[4346  615]
 [ 814 4225]]
ROC-AUC: 0.9248014849223428


TASK 3: Feature Selection â€“ SelectKBest

In [8]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif


In [None]:
X, y = load_breast_cancer(return_X_y=True)

selector = SelectKBest(f_classif, k=5)
X_new = selector.fit_transform(X, y)

print("Selected Feature Indices:", selector.get_support(indices=True))


TASK 4: RFE

In [17]:
# TASK 4: Wrapper Method (RFE) - FINAL FIXED VERSION

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling (THIS FIXES THE WARNING)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# RFE with Logistic Regression
model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=5)

X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Selected features
print("Selected Features:")
for f in feature_names[rfe.support_]:
    print("-", f)

# Train and evaluate
model.fit(X_train_rfe, y_train)
y_pred = model.predict(X_test_rfe)
y_prob = model.predict_proba(X_test_rfe)[:, 1]

print("\nPerformance (Selected Features)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

# Comparison with all features
model.fit(X_train, y_train)
y_pred_all = model.predict(X_test)

print("\nComparison")
print("Accuracy (All Features):", accuracy_score(y_test, y_pred_all))
print("Accuracy (Selected Features):", accuracy_score(y_test, y_pred))

# Experiment with different feature counts
for k in [3, 7]:
    rfe_k = RFE(model, n_features_to_select=k)
    X_train_k = rfe_k.fit_transform(X_train, y_train)
    X_test_k = rfe_k.transform(X_test)

    model.fit(X_train_k, y_train)
    y_pred_k = model.predict(X_test_k)

    print(f"Accuracy with top {k} features:",
          accuracy_score(y_test, y_pred_k))


Selected Features:
- radius error
- worst radius
- worst texture
- worst area
- worst concave points

Performance (Selected Features)
Accuracy: 0.9736842105263158
Precision: 0.9722222222222222
Recall: 0.9859154929577465
F1 Score: 0.9790209790209791
ROC-AUC: 0.9977071732721914

Comparison
Accuracy (All Features): 0.9736842105263158
Accuracy (Selected Features): 0.9736842105263158
Accuracy with top 3 features: 0.9649122807017544
Accuracy with top 7 features: 0.9736842105263158


TASK 5: Model Comparison

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [19]:
# All features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
print("All Features Accuracy:", accuracy_score(y_test, model.predict(X_test)))

# Selected features
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2)
model.fit(X_train, y_train)
print("Selected Features Accuracy:", accuracy_score(y_test, model.predict(X_test)))


All Features Accuracy: 0.956140350877193
Selected Features Accuracy: 0.9122807017543859


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


TASK 6: Naive Bayes From Scratch

In [20]:
import numpy as np
from collections import defaultdict


In [21]:
texts = ["hello friend", "win money now", "hello win"]
labels = np.array([0,1,1])  # 0=ham, 1=spam

vocab = set(word for t in texts for word in t.split())
vocab = list(vocab)

prior = {
    0: np.log((labels==0).sum()/len(labels)),
    1: np.log((labels==1).sum()/len(labels))
}

likelihood = defaultdict(lambda:[1,1])  # Laplace
class_words = [0,0]

for t,l in zip(texts,labels):
    for w in t.split():
        likelihood[w][l]+=1
        class_words[l]+=1

def predict(text):
    scores = prior.copy()
    for w in text.split():
        if w in likelihood:
            scores[0]+=np.log(likelihood[w][0]/class_words[0])
            scores[1]+=np.log(likelihood[w][1]/class_words[1])
    return 1 if scores[1]>scores[0] else 0

print(predict("win money"))
print(predict("hello friend"))


1
0
