In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Sklearn modules
from sklearn.datasets import load_breast_cancer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler


#Solution for Question 1: Email Classification Using Naive Bayes

In [None]:
# Step 1: Importing Libraries

# Step 2: Load the Dataset
df = pd.read_csv("/content/drive/MyDrive/Concept and technology of AI/spam_ham_dataset.csv", index_col=0)



In [None]:
# Step 3: Text Preprocessing
nltk.download('stopwords')
ps = PorterStemmer()
corpus = []

for i in range(len(df['text'])):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    # Join the processed words back into a single string
    text = ' '.join(text)
    corpus.append(text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Step 4: Creating Feature Matrix and Label Vector
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()  # Feature matrix
y = df['label_num'].values             # Labels (spam: 1, ham: 0)


In [None]:
# Step 5: Splitting the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Step 6: Training the Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
# Step 7: Evaluating the Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6589371980676328
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.85      0.78       742
           1       0.32      0.18      0.23       293

    accuracy                           0.66      1035
   macro avg       0.52      0.52      0.51      1035
weighted avg       0.61      0.66      0.63      1035

Confusion Matrix:
 [[628 114]
 [239  54]]


#Question 2: Sentiment Analysis Using Naive Bayes

In [None]:
# Step 1: Load Dataset
nltk.download('stopwords')
data = pd.read_csv("/content/drive/MyDrive/Concept and technology of AI/IMDB Dataset.csv")  # Replace with the actual path
print(data.head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Step 2: Preprocess Text Data
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    text = text.lower().split()  # Convert to lowercase and tokenize
    text = [ps.stem(word) for word in text if word not in stop_words]  # Remove stopwords & apply stemming
    return ' '.join(text)

# Replace original text with cleaned text
data['review_clean'] = data['review'].apply(preprocess_text)

In [None]:
# Step 3: Feature Matrix and Label Vector
cv = CountVectorizer(max_features=5000)  # Limit features for efficiency
X = cv.fit_transform(corpus).toarray()
y = data['sentiment'].map({'positive': 1, 'negative': 0}).values

In [None]:
# Step 4: Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 5: Train the Model
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Step 6: Evaluate the Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.9385964912280702
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.84      0.91        43
           1       0.91      1.00      0.95        71

    accuracy                           0.94       114
   macro avg       0.96      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114

Confusion Matrix:
 [[36  7]
 [ 0 71]]
ROC-AUC Score: 0.985260399606944


#Question 3: Feature Selection Methods in Scikit-learn

In [8]:
# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target


In [9]:
# Select top 10 features using ANOVA F-test
selector_kbest = SelectKBest(score_func=f_classif, k=10)
X_new_kbest = selector_kbest.fit_transform(X, y)

# Get selected feature names
selected_features_kbest = X.columns[selector_kbest.get_support()]
print("Selected Features (SelectKBest):", selected_features_kbest.tolist())


Selected Features (SelectKBest): ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']


In [10]:
# Select top 20% of features
selector_percentile = SelectPercentile(score_func=f_classif, percentile=20)
X_new_percentile = selector_percentile.fit_transform(X, y)

# Get selected features
selected_features_percentile = X.columns[selector_percentile.get_support()]
print("Selected Features (SelectPercentile):", selected_features_percentile.tolist())


Selected Features (SelectPercentile): ['mean perimeter', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points']


In [11]:
# Use "k best" mode
selector_generic = GenericUnivariateSelect(score_func=f_classif, mode='k_best', param=10)
X_new_generic = selector_generic.fit_transform(X, y)

# Get selected features
selected_features_generic = X.columns[selector_generic.get_support()]
print("Selected Features (GenericUnivariateSelect):", selected_features_generic.tolist())


Selected Features (GenericUnivariateSelect): ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']


In [12]:
# Initialize logistic regression model
model = LogisticRegression(max_iter=200)

# Apply Recursive Feature Elimination to select top 10 features
rfe = RFE(estimator=model, n_features_to_select=10)
X_new_rfe = rfe.fit_transform(X, y)

# Get selected features
selected_features_rfe = X.columns[rfe.get_support()]
print("Selected Features (RFE):", selected_features_rfe.tolist())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected Features (RFE): ['mean radius', 'mean concavity', 'mean concave points', 'mean symmetry', 'perimeter error', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply Lasso regularization
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get selected features
selected_features_lasso = X.columns[lasso.coef_ != 0]
print("Selected Features (Lasso):", selected_features_lasso.tolist())


Selected Features (Lasso): ['mean concave points', 'worst radius', 'worst texture', 'worst concave points']


In [14]:
# Train a RandomForest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance scores
importances = rf.feature_importances_

# Select features with importance > threshold (e.g., 0.02)
selected_features_rf = X.columns[importances > 0.02]
print("Selected Features (Random Forest):", selected_features_rf.tolist())


Selected Features (Random Forest): ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'area error', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']
