In [None]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

pd.options.display.float_format = '{:.3f}'.format
warnings.filterwarnings("ignore")


In [None]:
# Load the dataset and print a quick structural summary.
df = pd.read_csv("//content/breast-cancer.csv")
df.info()
df.describe().T
df.columns

# Drop the 'id' column so it is not treated as a feature, then separate the
# feature matrix X from the binary label y (M -> 1, B -> 0).
df = df.drop(columns=['id'])
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].map({'M': 1, 'B': 0})

def split(df, label):
    """Split features and labels into train and test partitions.

    Delegates to ``train_test_split`` with a 25% test share and a fixed
    ``random_state`` of 42.

    Parameters
    ----------
    df : features passed to ``train_test_split``.
    label : labels passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train features, test features, train labels, test labels)``.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        df,
        label,
        random_state=42,
        test_size=0.25,
    )
    return (train_x, test_x, train_y, test_y)

# Normalization

scaler = Normalizer()

# Normalizer only accepts numeric input, so restrict the transform to the
# numeric columns; the 'diagnosis' column holds the strings 'M' / 'B'.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# NOTE(review): Normalizer rescales each *row* to unit norm; if per-feature
# scaling was intended, MinMaxScaler (already imported) may be the right tool
# -- confirm.
# NOTE(review): X was derived from df before this step, so the split below
# works on the unscaled features -- confirm that is intended.
dataـbc = split(X, y)


In [None]:
# correlation

def correlation(dataset, cor):
    """Return a copy of ``dataset`` without its highly correlated columns.

    The pairwise correlation matrix is taken from ``dataset.corr()``. A column
    is dropped when its absolute correlation with any column that precedes it
    in the matrix is strictly greater than ``cor``; the earlier column of such
    a pair is kept. The input frame is left unmodified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    cor : float
        Absolute-correlation threshold above which the later column is dropped.

    Returns
    -------
    pandas.DataFrame
        New frame containing the surviving columns in their original order.
    """
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    # Only the lower triangle (j < i) is scanned so each pair is tested once
    # and it is always the later column of a pair that gets removed.
    col_corr = [
        columns[i]
        for i in range(len(columns))
        if any(abs(corr_matrix.iloc[i, j]) > cor for j in range(i))
    ]
    return dataset.drop(columns=col_corr)

# Shape of the feature matrix after dropping columns correlated above 0.9.
correlation(X, 0.9).shape


In [None]:
# variance

def variance_threshold(dataـbc, th):
    """Return the columns of ``dataـbc`` retained by a ``VarianceThreshold``.

    A ``VarianceThreshold(threshold=th)`` selector is fitted on the frame and
    its support mask is used to pick the surviving columns positionally.

    Parameters
    ----------
    dataـbc : pandas.DataFrame
        Frame to filter.
    th : float
        Value forwarded as the selector's ``threshold``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``dataـbc``.
    """
    selector = VarianceThreshold(threshold=th).fit(dataـbc)
    keep_mask = selector.get_support()
    return dataـbc.iloc[:, keep_mask]


In [None]:
# Apply the variance filter (threshold 0.1) to the feature matrix and show
# the shape of what remains.
dataـbc2 = variance_threshold(X, 0.1)
dataـbc2.shape


In [None]:
# MI

# Standardised copy of the feature matrix used for scoring. It is built here
# because no earlier cell defines X_scaled.
X_scaled = StandardScaler().fit_transform(X)


def seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random_state.

    The fixed seed makes the scores, and therefore the selected set,
    identical across reruns.
    """
    return mutual_info_classif(features, target, random_state=42)


selector = SelectKBest(score_func=seeded_mutual_info, k=10)
selector.fit(X_scaled, y)
# Reuse the scores the selector ranked on, so the plotted values and the
# selected set always agree.
mi_scores = selector.scores_
selected_features = X.columns[selector.get_support()]
mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information Score': mi_scores})
mi_df['Selected'] = np.where(
    mi_df['Feature'].isin(selected_features), 'Selected', 'Not Selected'
)

plt.figure(figsize=(12, 6))
sns.barplot(x='Mutual Information Score', y='Feature', hue='Selected', data=mi_df, palette={'Selected': 'green', 'Not Selected': 'gray'})
plt.title('Mutual Information Scores for Breast Cancer Features (Top Features Highlighted)')
plt.xlabel('Mutual Information Score')
plt.ylabel('Feature')
plt.legend(title='Feature Selection')
plt.show()


In [None]:
# forward feature selection

clf = ExtraTreesClassifier(n_estimators=60, n_jobs=-1, random_state=42)

# Greedy forward search for 9 features with scikit-learn's
# SequentialFeatureSelector; scoring and cv are left at the library defaults.
sfs1 = SequentialFeatureSelector(
    clf, n_features_to_select=9, direction="forward"
)
sfs1 = sfs1.fit(X, y)


In [None]:
# backward feature selection

def backward_elimination_kmeans(X, y, threshold=0.1):
    """Iteratively drop features based on the silhouette score of a 2-cluster KMeans.

    Each round fits KMeans on the current feature set to get a baseline
    silhouette score, then re-fits once per feature with that feature left
    out. The feature whose removal yields the lowest score is removed when
    that score is below ``baseline - threshold``; otherwise the loop stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are the candidate features.
    y : unused
        Accepted but never read by this function.
    threshold : float, default 0.1
        Margin subtracted from the baseline score in the removal test.

    Returns
    -------
    list
        Names of the features that remain.
    """
    features = X.columns.tolist()
    # At least two features are required: every round clusters on
    # "all features but one", which would be an empty frame otherwise.
    while len(features) > 1:
        kmeans = KMeans(n_clusters=2, random_state=42)
        kmeans.fit(X[features])
        silhouette_avg = silhouette_score(X[features], kmeans.labels_)
        print(f"feature: {features}, silhouette score: {silhouette_avg:.4f}")
        scores = []
        for feature in features:
            temp_features = [f for f in features if f != feature]
            kmeans_temp = KMeans(n_clusters=2, random_state=42)
            kmeans_temp.fit(X[temp_features])
            score = silhouette_score(X[temp_features], kmeans_temp.labels_)
            scores.append((feature, score))
        worst_feature, worst_score = min(scores, key=lambda item: item[1])
        # NOTE(review): this removes the feature whose removal *lowers* the
        # score the most. Backward elimination usually removes the feature
        # whose removal hurts least -- confirm the intended criterion.
        if worst_score < silhouette_avg - threshold:
            features.remove(worst_feature)
            print(f"feature '{worst_feature}' deleted")
        else:
            break
    return features

selected_features = backward_elimination_kmeans(X, y)
print(selected_features)


In [None]:
# comprehensive (exhaustive) feature selection

# Training partition of the split stored in dataـbc, which holds the
# (train X, test X, train y, test y) tuple returned by split().
X_train, X_test, y_train, y_test = dataـbc

model = LogisticRegression(solver='liblinear')
# NOTE(review): ExhaustiveFeatureSelector is not imported anywhere in the
# visible notebook. The keywords used here (min_features, max_features,
# print_progress) look like mlxtend.feature_selection.ExhaustiveFeatureSelector
# -- confirm and add that import to the imports cell.
# NOTE(review): the search evaluates every feature subset of size 1 to 10,
# which grows combinatorially with the number of columns -- confirm it is
# tractable for this dataset.
efs = ExhaustiveFeatureSelector(model,
                                min_features=1,
                                max_features=10,
                                scoring='accuracy',
                                print_progress=True)

efs = efs.fit(X_train.values, y_train.values)

selected_features = X.columns[list(efs.best_idx_)]
print("selected features", selected_features.tolist())

print("model accuracy selected features:", efs.best_score_)



In [None]:
# regularization (ridge & lasso)

# Partitions of the split stored in dataـbc, which holds the
# (train X, test X, train y, test y) tuple returned by split().
train_data, test_data, train_label, test_label = dataـbc

classifiers = ['Linear Reg',
               'Ridge a = 0.001','Ridge a = 0.01','Ridge a = 0.1',
               'Lasso a = 0.001','Lasso a = 0.01','Lasso a = 0.1']
models = [LinearRegression(),
          Ridge(alpha = 0.001),Ridge(alpha = 0.01),Ridge(alpha = 0.1),
          Lasso(alpha = 0.001),Lasso(alpha = 0.01),Lasso(alpha = 0.1)]

splits = 5
# One seeded splitter shared by all models: every model is scored on the same
# folds and reruns reproduce the same table.
cv = KFold(n_splits=splits, shuffle=True, random_state=42)

# Per-fold RMSE for each model; the scorer returns negated MSE, hence the sign flip.
fold_rmse = {
    name: np.sqrt(-cross_val_score(model, train_data, train_label,
                                   scoring='neg_mean_squared_error', cv=cv))
    for name, model in zip(classifiers, models)
}

folds = ['Fold ' + str(f) for f in range(1, splits + 1)]
KFold_Score = pd.DataFrame(fold_rmse, index=folds)
KFold_Score.loc['Mean'] = KFold_Score.mean()
folds.append('Mean')

# One row per model, best (lowest mean RMSE) first.
KFold_Score = KFold_Score.T.sort_values(by=['Mean'], ascending = True)
KFold_Score = KFold_Score.reset_index()
KFold_Score.rename(columns={"index": "Regression Model"}, inplace=True)
KFold_Score


In [None]:
# Partitions of the split stored in dataـbc, which holds the
# (train X, test X, train y, test y) tuple returned by split().
train_data, test_data, train_label, test_label = dataـbc

# Fit every model on the training partition and record its hold-out RMSE.
rmse_values = []
for model in models:
    model.fit(train_data, train_label)
    test_pred = model.predict(test_data)
    # np.sqrt(MSE) is used instead of squared=False, which recent
    # scikit-learn releases no longer accept.
    rmse_values.append(np.sqrt(mean_squared_error(test_label, test_pred)))

Test_Score = pd.DataFrame({"Regression Model": classifiers, "RMSE": rmse_values})

Test_Score = Test_Score.sort_values(by=['RMSE'], ascending = True)
Test_Score.reset_index(drop=True)


In [None]:
# random forest feature importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

# Pair each column name with its importance, most important first.
feat_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

feat_importances = feat_importances.sort_values(by='Importance', ascending=False)
print(feat_importances)

plt.figure(figsize=(10,6))
plt.barh(feat_importances['Feature'], feat_importances['Importance'], color='skyblue')
plt.xlabel('feature_importance')
plt.ylabel('features')
plt.title('feature_importance_whit_randomforest')
# barh draws the first row at the bottom; invert so the top feature is on top.
plt.gca().invert_yaxis()
plt.show()


In [None]:
#Kmeans_model

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# train_test_split lives in sklearn.model_selection; the sklearn.cross_validation
# module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# PCA comes from scikit-learn; matplotlib.mlab no longer provides a PCA class.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from subprocess import check_output
from pandas import read_csv

#Normalize
def normalizeData(x):
    """Min-max scale the ``radius_mean`` and ``fractal_dimension_worst`` columns.

    Each of the two columns is rewritten in place as
    ``(value - column_min) / (column_max - column_min)``. A column whose
    values are all equal is set to 0.0 instead of dividing by zero.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing both columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``x``, after scaling.
    """
    # NOTE(review): only these two columns are scaled. If the intent was every
    # column from radius_mean through fractal_dimension_worst, iterate over
    # that column slice instead -- confirm.
    for idx in ("radius_mean", "fractal_dimension_worst"):
        col_min = x[idx].min()
        col_range = x[idx].max() - col_min
        if col_range == 0:
            x[idx] = 0.0
        else:
            # Parenthesised so the shift happens before the division.
            x[idx] = (x[idx] - col_min) / col_range
    return x

def performPCA(x):
    """Fit a 9-component whitened PCA on ``x`` and return ``x`` transformed by it.

    Parameters
    ----------
    x : data passed to ``PCA.fit`` and then to ``PCA.transform``.

    Returns
    -------
    The output of ``transform`` for the fitted model.
    """
    reducer = PCA(whiten=True, n_components=9)
    fitted = reducer.fit(x)
    return fitted.transform(x)

def createTrainTestDataSet(df_x,df_y):
    """Split features and labels into train and test partitions.

    Uses ``train_test_split`` with ``test_size=.2`` and ``random_state=4``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    parts = train_test_split(df_x, df_y, random_state=4, test_size=.2)
    train_x, test_x, train_y, test_y = parts
    return train_x, test_x, train_y, test_y

#Kmeans_function

def kmeansClustering(x_train, y_train, n_clusters=2, random_state=42):
    """Fit a KMeans model on ``x_train`` and return the fitted estimator.

    Parameters
    ----------
    x_train : training features to cluster.
    y_train : unused
        Kept so existing calls keep working; it is not passed to KMeans.
    n_clusters : int, default 2
        Number of clusters.
    random_state : int or None, default 42
        Seed for centroid initialisation, so reruns give the same clustering.
        It matches the seed used by the other KMeans calls in this notebook.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model.
    """
    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=random_state,
    )
    return model.fit(x_train)

def envSetup():
    """Run the data-preparation steps and return the train/test partitions.

    Steps, in order: readDataSet(), cleanseDataSet() (whose result is unpacked
    as ``df_x, df_y``), normalizeData() on ``df_x``, and
    createTrainTestDataSet(). A progress message is printed around each step.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` from createTrainTestDataSet.
    """
    # NOTE(review): readDataSet and cleanseDataSet are not defined in the
    # visible part of this notebook -- confirm they exist before this runs.
    print("Read data from CSV ::")
    bcData=readDataSet()
    print("Dataset created successfully!!!")
    print("Cleanse dataset ::")
    df_x,df_y=cleanseDataSet(bcData)
    print("Dataset cleansed successfully!!!")
    print("Normalize dataset ::")
    df_x=normalizeData(df_x)
    print("Dataset Normalized successfully!!!")
    # This step performs the train/test split (no PCA is run here), so the
    # message names the split.
    print("Create Train & Test Data set ::")
    x_train,x_test,y_train,y_test=createTrainTestDataSet(df_x,df_y)
    print("Dataset divided as 80% train dataset & 20 test dataset")
    return x_train,x_test,y_train,y_test


In [None]:
def choiceKMeans():
    """Train, apply and evaluate the K-Means model, printing progress as it goes.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Prints the confusion matrix and the accuracy; returns nothing.
    """
    # NOTE(review): getPrediction, getConfusionMatrix and getAccuracy are not
    # defined in the visible part of this notebook -- confirm they exist.
    print("++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++")
    print("Perform K_Means Classification ::")
    kmeans_model = kmeansClustering(x_train, y_train)
    print("K-Means train model created successfully!!!")

    print("Validate the clustering model ::")
    predicted = getPrediction(kmeans_model, x_test)
    print("Clustering model tested successfully!!!")

    print("Get the Confussion Matrix ::")
    print(getConfusionMatrix(y_test, predicted))

    print("Get the Accuracy ::")
    print(getAccuracy(y_test, predicted))


In [None]:
# For correlation
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no correlation filter is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For variance-threshold
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no variance filter is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For MI (mutual information)
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no MI-based selection is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For forward feature selection
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no forward selection is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For backward feature selection
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no backward elimination is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For comprehensive (exhaustive) feature selection
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no exhaustive selection is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For regularization-based feature selection
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no regularization step is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!


In [None]:
# For random-forest feature selection
# NOTE(review): envSetup() takes no arguments, so this cell runs the same
# pipeline as the other "For ..." cells; no importance-based selection is applied here.
(x_train, x_test, y_train, y_test) = envSetup()

Read data from CSV ::
Dataset created successfully!!!
Cleanse dataset ::
Dataset cleansed successfully!!!
Normalize dataset ::
Dataset Normalized successfully!!!
Create Train & Test Data set ::
Dataset divided as 80% train dataset & 20 test dataset
++++++++++++++++++ K_MEANS CLUSTERING ++++++++++++++++++
Perform K_Means Classification ::
K-Means train model created successfully!!!
Validate the clustering model ::
Clustering model tested successfully!!!
