# TY34 Adnan

## Aim: Apply feature selection techniques like variance thresholding and correlation analysis using Python's scikit-learn library to reduce dimensionality in a dataset.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [None]:
#iris = pd.read_csv("iris.csv")
#data = pd.DataFrame(data= np.c_[iris['data'], iris['target']], columns= iris['feature_names'] + ['target'])
#print("Original Dataset:")
data = pd.read_csv("iris.csv")
print(data.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [None]:
data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [None]:
data.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [None]:
X = data.iloc[:, :-1]
y = data['Species']

variance_threshold = 0.2
selector = VarianceThreshold(threshold=variance_threshold)
X_high_variance = selector.fit_transform(X)

print("\nDataset after Variance Thresholding:")
print(pd.DataFrame(X_high_variance, columns=X.columns[selector.get_support(indices=True)]).head())


Dataset after Variance Thresholding:
    Id  SepalLengthCm  PetalLengthCm  PetalWidthCm
0  1.0            5.1            1.4           0.2
1  2.0            4.9            1.4           0.2
2  3.0            4.7            1.3           0.2
3  4.0            4.6            1.5           0.2
4  5.0            5.0            1.4           0.2


In [None]:
correlation_threshold = 0.7
correlation_matrix = X.corr()
upper_triangular_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
highly_correlated_features = [column for column in correlation_matrix.columns
                              if any(correlation_matrix[column].abs() > correlation_threshold)]
X_low_correlation = X.drop(columns=highly_correlated_features)

print("\nDataset after Correlation Analysis:")
print(X_low_correlation.head())


Dataset after Correlation Analysis:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_low_correlation, y, test_size=0.2, random_state=42)

In [None]:
k_best_features = 2
selector_kbest = SelectKBest(score_func=f_classif, k=k_best_features)
X_train_kbest = selector_kbest.fit_transform(X_train, y)
X_test_kbest = selector_kbest.transform(X_test)

print(f"\nDataset after SelectKBest Feature Selection (Top {k_best_features} features):")
print(pd.DataFrame(X_train_kbest, columns=X_low_correlation.columns[selector_kbest.get_support(indices=True)]).head())

ValueError: at least one array or dtype is required