### Questions: 
1. Read and analyze the data
2. Split the dataset into train and test sets
3. Check shape of training and test sets
4. Perform scaling in the data using Standard Scaler
5. Calculate the % of missing values in a column
6. Remove features with missing values for greater than 20%
7. If the missing values are less than 20%, do data imputation (mean/median)
8. Remove the outliers
9. Use sklearn VarianceThreshold to find the constant features and display the constant features
10. Remove features with low variance
11. Remove highly correlated features
12. Perform Univariate feature selection (SelectKBest, SelectPercentile)
13. Apply Pearson Correlation Coefficient / Spearman's rank coefficient and find Correlation-Matrix with Heatmap
14. Apply Principal Component Analysis (PCA) for matrix factorization
15. Apply Linear Discriminant Analysis (LDA) to perform feature extraction


In [1]:
import pandas as pd

# Create a sample dataset
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [10, 20, None, 40, 50]
})

# Perform initial exploration and analysis of the dataset
print(data.head())  # Display the first few rows of the dataset
print(data.info())  # Display information about the dataset


   A  B     C
0  1  a  10.0
1  2  b  20.0
2  3  c   NaN
3  4  d  40.0
4  5  e  50.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       5 non-null      int64  
 1   B       5 non-null      object 
 2   C       4 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes
None


In [2]:
from sklearn.model_selection import train_test_split

# Create a sample dataset with target variable
data_with_target = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [10, 20, None, 40, 50],
    'Target': [0, 1, 0, 1, 0]
})

# Split the dataset into train and test sets
train_data, test_data = train_test_split(data_with_target, test_size=0.2, random_state=42)

# Optional: Print the shapes of the training and test sets
print("Training set shape:", train_data.shape)
print("Test set shape:", test_data.shape)


Training set shape: (4, 4)
Test set shape: (1, 4)


In [3]:
# Print the shapes of the training and test sets
print("Training set shape:", train_data.shape)
print("Test set shape:", test_data.shape)


Training set shape: (4, 4)
Test set shape: (1, 4)


In [4]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the scaler on the training data
train_data_scaled = scaler.fit_transform(train_data[['A', 'C']])

# Transform the test data using the fitted scaler
test_data_scaled = scaler.transform(test_data[['A', 'C']])


In [5]:
# Calculate the percentage of missing values in each column
missing_percentage = (data.isnull().sum() / len(data)) * 100
print("Missing values percentage:\n", missing_percentage)


Missing values percentage:
 A     0.0
B     0.0
C    20.0
dtype: float64


In [6]:
# Remove features with missing values greater than 20%
data = data.dropna(thresh=len(data) * 0.8, axis=1)


In [7]:
# Impute missing values with mean for columns with less than 20% missing values
data = data.fillna(data.mean())


TypeError: Could not convert ['abcde'] to numeric

In [None]:
# Remove outliers using z-score method
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(data.select_dtypes(include=np.number)))
data = data[(z_scores < 3).all(axis=1)]


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Initialize VarianceThreshold
selector = VarianceThreshold()

# Fit the selector on the data
selector.fit(data)

# Get the constant features
constant_features = data.columns[~selector.get_support()]

print("Constant features:", constant_features)


In [None]:
# Remove features with low variance
data = data.drop(constant_features, axis=1)


In [None]:
# Calculate correlation matrix
corr_matrix = data.corr().abs()

# Create a boolean mask of highly correlated features
highly_correlated = (corr_matrix > 0.8) & (corr_matrix < 1)

# Drop the highly correlated features
data = data.drop(highly_correlated.columns[highly_correlated.any()], axis=1)


In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif

# Perform SelectKBest feature selection
selector = SelectKBest(score_func=f_classif, k=2)
selected_features = selector.fit_transform(data.drop("Target", axis=1), data["Target"])

# Perform SelectPercentile feature selection
selector = SelectPercentile(score_func=f_classif, percentile=50)
selected_features = selector.fit_transform(data.drop("Target", axis=1), data["Target"])


In [8]:
import seaborn as sns

# Calculate Pearson correlation coefficient
pearson_corr = data.corr(method='pearson')

# Calculate Spearman's rank coefficient
spearman_corr = data.corr(method='spearman')

# Plot correlation matrix heatmap
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm')


ValueError: could not convert string to float: 'a'

In [9]:
from sklearn.decomposition import PCA

# Initialize PCA
pca = PCA(n_components=2)

# Fit and transform the data
principal_components = pca.fit_transform(data.drop("Target", axis=1))


KeyError: "['Target'] not found in axis"

In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Initialize LDA
lda = LinearDiscriminantAnalysis(n_components=2)

# Fit and transform the data
lda_components = lda.fit_transform(data.drop("Target", axis=1), data["Target"])


KeyError: "['Target'] not found in axis"