|key|value|
|----|-----|
|Name:|M.Hamza|
|CMS ID:|407251|
|Course:|Machine Learning CS-470|
|Lab:|13|

In [None]:
!pip install numpy==1.26.4

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Hands On Feature Engineering

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy frame with one missing value per column, used by the imputation cells below
data = {
    'Age': [25, None, 35, 29],
    'Salary': [50000, 54000, None, 58000],
}
df = pd.DataFrame(data)
df

## Imputation

### Numerical Imputation with Mean

In [None]:
# Replace the missing Age entry with the mean of the observed ages
imputer = SimpleImputer(strategy='mean')
age_imputed = imputer.fit_transform(df[['Age']])  # imputer expects 2-D input, hence df[['Age']]
df['Age'] = age_imputed
df

### Imputing Salary with the Mean (a categorical column would instead replace NaN with 'Unknown')

In [None]:
# Salary is numeric, so its missing value is filled with the column mean.
# (A categorical column would instead be filled with a placeholder such as 'Unknown'.)
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
df

## Encoding

### One Hot Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Single categorical column with three distinct cities
data = {'City': ['London', 'Paris', 'Berlin']}
df = pd.DataFrame(data)

# One indicator column per distinct city value
city_column = df['City']
one_hot = pd.get_dummies(city_column)
one_hot

### Label Encoding

In [None]:
# Map every city name to an integer code and store it next to the original column
le = LabelEncoder()
city_codes = le.fit_transform(df['City'])
df['City_Label'] = city_codes
df

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Small numeric frame reused by the standardization and normalization cells
data = dict(Age=[25, 35, 29], Salary=[50000, 54000, 58000])
df = pd.DataFrame(data)

### Standardization

In [None]:
# Standardize both columns in place (overwrites the raw values in df)
scaler = StandardScaler()
columns_to_scale = ['Age', 'Salary']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
df

### Normalization

In [None]:
# Min-max scale both columns in place.
# If the standardization cell ran first, df already holds z-scores; min-max scaling
# is unaffected by that shift/scale, so the result is the same as scaling raw values.
normalizer = MinMaxScaler()
columns_to_normalize = ['Age', 'Salary']
df[columns_to_normalize] = normalizer.fit_transform(df[columns_to_normalize])
df

## Creating Interaction Features

In [None]:
# Toy housing data: bedroom count and house size
data = {'Bedrooms': [3, 4, 2], 'House_Size': [1000, 1200, 800]}
df = pd.DataFrame(data)

# Ratio feature: bedrooms divided by house size
df['Rooms_per_Square_Meter'] = df['Bedrooms'].div(df['House_Size'])
df

## Log Transformation

In [None]:
import numpy as np

# Incomes spanning several orders of magnitude
data = {'Income': [1000, 10000, 50000, 100000]}
df = pd.DataFrame(data)

# log(1 + x) compresses the range
income = df['Income']
df['Log_Income'] = np.log1p(income)
df

## Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Single numeric feature to expand
data = {'Feature': [2, 3, 4]}
df = pd.DataFrame(data)

# Degree-2 expansion without the constant (bias) column
poly = PolynomialFeatures(include_bias=False, degree=2)
feature_frame = df[['Feature']]
polynomial_features = poly.fit_transform(feature_frame)
polynomial_features

## Binning

In [None]:
# Ages to bucket into named groups
data = {'Age': [25, 35, 29, 50, 60]}
df = pd.DataFrame(data)

# Bin edges and the label of each interval between consecutive edges
bins = [0, 30, 50, 100]
labels = ['Young', 'Middle-Aged', 'Senior']
df = df.assign(Age_Group=pd.cut(df['Age'], bins=bins, labels=labels))
df

# Hands On With Feature Selection

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the students dataset from the working directory
data = pd.read_csv("students_performance.csv")

# Correlation is only defined for the numeric columns
numeric_data = data.select_dtypes(include=["number"])
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the pairwise correlations
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

### Identify feature pairs with high correlation (|r| > 0.85) as candidates for dropping

In [None]:
threshold = 0.85

# The correlation matrix is symmetric, so only pairs above the diagonal are
# visited; each correlated pair is reported once rather than as (A, B) and (B, A).
corr_columns = list(correlation_matrix.columns)
high_corr_pairs = [
    (col1, col2)
    for position, col1 in enumerate(corr_columns)
    for col2 in corr_columns[position + 1:]
    if abs(correlation_matrix.loc[col2, col1]) > threshold
]

# Output the highly correlated pairs
print(f"Highly correlated pairs (|correlation| > {threshold}):")
for col1, col2 in high_corr_pairs:
    print(f"{col1} - {col2}: {correlation_matrix.loc[col2, col1]:.2f}")

## Univariate Features

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Show the available columns before building the target
print("Dataset columns:", data.columns)

# Derive the binary target only when the CSV does not already provide it:
# a student passes when all three scores are at least 50.
if "passed_all" not in data.columns:
    score_columns = ["math score", "reading score", "writing score"]
    data["passed_all"] = data[score_columns].ge(50).all(axis=1)

# One-hot encode the categorical columns, dropping the first level of each
categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Target and feature matrix
# NOTE(review): X keeps the three score columns that passed_all is derived from,
# so they are expected to dominate every selector below.
y = data_encoded["passed_all"]
X = data_encoded.drop(columns=["passed_all"])

# Keep the 5 features with the highest ANOVA F-score
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

# Print the names of the selected features
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print("Selected Features:", list(selected_features))

## Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the eliminated/selected features are the same on every run
model = RandomForestClassifier(random_state=42)

# Recursively drop one feature per step until 5 remain
rfe_selector = RFE(estimator=model, n_features_to_select=5, step=1)
X_rfe = rfe_selector.fit_transform(X, y)

# Names of the features RFE kept
selected_rfe_features = X.columns[rfe_selector.support_]
print("RFE Selected Features:", list(selected_rfe_features))

## Feature importance from tree based model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Train a seeded Random Forest so the importances are reproducible across runs
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Plot feature importance, ordered from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.barh(X.columns[indices], importances[indices])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importance from Random Forest")
plt.show()

## Regularization

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Fit an L1-regularized linear model; weak features get their coefficients shrunk towards zero
lasso = Lasso(alpha=0.01).fit(X, y)

# Wrap the already-fitted model so it can be used as a feature selector
lasso_selector = SelectFromModel(lasso, prefit=True)
X_lasso = lasso_selector.transform(X)

# Names of the features the selector kept
support_mask = lasso_selector.get_support()
selected_lasso_features = X.columns[support_mask]
print("Lasso Selected Features:", list(selected_lasso_features))

# Visualizing Feature Importance

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Train a seeded Random Forest; this model is reused by the SHAP and
# permutation-importance cells, so seeding keeps those outputs reproducible too
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Get feature importances (one value per column of X)
importances = model.feature_importances_
features = X.columns

# Plot
plt.figure(figsize=(10, 6))
plt.barh(features, importances, color="skyblue")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.title("Feature Importance (Random Forest)")
plt.show()

## Shap Summary Plot

In [None]:
import shap

# Explain model predictions with a tree-specific SHAP explainer.
# `model` is the Random Forest fitted in the previous cell.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# SHAP summary plot
# NOTE(review): for a classifier the structure of shap_values (list per class vs.
# a single array) depends on the installed SHAP version — confirm the plot shows
# what is intended.
shap.summary_plot(shap_values, X)

## Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance

# Calculate permutation importance; random_state fixes the shuffles so the
# ranking is reproducible. Note this is measured on the same data the model was fit on.
perm_importance = permutation_importance(model, X, y, scoring="accuracy", random_state=42)

# Plot, least important at the bottom. Labels come straight from X.columns so the
# cell does not depend on a `features` variable defined in another cell.
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X.columns[sorted_idx], perm_importance.importances_mean[sorted_idx], color="lightcoral")
plt.xlabel("Permutation Importance")
plt.ylabel("Feature")
plt.title("Permutation Feature Importance")
plt.show()

# Mini Challenge

In [None]:
# Load the weather training data (path is relative to the notebook's working directory)
dataset = pd.read_csv('archive/Weather Training Data.csv')
dataset.head()

## Feature Engineering

### Remove unnecessary Features

In [None]:
# 'row ID' is dropped before modelling.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
dataset = dataset.drop(columns=['row ID'], errors='ignore')
dataset.head()

### Handling missing values

In [None]:
# Current (rows, columns) of the weather frame
dataset.shape

In [None]:
# Number of missing values in each column
dataset.isna().sum()

As we can see, some features have no missing values, some have fewer than 10k, and some have more than 10k.

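Features with fewer than 10k missing values are imputed directly (median for numeric columns, mode otherwise). For the features with more than 10k missing values, rows in which all four of them are missing are dropped and the remaining gaps are imputed the same way. (Training a model such as a random forest to predict the missing values would be an alternative; it is not done in this notebook.)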

In [None]:
# Columns excluded from missing-value handling
okay_features = ['Location', 'RainTomorrow']
# Columns with more than 10k missing values
over_10k = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']

# Everything except the excluded columns needs missing-value handling
cols_with_missing_values = [name for name in dataset.columns if name not in okay_features]
# ... and of those, whatever is not in the heavy group belongs to the light group
under_10k = [name for name in cols_with_missing_values if name not in over_10k]
under_10k

#### Remove rows with all missing values

In [None]:
# Shape before the filter, for comparison with the shape displayed below
print(dataset.shape)
# Drop a row only when every one of the tracked columns is missing in it
dataset = dataset.dropna(subset=cols_with_missing_values, how='all')
dataset.shape

#### Remove rows that have 5 or more missing values

In [None]:
# Keep only the rows with fewer than 5 missing values among the tracked columns
missing_per_row = dataset[cols_with_missing_values].isna().sum(axis=1)
dataset = dataset[missing_per_row < 5]

In [None]:
# Re-check the per-column missing-value counts after the row filters above
dataset.isna().sum()

### Imputing features with under 10k missing values

In [None]:
# Median for float columns, most frequent value for everything else
for column in under_10k:
    series = dataset[column]
    fill_value = series.median() if series.dtype == 'float64' else series.mode()[0]
    dataset[column] = series.fillna(fill_value)

dataset.isna().sum()

### Imputing features with over 10k missing values

In [None]:
# Shape of the subset of rows in which none of the heavily-missing columns is NaN.
# Indexing a frame with a boolean DataFrame masks individual cells instead of
# filtering rows, so a row-wise mask is needed for the shape to be informative.
# NOTE(review): assumes the intent was "rows complete in all over_10k columns" — confirm.
dataset[dataset[over_10k].notna().all(axis=1)].shape

In [None]:
# Drop the rows in which every heavily-missing column (over_10k) is NaN.
# Uses the over_10k list instead of repeating the four column names by hand.
all_heavy_missing = dataset[over_10k].isna().all(axis=1)
dataset = dataset[~all_heavy_missing]
dataset.isna().sum()

In [None]:
# (rows, columns) remaining after the row filters
dataset.shape

#### Imputing the remaining values in the dataset

In [None]:
# One fill value per heavily-missing column: median for float columns, mode otherwise
fill_values = {
    col: dataset[col].median() if dataset[col].dtype == 'float64' else dataset[col].mode()[0]
    for col in over_10k
}
# A dict passed to fillna only touches the columns named in it
dataset = dataset.fillna(value=fill_values)

dataset.isna().sum()

### One hot encoding the categorical columns

In [None]:
# Every object-dtype column is treated as categorical
categorical_columns = [name for name, dtype in dataset.dtypes.items() if dtype == 'object']

# Replace each categorical column by indicator columns, dropping the first level of each
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

In [None]:
# Column names after one-hot encoding
dataset.columns

In [None]:
# Preview the encoded frame
dataset.head()

### Feature Interaction

In [None]:
# Interaction term: element-wise product of MaxTemp and Evaporation
dataset['MaxTemp*Evaporation'] = dataset['MaxTemp'].mul(dataset['Evaporation'])
dataset.head()

### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler


# Only float64 columns are treated as numerical features to scale
numerical_columns = [name for name, dtype in dataset.dtypes.items() if dtype == 'float64']
numerical_columns

In [None]:
# Numerical columns before scaling
dataset[numerical_columns].head()

In [None]:
# Standardize the numerical columns in place.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# done later in train_and_evaluate_model — test rows influence the scaling statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(dataset[numerical_columns])
dataset[numerical_columns] = scaled_values
dataset[numerical_columns].head()

In [None]:
# Preview the full frame after scaling
dataset.head()

## Feature Selection

### Correlation Matrix

In [None]:
# Plot the correlation matrix of the numerical columns and the target variable.
# Title and plt.show() added for consistency with the other plot cells; plt.show()
# also keeps the Axes repr out of the cell output.
plt.figure(figsize=(10, 8))
sns.heatmap(dataset[numerical_columns + ['RainTomorrow']].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix (numerical features and RainTomorrow)")
plt.show()

### Drop highly correlated features

In [None]:
# Flag a column when it is strongly correlated (|r| > 0.85) with any column that
# comes before it in the matrix, so the earlier column of each pair is kept.
highly_correlated_features = []
correlation_matrix = dataset[numerical_columns].corr()
for i in range(len(correlation_matrix.columns)):
    colname = correlation_matrix.columns[i]
    # .iloc[i, :i] is the part of row i below the diagonal; any() records the
    # column once even when it correlates with several earlier columns.
    if (correlation_matrix.iloc[i, :i].abs() > 0.85).any():
        highly_correlated_features.append(colname)

highly_correlated_features

In [None]:
# Remove the columns flagged as highly correlated in the previous cell
dataset = dataset.drop(columns=highly_correlated_features)
dataset.head()

### Using Recursive Feature Elimination

In [None]:
# use recursive feature elimination with cross-validation to select the best features
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Work on a reproducible 50% subsample of the rows for the feature-selection step
all_features = dataset.drop(columns=['RainTomorrow'])
X = all_features.sample(frac=0.5, random_state=42)
y = dataset.loc[X.index, 'RainTomorrow']

# Recursive elimination, one feature per step, scored with 3-fold cross-validation
model = RandomForestClassifier(random_state=42)
selector = RFECV(model, step=1, cv=3, n_jobs=-1).fit(X, y)

# Columns kept by the selector
selected_features = X.columns[selector.support_]

selected_features

#### Features that were not selected

In [None]:
# Columns of X that RFECV did not keep
set(X.columns).difference(selected_features)

### Using Feature Importance

In [None]:
# Fit a forest on the sampled data and plot its feature importances
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

importances = model.feature_importances_
# Ascending order: barh draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart
indices = np.argsort(importances)

fig, ax = plt.subplots(figsize=(10, 18))
ax.barh(X.columns[indices], importances[indices])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance from Random Forest')
plt.show()

### Select the best features

In [None]:
# Final modelling data: all rows, restricted to the columns chosen by RFECV
y_org = dataset['RainTomorrow']
X_org = dataset.loc[:, selected_features]

X_org.head()

### Visualize class distribution

In [None]:
# Histogram of the target values to show how balanced the classes are
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(y_org)
ax.set_xlabel('RainTomorrow')
ax.set_ylabel('Frequency')
ax.set_title('RainTomorrow Distribution')
plt.show()

## Training the model

### Reusable function for training the model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


def evaluate_model(model, X, y):
    """Print accuracy and a classification report for ``model`` on ``(X, y)`` and plot the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True labels for ``X``.
    """
    predictions = model.predict(X)

    print("Accuracy:", accuracy_score(y, predictions))
    print("\nClassification Report:")
    print(classification_report(y, predictions))

    plot_confusion_matrix(y, predictions)


def plot_confusion_matrix(y_true, y_pred, class_names=("No Rain", "Rain")):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        class_names: Tick labels for the classes, in the row/column order of the
            matrix returned by ``confusion_matrix``. Defaults to the labels of the
            rain target, so existing two-argument calls behave as before.
    """
    cm = confusion_matrix(y_true, y_pred)
    tick_labels = list(class_names)
    plt.figure(figsize=(8, 6))
    # fmt="d" prints the raw integer counts in each cell
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()


def get_best_params(model, param_grid, X, y):
    """Grid-search ``param_grid`` for ``model`` and return the best parameter combination.

    The search uses 5-fold cross-validation, F1 as the score, and all available cores.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter names to the candidate values to try.
        X: Feature matrix used for the search.
        y: Labels for ``X``.

    Returns:
        The ``best_params_`` dict of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    return search.fit(X, y).best_params_


def train_and_evaluate_model(model, X, y):
    """Fit ``model`` on an 80/20 train/test split of ``(X, y)`` and report performance on both parts.

    The split is fixed with ``random_state=42``, so repeated calls on the same
    data use the same train and test rows.

    Args:
        model: Unfitted estimator; it is fitted in place on the training part.
        X: Feature matrix.
        y: Labels for ``X``.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    model.fit(X_train, y_train)

    reports = (
        ("Training Performance:", X_train, y_train),
        ("\n\nTest Performance:", X_test, y_test),
    )
    for heading, part_X, part_y in reports:
        print(heading)
        evaluate_model(model, part_X, part_y)


### Train Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

print("Simple Logistic Regression Model:")
# Errors on class 1 are weighted 5x relative to class 0
model = LogisticRegression(random_state=42, class_weight={0: 1, 1: 5})
train_and_evaluate_model(model, X_org, y_org)

### Train Logistic Regression with GridSearchCV

In [None]:
# liblinear supports both the l1 and l2 penalties listed here
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Tune on the training portion only. train_and_evaluate_model() splits with
# test_size=0.2 and random_state=42, so the same arguments here reproduce its
# training rows; the held-out test rows then never influence the chosen parameters.
X_tune, X_holdout, y_tune, y_holdout = train_test_split(X_org, y_org, test_size=0.2, random_state=42)

model = LogisticRegression(random_state=42, class_weight={0: 1, 1: 5})
best_params = get_best_params(model, param_grid, X_tune, y_tune)

print("Best Parameters for Logistic Regression:", best_params)

# Refit a fresh model with the tuned parameters and evaluate it on the usual split
model = LogisticRegression(**best_params, random_state=42, class_weight={0: 1, 1: 5})
print("\nLogistic Regression Model with Best Parameters:")
train_and_evaluate_model(model, X_org, y_org)

### Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

print("Simple Random Forest Model:")
# Shallow, class-weighted forest used as the untuned baseline
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    class_weight={0: 1, 1: 5},
    random_state=42,
)
train_and_evaluate_model(model, X_org, y_org)

### Train Random Forest with GridSearchCV

In [None]:
# 3 x 3 x 3 x 3 = 81 candidate combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Tune on the training portion only. train_and_evaluate_model() splits with
# test_size=0.2 and random_state=42, so the same arguments here reproduce its
# training rows; the held-out test rows then never influence the chosen parameters.
X_tune, X_holdout, y_tune, y_holdout = train_test_split(X_org, y_org, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 5})
best_params = get_best_params(model, param_grid, X_tune, y_tune)

print("Best Parameters for Random Forest:", best_params)

# Refit a fresh forest with the tuned parameters and evaluate it on the usual split
model = RandomForestClassifier(**best_params, random_state=42, class_weight={0: 1, 1: 5})
print("\nRandom Forest Model with Best Parameters:")
train_and_evaluate_model(model, X_org, y_org)

# Bonus

## Pair plot of the selected features

In [None]:
# Numerical columns that were also selected by RFECV, kept in the order of
# numerical_columns. (A set intersection has no stable order, so the plot and
# the later heatmap could change layout between runs.)
selected_set = set(selected_features)
selected_numeric = [col for col in numerical_columns if col in selected_set]
selected_dataset = dataset[selected_numeric + ['RainTomorrow']]
sns.pairplot(selected_dataset, hue='RainTomorrow')
plt.show()

## Shap to explain the model

In [None]:
# shap to explain the model predictions
import shap

# Refit the tuned forest on all selected data for the explanation
model = RandomForestClassifier(**best_params, random_state=42, class_weight={0: 1, 1: 5}, n_jobs=-1)
model.fit(X_org, y_org)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_org)

# Older SHAP releases return one array per class (a list); newer releases return a
# single (samples, features, classes) array, where shap_values[1] would be the
# second sample rather than class 1. Pick the class-1 values in either layout.
if isinstance(shap_values, list):
    rain_shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    rain_shap_values = shap_values[:, :, 1]
else:
    rain_shap_values = shap_values

shap.summary_plot(rain_shap_values, X_org)

## Correlation heatmap

In [None]:
# Pairwise correlations among the selected numerical features and the target
corr = selected_dataset.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Conclusion

In this notebook, I have performed feature engineering and feature selection on the dataset. I have imputed the missing values, encoded the categorical columns, created interaction features, standardized the data, and selected the best features. I have trained a logistic regression and random forest model on the dataset. I have also visualized the class distribution, pair plot of the selected features, and correlation heatmap. I have used SHAP to explain the model.

Other than that, I have also performed a mini challenge where I have removed unnecessary features, handled missing values, and trained a model on the dataset.