# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings("ignore")

def calculate_correlation(feature):
    """Record the correlation between ``feature`` and the encoded target.

    The value of ``feature.corr(target_variable_series)`` is stored in the
    module-level ``correlation_scores`` dict under ``feature.name``.
    Nothing is returned.
    """
    score = feature.corr(target_variable_series)
    correlation_scores[feature.name] = score

def calculate_information_gain(feature):
    """Record the mutual information between ``feature`` and the target.

    For non-object features, the estimate returned by
    ``mutual_info_classif`` is stored in the module-level
    ``information_gain_scores`` dict under ``feature.name``. The stored
    value is the array returned for the single input column (one element).
    Object-dtype features are not scored and are stored as ``0.0``.
    Nothing is returned.
    """
    if feature.dtype == object:
        information_gain_scores[feature.name] = 0.0
        return

    # scikit-learn expects X as (n_samples, n_features) but y as a 1-D
    # (n_samples,) vector, so only the feature is reshaped into a column.
    feature_column = feature.values.reshape(-1, 1)
    target_values = target_variable_series.values.ravel()

    # A fixed seed keeps the estimate reproducible between runs, matching
    # the random_state=42 used for the model and the CV splitter in this file.
    information_gain_scores[feature.name] = mutual_info_classif(
        feature_column, target_values, random_state=42
    )

def calculate_chi2(feature):
    """Record the chi-squared statistic of ``feature`` against the target.

    A contingency table of the feature's values versus the module-level
    ``target_variable`` is built with ``pd.crosstab``; the statistic
    returned by ``chi2_contingency`` is stored in the module-level
    ``chi2_scores`` dict under ``feature.name``. Nothing is returned.
    """
    # Score the series that was actually passed in instead of looking the
    # column up again in the global ``features`` frame by name, so the
    # function also works for a series that is not a column of that frame.
    contingency_table = pd.crosstab(feature, target_variable)
    statistic, _p_value, _dof, _expected = chi2_contingency(contingency_table.values)
    chi2_scores[feature.name] = statistic

# Load dataset
df = pd.read_csv("https://github.com/andvise/DM_Assignment/blob/main/train_data.csv?raw=true.csv")

# Separate features and target variable
features = df.drop("label", axis=1)
target_variable = df["label"]

# Numeric view of the target used by the scoring helpers. String labels are
# label-encoded; a target that is not of object dtype is used as-is, so
# ``target_variable_series`` is defined in both cases.
if target_variable.dtype == object:
    le = LabelEncoder()
    target_variable_encoded = le.fit_transform(target_variable)
    # Keep the original index so Series.corr pairs rows with the features.
    target_variable_series = pd.Series(target_variable_encoded, index=target_variable.index)
else:
    target_variable_series = target_variable

# Feature Importance Scores (dictionaries filled in by the helper functions)
correlation_scores = {}
information_gain_scores = {}
chi2_scores = {}

# Calculate scores for all features
for col in features.columns:
    calculate_correlation(features[col])
    calculate_information_gain(features[col])
    calculate_chi2(features[col])

# Combine scores (correlation + information gain)
# NOTE(review): the signed correlation is added as-is, so a negative
# correlation lowers a feature's combined score — confirm this is intended.
combined_scores = {k: correlation_scores[k] + information_gain_scores[k] for k in correlation_scores}

# The information gain may be stored as a one-element array or as a plain
# scalar, so every combined score is flattened to a float before comparing.
feature_scores = {name: float(np.ravel(score)[0]) for name, score in combined_scores.items()}

# Get max score for threshold; NaN scores (undefined correlation) are ignored
max_score = np.nanmax(list(feature_scores.values()))
values = np.arange(0, max_score - 0.005, 0.04)
if values.size == 0:
    raise ValueError(
        f"No threshold to evaluate: the highest combined feature score is {max_score}"
    )

output = []

# Preprocess features
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out fold of each split has influenced the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(features)
y = target_variable

# The model and the splitter do not depend on the threshold, so they are
# created once. Both use a fixed integer seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Several thresholds can select exactly the same features; with the fixed
# seeds above a repeated fit would reproduce the same scores, so results are
# remembered per feature subset.
cv_results_by_subset = {}

for threshold in values:
    important_features = [f for f, score in feature_scores.items() if score >= threshold]
    X = X_scaled[:, [features.columns.get_loc(f) for f in important_features]]

    subset = tuple(important_features)
    if subset not in cv_results_by_subset:
        # An empty param_grid evaluates the estimator exactly as configured.
        grid_search = GridSearchCV(estimator=rf, param_grid={}, cv=cv, scoring='accuracy')
        grid_search.fit(X, y)
        cv_results_by_subset[subset] = (
            grid_search.best_score_,
            grid_search.cv_results_['mean_test_score'],
        )

    # Get the best CV results
    best_accuracy, mean_test_scores = cv_results_by_subset[subset]
    output.append([threshold, len(important_features), best_accuracy, mean_test_scores])

best_accuracy = max(row[2] for row in output)
print(f'\n\nBest Accuracy = {best_accuracy}\n\n')