# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Step 1: Data Gathering and Cleaning
# Each source file is read into its own frame so it can be inspected
# separately before merging.
df1 = pd.read_csv('dataset1.csv')
df2 = pd.read_csv('dataset2.csv')
# ... Load other datasets

# Data cleaning steps as needed
# ...

# Step 2: Data Preprocessing
# Stack the datasets row-wise. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it the row labels of df1 and df2 repeat, which
# makes label-based selection and index alignment ambiguous later on.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# Handle missing values: drop every row that contains at least one NaN.
df = df.dropna()

# Check for outliers: draw one box plot per numeric column.
# The numeric columns are computed here because the feature lists further
# down the script are not defined yet at this point.
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    plt.boxplot(df[col])
    plt.title(col)
    plt.show()


# Handle outliers: keep only rows whose z-score on `column_name` is below
# `threshold` standard deviations.
column_name = 'feature_data'
threshold = 3
col_std = df[column_name].std()
if col_std > 0:
    z_scores = np.abs((df[column_name] - df[column_name].mean()) / col_std)
    filtered_df = df[z_scores < threshold]
else:
    # A zero (constant column) or NaN (fewer than two rows) standard deviation
    # yields NaN z-scores, and `NaN < threshold` is False for every row, which
    # would discard the whole dataset. Treat that case as "no outliers".
    z_scores = pd.Series(0.0, index=df.index)
    filtered_df = df

# Continue the pipeline with the filtered rows; previously the filtered frame
# was computed but never used, so the outliers stayed in `df`.
df = filtered_df


# Feature engineering: add a combined feature and remove unused columns.
df['new_feature'] = df['feature1'] + df['feature2']
df = df.drop(['irrelevant_feature1', 'irrelevant_feature2'], axis=1)

# Remove fully duplicated rows.
df = df.drop_duplicates()

# Drop highly correlated features.
# The absolute pairwise correlation is computed over numeric columns only.
# The target column is left out so it can never be selected for removal
# (it is still needed when X and y are built).
correlation_threshold = 0.9
correlation_matrix = (
    df.select_dtypes(include=['number'])
    .drop(columns=['target_variable'], errors='ignore')
    .corr()
    .abs()
)

# Keep only the strict upper triangle so every pair is considered once and the
# diagonal (self-correlation of 1.0) is ignored. The builtin `bool` is used
# because the `np.bool` alias was removed from NumPy.
upper = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# A column is dropped when it is highly correlated with any earlier column.
correlated_to_drop = [
    col for col in upper.columns if (upper[col] > correlation_threshold).any()
]

df = df.drop(columns=correlated_to_drop)


# Separate the predictors from the target first, so the feature lists below
# can only contain columns that actually exist in X.
X = df.drop('target_variable', axis=1)
y = df['target_variable']

# Select numerical features
numerical_features = X.select_dtypes(include=['number'])

# Select categorical features
categorical_features = X.select_dtypes(include=['object'])

# Define the numerical and categorical feature column names
num_features = numerical_features.columns.tolist()
cat_features = categorical_features.columns.tolist()


# Split the data into development (training) and test sets exactly once.
# Splitting twice with different seeds overwrote X_test / y_test and left
# X_train overlapping the final test set, leaking test rows into training.
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Aliases: the rest of the script refers to the training split under both
# names, so they must point at the same rows.
X_train, y_train = X_dev, y_dev



# Build the preprocessing transformer: numeric columns are standardised and
# categorical columns are one-hot encoded (categories not seen during fitting
# are ignored at transform time instead of raising).
numeric_step = (StandardScaler(), num_features)
categorical_step = (OneHotEncoder(handle_unknown='ignore'), cat_features)
preprocess = make_column_transformer(numeric_step, categorical_step)

# Learn the preprocessing parameters from the development split only and
# transform that split in the same call.
X_dev_preprocessed = preprocess.fit_transform(X_dev)

# Reuse the already-fitted transformer on the held-out test split.
X_test_preprocessed = preprocess.transform(X_test)



# Step 3: Data Visualization
# Explore the data through visualizations
# ...

# Step 4: Trying Multiple ML Models and Hyperparameter Tuning
# Instantiate the models
logreg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()

# Define the hyperparameter grids to search over
logreg_params = {'C': [0.1, 1, 10]}
dtree_params = {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}

# Perform hyperparameter tuning using 5-fold GridSearchCV.
# Every model is fitted on the same preprocessed development split together
# with its matching labels (y_dev), so features and labels are row-aligned,
# categorical columns are encoded, and the cross-validation scores of the
# three models are directly comparable.
logreg_grid = GridSearchCV(logreg, logreg_params, cv=5)
logreg_grid.fit(X_dev_preprocessed, y_dev)

dtree_grid = GridSearchCV(dtree, dtree_params, cv=5)
dtree_grid.fit(X_dev_preprocessed, y_dev)

rf_grid = GridSearchCV(rf, rf_params, cv=5)
rf_grid.fit(X_dev_preprocessed, y_dev)

# Print the best hyperparameters and evaluation scores
grids = {
    "Logistic Regression": logreg_grid,
    "Decision Tree": dtree_grid,
    "Random Forest": rf_grid,
}
for model_name, grid in grids.items():
    print(f"{model_name}:")
    print("Best Hyperparameters:", grid.best_params_)
    print("Cross-validation Score:", grid.best_score_)

# Select the best-performing model based on the cross-validation results
# rather than assuming a particular model wins. On a tie the first entry of
# `grids` is kept.
best_grid = max(grids.values(), key=lambda grid: grid.best_score_)

# GridSearchCV (refit=True by default) has already refitted best_estimator_
# on the full development split, so no further fit call is needed.
best_model = best_grid.best_estimator_

# Evaluate the best model on the held-out test set, using the same
# preprocessing that was applied to the training data.
accuracy = best_model.score(X_test_preprocessed, y_test)
print("Test Accuracy:", accuracy)