In [14]:
import pandas as pd
import numpy as np


In [3]:
import os
os.chdir("C:\\Users\\faizan\\Documents\\IMLChallenge02")

In [5]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')


In [15]:
# Keep only numeric columns for correlation analysis
train_numeric = train_df.select_dtypes(include=[np.number])

# Correlation analysis for feature selection (example using Pearson correlation)
correlation_threshold = 0.1  # Adjust this threshold based on your analysis
correlation_with_target = train_numeric.corrwith(train_df['price_doc']).abs()
relevant_features = correlation_with_target[correlation_with_target > correlation_threshold].index

In [16]:
relevant_features.shape

(255,)

In [None]:
train_missing_values = train_df[relevant_features].isnull().sum()
train_data_types = train_df[relevant_features].dtypes

test_missing_values = test_df[relevant_features].isnull().sum()
test_data_types = test_df[relevant_features].dtypes

In [7]:
train_df[train_df.isnull().any(axis=1)].head()
train_df = train_df.dropna()

test_df[test_df.isnull().any(axis=1)].head()
test_df = test_df.dropna()

In [20]:
from sklearn.impute import SimpleImputer
X_train = train_df[relevant_features].drop(['price_doc'], axis=1)
y_train = train_df[relevant_features]['price_doc']

# Separate numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with median
imputer_num = SimpleImputer(strategy='mean')
X_train[numerical_cols] = imputer_num.fit_transform(X_train[numerical_cols])
test_df[numerical_cols] = imputer_num.transform(test_df[numerical_cols])

# Impute missing values for categorical features with mode
imputer_cat = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = imputer_cat.fit_transform(X_train[categorical_cols])
test_df[categorical_cols] = imputer_cat.transform(test_df[categorical_cols])

# Check if there are any missing values left
X_train.isnull().sum().sum(), test_df.isnull().sum().sum()

ValueError: at least one array or dtype is required

In [105]:
from sklearn.impute import KNNImputer, SimpleImputer

X_train = train_df.drop(['hospital_death'], axis=1)
y_train = train_df['hospital_death']

# Identify numerical and categorical columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
num_cols = num_cols[num_cols != 'RecordID']
cat_cols = X_train.select_dtypes(include=['object']).columns

# Impute missing values for numerical features with KNN imputer
knn_imputer = KNNImputer(n_neighbors=300)  # You can adjust the number of neighbors
X_train[num_cols] = knn_imputer.fit_transform(X_train[num_cols])
test_df[num_cols] = knn_imputer.transform(test_df[num_cols])


# Impute missing values for categorical features with mode
imputer_cat = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = imputer_cat.fit_transform(X_train[cat_cols])
test_df[cat_cols] = imputer_cat.transform(test_df[cat_cols])

# Check if there are any missing values left
X_train.isnull().sum().sum(), test_df.isnull().sum().sum()


(0, 0)

In [10]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Scale the numerical columns using StandardScaler
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [45]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [11]:
# Perform one-hot encoding for categorical features
X_train_encoded = pd.get_dummies(X_train, columns=cat_cols)
test_df_encoded = pd.get_dummies(test_df, columns=cat_cols)

# Ensure that both training and test data have the same set of columns after one-hot encoding
missing_cols = set(X_train_encoded.columns) - set(test_df_encoded.columns)
for col in missing_cols:
    test_df_encoded[col] = 0

test_df_encoded = test_df_encoded[X_train_encoded.columns]

In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Perform Standard Scaling on numerical columns
scaler = StandardScaler()
X_train_encoded[num_cols] = scaler.fit_transform(X_train_encoded[num_cols])
test_df_encoded[num_cols] = scaler.transform(test_df_encoded[num_cols])

# Perform PCA on numerical columns
pca = PCA(n_components=40)  # You can adjust the number of components as needed
X_train_pca = pca.fit_transform(X_train_encoded[num_cols])
test_df_pca = pca.transform(test_df_encoded[num_cols])

In [128]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

cat_cols = X_train.select_dtypes(include=['object']).columns

# Apply label encoding to categorical columns in both train and test data
for col in cat_cols:
    X_train[col] = label_encoder.fit_transform(X_train[col])
    test_df[col] = label_encoder.transform(test_df[col])


# Ensure that both training and test data have the same set of columns after one-hot encoding
missing_cols = set(X_train_encoded.columns) - set(test_df_encoded.columns)
for col in missing_cols:
    test_df_encoded[col] = 0

test_df_encoded = test_df_encoded[X_train_encoded.columns]

In [58]:
from sklearn.feature_selection import SelectKBest, f_classif

# Define the number of top features to select (you can adjust this)
num_features_to_select = 10

# Initialize a SelectKBest object with the f_classif score function (you can choose another scoring function)
selector = SelectKBest(score_func=f_classif, k=num_features_to_select)

# Fit the selector to your training data
X_train_selected = selector.fit_transform(X_train_encoded, y_train)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X_train_encoded.columns[selected_feature_indices]

# Print the selected feature names
print("Selected Features:", selected_feature_names)

# Use only the selected features for training and testing
X_train_selected = X_train_encoded[selected_feature_names]
test_df_selected = test_df_encoded[selected_feature_names]

Selected Features: Index(['gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache',
       'ventilated_apache', 'd1_spo2_min', 'd1_sysbp_min',
       'd1_sysbp_noninvasive_min', 'd1_temp_min',
       'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob'],
      dtype='object')


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=600, random_state=42)

# Initialize SequentialFeatureSelector for feature selection (forward selection)
feature_selector = SequentialFeatureSelector(rf_model,
                                             k_features='best',
                                             forward=True,
                                             scoring='roc_auc',
                                             cv=3)

# Fit feature selector on the training data
feature_selector.fit(X_train_encoded, y_train)

# Get the selected feature indices
selected_feature_indices = feature_selector.k_feature_idx_

# Select the corresponding features from the training and test data
X_train_selected = X_train_encoded.iloc[:, list(selected_feature_indices)]
test_df_selected = test_df_encoded.iloc[:, list(selected_feature_indices)]

(181507, 2214)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators = 100)
rf_model.fit(X_train_encoded, y_train)

KeyboardInterrupt: 

In [23]:
from sklearn.metrics import roc_auc_score

# Predict the probabilities of class being 1 for the training set
y_train_prob = rf_model.predict_proba(X_train_encoded)[:, 1]

# Evaluate the ROC AUC score on the training data
roc_auc_train = roc_auc_score(y_train, y_train_prob)
roc_auc_train

# Predict the probabilities of class being 1 for the test set
y_test_prob = rf_model.predict_proba(test_df_encoded)[:, 1]

In [24]:
roc_auc_train

1.0

In [12]:
# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'RecordID': test_df['RecordID'],
    'hospital_death': y_test_prob
})

In [13]:
# Display the first few rows of the submission dataframe
submission_df.head()
# Save the submission dataframe to a CSV file
submission_df.to_csv('prediction_rf.csv', index=False)

In [14]:
submission_df.shape

(30000, 2)

In [15]:
del train_df
del test_df