In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# --- 1. Load Data from a Link ---
# Point `url` at any CSV that pandas can read directly; the default below
# is the Titanic passenger dataset.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

# Download and parse the CSV into the raw DataFrame used by the next step.
df = pd.read_csv(url)

# Preview: a banner line followed by the first five rows.
print("--- First 5 rows of the loaded data ---", df.head(), sep="\n")

--- First 5 rows of the loaded data ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0       

In [3]:
# --- 2. Adapt for the New Dataset ---
# IMPORTANT: every name in this cell is dataset-specific; edit them to match
# the columns of the CSV loaded above.

# Column to predict.
target_column = 'Survived'

# Columns excluded from the feature matrix (identifiers / high-cardinality text).
features_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Column groups routed to the numeric and categorical preprocessing pipelines.
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Feature matrix X = everything except the target and the dropped columns;
# y = the target column on its own.
X = df.drop(columns=[target_column, *features_to_drop])
y = df[target_column]

# Preview the features that will be analysed.
print("\n--- Features being used for analysis ---", X.head(), sep="\n")




--- Features being used for analysis ---
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S


In [4]:
# --- 3. Data Preprocessing (This part remains mostly the same) ---
# Builds one ColumnTransformer that imputes + scales the numeric columns and
# imputes + one-hot encodes the categorical columns, then applies it to X.
#
# NOTE(review): the preprocessor is fitted on the full dataset here. If a
# train/test evaluation is added later, fit it on the training rows only so
# that imputation/scaling statistics do not leak from held-out data.

# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler (keeps values non-negative, which the chi-squared score
# used in the next step requires).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then expand each category into its own indicator column.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3)
# a dataset dominated by one-hot columns would come back as a SciPy sparse
# matrix, which the later pd.DataFrame(...) construction cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

# Fit the transformers and produce the processed feature matrix.
X_processed = preprocessor.fit_transform(X)

# Recover column names for the processed matrix: the numeric block comes
# first (same order as `numerical_features`), followed by the one-hot
# columns as named by the fitted encoder.
new_categorical_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_feature_names = numerical_features + list(new_categorical_features)

print("\n--- Shape of data after preprocessing ---")
print("Processed data shape:", X_processed.shape)



--- Shape of data after preprocessing ---
Processed data shape: (891, 12)


In [5]:
# --- 4. Apply Feature Selection Method (Chi-Squared Test) ---
# Keep the k processed columns with the highest chi-squared score against
# the target.
k_best_features = 6

# Fit the selector first, then project the processed matrix onto the
# retained columns.
selector = SelectKBest(score_func=chi2, k=k_best_features).fit(X_processed, y)
X_new = selector.transform(X_processed)

# Translate the retained column positions back into readable feature names.
selected_indices = selector.get_support(indices=True)
selected_features = [all_feature_names[position] for position in selected_indices]

# Report how many columns went in, how many came out, and which ones.
print(
    "\n--- Feature Selection Results ---",
    f"Original number of features: {X_processed.shape[1]}",
    f"Reduced number of features: {X_new.shape[1]}",
    f"Selected features: {selected_features}",
    sep="\n",
)


--- Feature Selection Results ---
Original number of features: 12
Reduced number of features: 6
Selected features: ['Fare', 'Pclass_1', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C']


In [6]:
# Wrap the selected feature matrix in a DataFrame labelled with the names of
# the retained columns, then preview its first five rows.
final_df = pd.DataFrame(data=X_new, columns=selected_features)
print("\n--- Final Data Ready for a Model (Top 5 rows) ---", final_df.head(), sep="\n")


--- Final Data Ready for a Model (Top 5 rows) ---
       Fare  Pclass_1  Pclass_3  Sex_female  Sex_male  Embarked_C
0  0.014151       0.0       1.0         0.0       1.0         0.0
1  0.139136       1.0       0.0         1.0       0.0         1.0
2  0.015469       0.0       1.0         1.0       0.0         0.0
3  0.103644       1.0       0.0         1.0       0.0         0.0
4  0.015713       0.0       1.0         0.0       1.0         0.0
