<a href="https://colab.research.google.com/github/741yagna/ml/blob/main/day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Small synthetic patient dataset. Every column contains at least one np.nan
# so the later cells have something to impute.
challenge_data = {
    'temperature': [98.6, 101.2, 99.5, np.nan, 97.8, 102.1, 98.9, np.nan, 100.5, 99.1],
    'blood_pressure': ['120/80', '130/85', '115/75', np.nan, '125/82', '140/90', '118/78', '128/84', np.nan, '122/79'],
    'patient_group': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', np.nan],
    'treatment_days': [5, 10, 3, 7, np.nan, 12, 4, 8, 6, 9]
}

challenge_df = pd.DataFrame(challenge_data)
print("🎯 CHALLENGE DATASET:")
print(challenge_df)

# Summarise the frame that was just built. No other dataset is defined in this
# notebook, so the summary must use challenge_df rather than a second frame.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly instead of being wrapped in print() (which would add "None").
print("\n📋 Basic Info:")
challenge_df.info()

🎯 CHALLENGE DATASET:
   temperature blood_pressure patient_group  treatment_days
0         98.6         120/80             A             5.0
1        101.2         130/85             B            10.0
2         99.5         115/75             A             3.0
3          NaN            NaN             C             7.0
4         97.8         125/82             B             NaN
5        102.1         140/90             A            12.0
6         98.9         118/78             C             4.0
7          NaN         128/84             B             8.0
8        100.5            NaN             A             6.0
9         99.1         122/79           NaN             9.0


In [None]:
# Per-column count of missing entries, followed by the dtype of each column.
missing_counts = challenge_df.isna().sum()
column_dtypes = challenge_df.dtypes

print("❓ MISSING VALUES:", missing_counts, sep="\n")
print("\n📊 DATA TYPES:", column_dtypes, sep="\n")

❓ MISSING VALUES:
temperature       2
blood_pressure    2
patient_group     1
treatment_days    1
dtype: int64

📊 DATA TYPES:
temperature       float64
blood_pressure     object
patient_group      object
treatment_days    float64
dtype: object


In [None]:
# Fill the gaps in 'blood_pressure' with the column's most frequent value.
imputer_bp = SimpleImputer(strategy='most_frequent')
bp_filled = imputer_bp.fit_transform(challenge_df[['blood_pressure']])
# The imputer output is collapsed to 1-D before being written back as a column.
challenge_df['blood_pressure'] = bp_filled.ravel()

# Report what is still missing across the whole frame.
print("\n✅ MISSING VALUES AFTER IMPUTATION (BLOOD PRESSURE):")
print(challenge_df.isna().sum())


✅ MISSING VALUES AFTER IMPUTATION (BLOOD PRESSURE):
temperature       0
blood_pressure    0
patient_group     1
treatment_days    1
dtype: int64


In [None]:
# Replace missing 'temperature' readings with the mean of the observed ones.
imputer_temp = SimpleImputer(strategy='mean')
temperature_column = challenge_df[['temperature']]
challenge_df['temperature'] = imputer_temp.fit_transform(temperature_column)

# Report what is still missing across the whole frame.
print("\n✅ MISSING VALUES AFTER IMPUTATION (TEMPERATURE):")
print(challenge_df.isna().sum())


✅ MISSING VALUES AFTER IMPUTATION (TEMPERATURE):
temperature       0
blood_pressure    2
patient_group     1
treatment_days    1
dtype: int64


In [None]:
# Columns stored with the generic object dtype are treated as categorical.
categorical_cols = challenge_df.select_dtypes(include='object').columns
print("📝 CATEGORICAL COLUMNS:", categorical_cols.tolist())

📝 CATEGORICAL COLUMNS: ['blood_pressure', 'patient_group']


In [None]:
# Continuous measurements to standardize.
# The one-hot indicator columns (patient_group_A/B/C) are deliberately left out:
#   * they are only created by the 'patient_group' encoding cell further down,
#     so selecting them here raises KeyError when the notebook is run top to
#     bottom on a fresh kernel;
#   * standardizing 0/1 indicators replaces them with arbitrary values and
#     loses the one-hot coding.
numerical_features = ['temperature', 'treatment_days']

# Standardization (mean=0, std=1).
# StandardScaler disregards NaN when fitting and keeps it when transforming, so
# a still-missing 'treatment_days' entry passes through unchanged.
scaler = StandardScaler()
challenge_df[numerical_features] = scaler.fit_transform(challenge_df[numerical_features])


print("⚖️ AFTER STANDARDIZATION:")
print(challenge_df[numerical_features].describe())

⚖️ AFTER STANDARDIZATION:
        temperature  treatment_days  patient_group_A  patient_group_B  \
count  1.000000e+01    1.000000e+01        10.000000     1.000000e+01   
mean   1.110223e-17    1.110223e-17         0.000000    -2.220446e-17   
std    1.054093e+00    1.054093e+00         1.054093     1.054093e+00   
min   -1.587766e+00   -1.566335e+00        -1.000000    -6.546537e-01   
25%   -6.330308e-01   -7.090840e-01        -1.000000    -6.546537e-01   
50%   -8.820921e-02   -2.116669e-02         0.000000    -6.546537e-01   
75%    4.903394e-01    6.244173e-01         1.000000     9.819805e-01   
max    1.982113e+00    1.862669e+00         1.000000     1.527525e+00   

       patient_group_C  
count        10.000000  
mean          0.000000  
std           1.054093  
min          -0.500000  
25%          -0.500000  
50%          -0.500000  
75%          -0.500000  
max           2.000000  


In [None]:
# Every column with a numeric dtype is selected for scaling.
numerical_cols = challenge_df.select_dtypes(include=np.number).columns

# Fit a fresh StandardScaler on those columns and write the scaled values back.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(challenge_df[numerical_cols])
challenge_df[numerical_cols] = scaled_values

# Preview the first rows of the updated frame.
print("\n📊 DataFrame after Feature Scaling:")
display(challenge_df.head())


📊 DataFrame after Feature Scaling:


Unnamed: 0,temperature,blood_pressure,treatment_days,patient_group_A,patient_group_B,patient_group_C
0,-0.9236023,120/80,-0.8043341,1.0,-0.654654,-0.5
1,1.234929,130/85,1.100668,-1.0,1.527525,-0.5
2,-0.1764184,115/75,-1.566335,1.0,-0.654654,-0.5
3,-1.179791e-14,115/75,-0.04233338,-1.0,-0.654654,2.0
4,-1.587766,125/82,-3.383963e-16,-1.0,1.527525,-0.5


In [None]:
# Replace missing 'treatment_days' entries with the mean of the observed ones.
imputer_days = SimpleImputer(strategy='mean')
treatment_days_column = challenge_df[['treatment_days']]
challenge_df['treatment_days'] = imputer_days.fit_transform(treatment_days_column)

# Report what is still missing across the whole frame.
print("\n✅ MISSING VALUES AFTER IMPUTATION (TREATMENT DAYS):")
print(challenge_df.isna().sum())


✅ MISSING VALUES AFTER IMPUTATION (TREATMENT DAYS):
temperature        0
blood_pressure     0
treatment_days     0
patient_group_A    0
patient_group_B    0
patient_group_C    0
dtype: int64


In [None]:
# Expand 'blood_pressure' into one indicator column per distinct value,
# each named with a 'bp' prefix. The result is a new frame; challenge_df
# itself is not modified here.
challenge_df_encoded = pd.get_dummies(
    challenge_df,
    prefix='bp',
    columns=['blood_pressure'],
)

print("🔥 ONE-HOT ENCODED DATA (including blood_pressure):")
display(challenge_df_encoded.head())

# Per-column count of values that are still missing after encoding.
print("\nRemaining missing values:")
print(challenge_df_encoded.isna().sum())

🔥 ONE-HOT ENCODED DATA (including blood_pressure):


Unnamed: 0,temperature,treatment_days,patient_group_A,patient_group_B,patient_group_C,bp_115/75,bp_118/78,bp_120/80,bp_122/79,bp_125/82,bp_128/84,bp_130/85,bp_140/90
0,98.6,5.0,1.0,0.0,0.0,False,False,True,False,False,False,False,False
1,101.2,10.0,0.0,1.0,0.0,False,False,False,False,False,False,True,False
2,99.5,3.0,1.0,0.0,0.0,True,False,False,False,False,False,False,False
3,99.7125,7.0,0.0,0.0,1.0,True,False,False,False,False,False,False,False
4,97.8,,0.0,1.0,0.0,False,False,False,False,True,False,False,False



Remaining missing values:
temperature        0
treatment_days     1
patient_group_A    0
patient_group_B    0
patient_group_C    0
bp_115/75          0
bp_118/78          0
bp_120/80          0
bp_122/79          0
bp_125/82          0
bp_128/84          0
bp_130/85          0
bp_140/90          0
dtype: int64


In [None]:
# Impute missing values in 'patient_group' with the mode before encoding,
# so the encoder never sees NaN as a category.
imputer_group = SimpleImputer(strategy='most_frequent')
challenge_df['patient_group'] = imputer_group.fit_transform(challenge_df[['patient_group']]).flatten()

# Perform one-hot encoding on 'patient_group'.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_patient_group = encoder.fit_transform(challenge_df[['patient_group']])
# Reuse challenge_df's index for the encoded frame. pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would only line up while
# challenge_df still has its default index; otherwise NaN-padded rows appear.
encoded_patient_group_df = pd.DataFrame(
    encoded_patient_group,
    columns=encoder.get_feature_names_out(['patient_group']),
    index=challenge_df.index,
)

# Drop the original 'patient_group' column and concatenate the encoded columns.
# NOTE: this cell is not re-runnable as-is; after the first run the
# 'patient_group' column no longer exists in challenge_df.
challenge_df = challenge_df.drop('patient_group', axis=1)
challenge_df = pd.concat([challenge_df, encoded_patient_group_df], axis=1)

print("\nTransformed DataFrame after One-Hot Encoding 'patient_group':")
display(challenge_df.head())
print("\nRemaining missing values:")
print(challenge_df.isnull().sum())


Transformed DataFrame after One-Hot Encoding 'patient_group':


Unnamed: 0,temperature,blood_pressure,treatment_days,patient_group_A,patient_group_B,patient_group_C
0,98.6,120/80,5.0,1.0,0.0,0.0
1,101.2,130/85,10.0,0.0,1.0,0.0
2,99.5,115/75,3.0,1.0,0.0,0.0
3,99.7125,115/75,7.0,0.0,0.0,1.0
4,97.8,125/82,,0.0,1.0,0.0



Remaining missing values:
temperature        0
blood_pressure     0
treatment_days     1
patient_group_A    0
patient_group_B    0
patient_group_C    0
dtype: int64


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler instance. It is only constructed here; nothing in this cell
# fits or applies it.
minmax_scaler = MinMaxScaler()
# NOTE(review): the call below is disabled. It refers to `df_encoded`, which is
# not defined in the visible part of this notebook -- confirm the intended frame
# (possibly challenge_df_encoded) before enabling it.
# df_encoded[numerical_features] = minmax_scaler.fit_transform(df_encoded[numerical_features])

In [None]:
# For demonstration, every column of challenge_df is used as a feature.
X = challenge_df

# There is no real target in this dataset, so an all-zero placeholder of the
# same length is used purely to make the split call work. A real analysis
# would substitute the actual target variable.
y = pd.Series([0] * len(X))  # Dummy target

# 80% train / 20% test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_train, n_features = X_train.shape
n_test = X_test.shape[0]

print("📊 DATA SPLIT:")
print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")
print(f"Features: {n_features} columns")

📊 DATA SPLIT:
Training set: 8 samples
Test set: 2 samples
Features: 6 columns


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups handled by the preprocessing pipeline.
numerical_features = ['temperature', 'treatment_days']
# blood_pressure is handled as a categorical column in this pipeline.
categorical_features = ['blood_pressure']

# Numeric branch: mean imputation followed by standardization.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding
# that ignores categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its branch. Columns not listed above (such as the
# already one-hot encoded patient_group columns) are passed through untouched.
column_branches = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='passthrough')

# The transformer is only constructed here; it is not fitted in this cell.
print("🏗️ PREPROCESSING PIPELINE CREATED!")

🏗️ PREPROCESSING PIPELINE CREATED!
