In [None]:
import pandas as pd

file_path = 'housing.csv'
df = pd.read_csv(file_path)

print("\nDataset Information:")
print(df.info())

print("\nStatistical Information of Numerical Columns:")
print(df.describe())

print("\nUnique Labels Count for 'Ocean Proximity' column:")
print(df['ocean_proximity'].value_counts())

print("\nAttributes with Missing Values:")
missing_values = df.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]
print(columns_with_missing_values)



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None

Statistical Information of Numerical Columns:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861  

In [11]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Load the dataset
file_path = "diabetes.csv"
df = pd.read_csv(file_path)

# 1. Identify missing values
missing_values = df.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Columns with missing values: {missing_columns}")

# Handle missing values (Mean imputation for numeric columns)
df_numeric = df.select_dtypes(include=['number']).copy()
imputer = SimpleImputer(strategy="mean")
df_numeric.iloc[:, :] = imputer.fit_transform(df_numeric)
df[df_numeric.columns] = df_numeric

# 2. Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# Encode categorical columns using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for inverse transform if needed

# 3. Remove outliers using IQR method
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df_numeric < (Q1 - 1.5 * IQR)) | (df_numeric > (Q3 + 1.5 * IQR))).any(axis=1)]

# 4. Apply Min-Max Scaling and Standardization
min_max_scaler = MinMaxScaler()
df_minmax = pd.DataFrame(min_max_scaler.fit_transform(df_numeric), columns=df_numeric.columns)

standard_scaler = StandardScaler()
df_standard = pd.DataFrame(standard_scaler.fit_transform(df_numeric), columns=df_numeric.columns)

# Display Results
print("\nProcessed Diabetes Dataset (Min-Max Scaled):")
print(df_minmax.head())

print("\nProcessed Diabetes Dataset (Standard Scaled):")
print(df_standard.head())


Columns with missing values: []
Categorical columns: ['Gender', 'CLASS']

Processed Diabetes Dataset (Min-Max Scaled):
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.627034   0.000237  0.508475  0.109375  0.050378  0.264901  0.407767   
1  0.918648   0.000452  0.101695  0.104167  0.070529  0.264901  0.359223   
2  0.524406   0.000634  0.508475  0.109375  0.050378  0.264901  0.407767   
3  0.849812   0.001160  0.508475  0.109375  0.050378  0.264901  0.407767   
4  0.629537   0.000452  0.220339  0.171875  0.050378  0.264901  0.475728   

         TG       HDL       LDL      VLDL       BMI  
0  0.044444  0.226804  0.114583  0.011461  0.173913  
1  0.081481  0.092784  0.187500  0.014327  0.139130  
2  0.044444  0.226804  0.114583  0.011461  0.173913  
3  0.044444  0.226804  0.114583  0.011461  0.173913  
4  0.051852  0.061856  0.177083  0.008596  0.069565  

Processed Diabetes Dataset (Standard Scaled):
         ID  No_Pation       AGE      Urea        Cr 

In [12]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Step 2: Load the dataset (Ensure it's in the working directory)
file_path = "adult.csv"  # Update the file path if needed
df = pd.read_csv(file_path)

# Step 3: Handling Missing Values (Replacing '?' with NaN)
df.replace("?", np.nan, inplace=True)

# Step 4: Check missing values before imputation
missing_values_before = df.isna().sum()
print("\nColumns with missing values before imputation:")
print(missing_values_before[missing_values_before > 0])

# Step 5: Handling Numerical Missing Values with Mean
num_imputer = SimpleImputer(strategy="mean")
df[df.select_dtypes(include=['number']).columns] = num_imputer.fit_transform(df.select_dtypes(include=['number']))

# Step 6: Handling Categorical Missing Values with Mode
cat_imputer = SimpleImputer(strategy="most_frequent")
df[df.select_dtypes(include=['object']).columns] = cat_imputer.fit_transform(df.select_dtypes(include=['object']))

# Step 7: Verify no missing values remain
print("\nMissing values after imputation:")
print(df.isna().sum())

# Step 8: Handling Categorical Data (Encoding using Label Encoding)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for inverse transform if needed

# Step 9: Identifying Outliers using IQR and Removing Them
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Step 10: Data Normalization and Standardization
# Min-Max Normalization
min_max_scaler = MinMaxScaler()
df_minmax = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df.columns)

# Standardization (Z-score normalization)
standard_scaler = StandardScaler()
df_standard = pd.DataFrame(standard_scaler.fit_transform(df), columns=df.columns)

# Step 11: Display Processed Data
print("\nProcessed Adult Income Dataset (Min-Max Scaled):")
print(df_minmax.head())

print("\nProcessed Adult Income Dataset (Standard Scaled):")
print(df_standard.head())

# Step 12: Display categorical columns that were encoded
categorical_columns = df.select_dtypes(include=['int']).columns  # Categorical columns after encoding
print("\nCategorical columns encoded:")
print(categorical_columns)



Columns with missing values before imputation:
workclass         2799
occupation        2809
native-country     857
dtype: int64

Missing values after imputation:
age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

Processed Adult Income Dataset (Min-Max Scaled):
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.344262        0.0  0.188277   0.555556         0.363636        0.333333   
1  0.114754        0.0  0.881156   1.000000         0.454545        0.666667   
2  0.147541        0.0  0.169156   0.555556         0.363636        0.666667   
3  0.672131        0.0  0.708251   0.555556         0.363636        0.333333   
4  0.131148        0.0  0.475807   0.333333         0.7272