# PreProcessing 

In [None]:
## LOADING THE LIB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as se
from sklearn.impute import SimpleImputer

In [None]:
## LOADING THE DATA SET
# Read "dataset.csv" (a relative path) into the `data` DataFrame that the
# inspection, outlier and scaling cells below operate on.
data = pd.read_csv("dataset.csv")

In [None]:
##  Check Basic Information

# Only the last bare expression of a notebook cell is echoed (and none are
# when run as a script), so every result is printed explicitly.
# `data.info()` writes its own report and needs no print().
data.info()  # Check data types and missing values
print(data.head())  # View the first few rows
print(data.tail())  # View the last few rows
print(data.shape)  # Get the number of rows and columns
print(data.columns)  # List column names

In [None]:
## CHECK FOR MISSING VALUES
# Count missing values in each column. `isnull()` is an alias of `isna()`,
# so a single call is enough; print it so the result is always shown.
missing_per_column = data.isna().sum()
print(missing_per_column)

### HANDLING MISSING VALUES

In [None]:

## Dropping the whole column

def drop_column(df, column_name):
    """
    Returns a copy of `df` without the column `column_name`.

    The input DataFrame is left untouched; a KeyError is raised by pandas
    if the column does not exist.
    """
    return df.drop(labels=[column_name], axis="columns")


# Function 1: Drop rows (CCA)

def complete_case_analysis(df):
    """
    Complete Case Analysis: keeps only the rows that have no missing value
    in any column and returns them as a new DataFrame.
    """
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows



# Function 2: Arbitrary Imputation

def arbitrary_imputation(df, column, value):
    """
    Replaces missing values in a column with an arbitrary value.

    Parameters:
        df: DataFrame to impute; the column is replaced on `df` itself.
        column: Name of the column whose missing values are filled.
        value: Constant written into every missing cell of the column.

    Returns:
        The same DataFrame object, with `column` imputed.
    """
    # Assign the filled column back instead of calling
    # `df[column].fillna(..., inplace=True)`: that chained in-place call acts
    # on a temporary Series and does not update `df` under pandas
    # Copy-on-Write.
    df[column] = df[column].fillna(value)
    return df


# Function 3: End of Distribution Imputation

def end_of_distribution_imputation(df, column, factor=3, method="iqr"):
    """
    Replaces missing values with an extreme value from the distribution.

    Parameters:
        df: DataFrame to impute; the column is replaced on `df` itself.
        column: Name of the column to impute.
        factor: Multiplier applied to the spread measure (default 3).
        method: "iqr" (default) uses Q3 + factor*IQR, suited to skewed
            distributions; "gaussian" uses Mean + factor*Std, suited to
            roughly normal distributions.

    Returns:
        The same DataFrame object. Non-numeric columns are left unchanged.

    Raises:
        ValueError: If `method` is neither "iqr" nor "gaussian".
    """
    if method not in ("iqr", "gaussian"):
        raise ValueError(f"Unknown method {method!r}; expected 'iqr' or 'gaussian'.")

    # is_numeric_dtype also accepts float32, int32 and nullable numeric
    # dtypes, which a literal ['int64', 'float64'] whitelist would skip.
    if not pd.api.types.is_numeric_dtype(df[column]):
        return df

    values = df[column]
    if method == "iqr":
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        extreme_value = q3 + factor * (q3 - q1)
    else:
        extreme_value = values.mean() + factor * values.std()

    # Assign back rather than using a chained `fillna(..., inplace=True)`,
    # which does not modify `df` under pandas Copy-on-Write.
    df[column] = values.fillna(extreme_value)
    return df



# Function 4: Random Sample Imputation
def random_sample_imputation(df, column):
    """
    Replaces missing values in a column with randomly sampled values from the same column.

    Parameters:
        df: DataFrame to impute; missing cells are filled on `df` itself.
        column: Name of the column to impute.

    Returns:
        The same DataFrame object, with the missing cells of `column`
        filled by values drawn (with replacement) from its observed values.

    Raises:
        ValueError: If the column has missing values but no observed value
            to sample from.
    """
    # Work on the `df` argument (not a module-level frame) and size the
    # sample from the column being imputed.
    missing_mask = df[column].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return df

    observed = df[column].dropna().to_numpy()
    if observed.size == 0:
        raise ValueError(
            f"Column {column!r} has no observed values to sample from."
        )

    df.loc[missing_mask, column] = np.random.choice(
        observed, size=n_missing, replace=True
    )
    return df



# Function 5: Simple Imputer (Mean, Median, Mode)
def simple_imputer(df, column):
    """
    Asks the user on stdin whether to impute `column` with its mean,
    median, or mode, then fills the column using sklearn's SimpleImputer.

    Any answer other than 1, 2 or 3 falls back to the mean. The column is
    replaced on `df` itself and `df` is returned.
    """
    # Menu answer -> SimpleImputer strategy name.
    strategy_by_choice = {
        '1': "mean",
        '2': "median",
        '3': "most_frequent",
    }

    print("Choose imputation method: [1] Mean, [2] Median, [3] Mode")
    choice = input("Enter choice (1/2/3): ")

    strategy = strategy_by_choice.get(choice)
    if strategy is None:
        print("Invalid choice. Defaulting to mean.")
        strategy = "mean"

    imputer = SimpleImputer(strategy=strategy)
    # Double brackets: the imputer expects a 2-D (single-column) input.
    df[column] = imputer.fit_transform(df[[column]])
    return df

########################################################
########################################################



#### Outlier

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of one numeric column to eyeball outliers. Uses `data`, the
# DataFrame loaded from the CSV; `df` is not defined until the encoding
# cell further down.
sns.boxplot(x=data["numerical_column"])
plt.show()


In [None]:
# Drop rows whose "num_col" value lies outside the 1.5*IQR fences.
Q1 = data["num_col"].quantile(0.25)
Q3 = data["num_col"].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Negating the outlier mask (instead of testing "inside the fences") keeps
# rows where "num_col" is NaN, since both comparisons are False for NaN.
is_outlier = (data["num_col"] < lower_fence) | (data["num_col"] > upper_fence)
data = data[~is_outlier]


### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on one numeric column and overwrite the column with
# its scaled values.
scaler = StandardScaler()
column_frame = data[["numerical_column"]]  # 2-D selection, as the scaler expects
data["numerical_column"] = scaler.fit_transform(column_frame)


### Encoding

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# Create sample dataset
# NOTE: this rebinds `data` (the DataFrame loaded from the CSV in the
# earlier cells) to a plain dict for the purposes of this demo.
data = {
    "Product": ["Laptop", "Smartphone", "Tablet", "Laptop", "Smartphone", "Tablet", "Headphones", "Headphones"],
    "Brand": ["Dell", "Apple", "Samsung", "HP", "Apple", "Samsung", "Sony", "Bose"],
    "Category": ["Electronics", "Electronics", "Electronics", "Electronics", "Electronics", "Electronics", "Audio", "Audio"],
    "Price_Range": ["High", "High", "Medium", "Medium", "High", "Medium", "Low", "Medium"],
    "Sales": [500, 600, 300, 400, 650, 320, 200, 280]  # Target Variable
}
df = pd.DataFrame(data)

# One-Hot Encoding (OHE)
# drop_first=True drops one dummy per feature to avoid redundant columns.
df_ohe = pd.get_dummies(df, columns=["Product", "Brand", "Category", "Price_Range"], drop_first=True)

# Label Encoding
# The same encoder is refit for each column, so after these two lines
# `label_enc` only holds the classes of "Brand".
label_enc = LabelEncoder()
df["Product_Label"] = label_enc.fit_transform(df["Product"])
df["Brand_Label"] = label_enc.fit_transform(df["Brand"])

# Ordinal Encoding (for ordered categories)
# Explicit category order: Low -> 0, Medium -> 1, High -> 2.
ordinal_enc = OrdinalEncoder(categories=[["Low", "Medium", "High"]])
# fit_transform returns an (n, 1) array; flatten it to a 1-D column.
df["Price_Ordinal"] = ordinal_enc.fit_transform(df[["Price_Range"]]).ravel()

# Display results
# Only the frames actually built above are combined (`df_binary` was never
# defined). "Sales" is dropped from the one-hot frame because `df` already
# carries it, which avoids a duplicated column.
df_encoded = pd.concat([df, df_ohe.drop(columns=["Sales"])], axis=1)
print(df_encoded.head())
