<a href="https://colab.research.google.com/github/AyeChanAung205/-business-data/blob/main/MS_MFU_DataPreparation_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing and Cleaning
## 1. Handling Missing Values

> Missing data can bias results and reduce model performance.
> Methods to handle missing values:
*   Remove rows with missing values (appropriate when only a few rows are affected).
*   Impute values (mean, median, mode, or advanced techniques).



In [None]:
import pandas as pd
import numpy as np

# Toy frame in which both columns contain gaps (NaN)
data = dict(
    Age=[25, 30, np.nan, 35, 40, np.nan],
    Salary=[50000, 60000, 70000, 80000, np.nan, 100000],
)

df = pd.DataFrame(data)
print("Original Data:\n", df)

# Strategy A: discard every row that has at least one NaN
df_dropped = df.dropna(axis=0, how="any")
print("\nData after dropping missing values:\n", df_dropped)

# Strategy B: impute each gap with the mean of its own column
# (the mean skips NaN, so only observed values contribute)
column_means = df.mean(numeric_only=True)
df_filled = df.fillna(value=column_means)
print("\nData after filling missing values with mean:\n", df_filled)


Original Data:
     Age    Salary
0  25.0   50000.0
1  30.0   60000.0
2   NaN   70000.0
3  35.0   80000.0
4  40.0       NaN
5   NaN  100000.0

Data after dropping missing values:
     Age   Salary
0  25.0  50000.0
1  30.0  60000.0
3  35.0  80000.0

Data after filling missing values with mean:
     Age    Salary
0  25.0   50000.0
1  30.0   60000.0
2  32.5   70000.0
3  35.0   80000.0
4  40.0   72000.0
5  32.5  100000.0


## 2. Handling Duplicate Records
> Duplicate records can distort analysis and lead to incorrect conclusions.



In [None]:
# Toy records in which the rows for ID 2 and ID 4 each appear twice
df_dup = pd.DataFrame(
    {
        'ID': [1, 2, 2, 3, 4, 4],
        'Name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'David'],
        'Salary': [50000, 60000, 60000, 70000, 80000, 80000],
    }
)

print("Original Data:\n", df_dup)

# Flag every row that repeats an earlier, fully identical row,
# then keep only the unflagged rows (the first occurrence of each)
is_repeat = df_dup.duplicated(keep='first')
df_no_duplicates = df_dup[~is_repeat]
print("\nData after removing duplicates:\n", df_no_duplicates)


Original Data:
    ID     Name  Salary
0   1    Alice   50000
1   2      Bob   60000
2   2      Bob   60000
3   3  Charlie   70000
4   4    David   80000
5   4    David   80000

Data after removing duplicates:
    ID     Name  Salary
0   1    Alice   50000
1   2      Bob   60000
3   3  Charlie   70000
4   4    David   80000


## 3. Handling Noisy Data
> Noisy data is data that contains errors, outliers, or inconsistencies.
> Methods to handle noisy data:
*   Smoothing techniques (moving averages, binning)
*   Removing outliers using statistical methods (Z-score, IQR)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Small synthetic series; 120 and 130 sit far away from the remaining values
df_noisy = pd.DataFrame({'Value': [10, 12, 14, 13, 11, 120, 10, 9, 130, 12]})

# Quartiles and interquartile range of the single 'Value' column
value_col = df_noisy['Value']
Q1, Q3 = value_col.quantile(0.25), value_col.quantile(0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR outside the quartiles; values beyond them count as outliers
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose value lies within the fences (both ends inclusive)
inside_fences = value_col.between(lower_bound, upper_bound)
df_cleaned = df_noisy[inside_fences]
print("Cleaned Data without outliers:\n", df_cleaned)

# Side-by-side box plots: raw values on the left, filtered values on the right
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

sns.boxplot(y=df_noisy['Value'], ax=ax_before)
ax_before.set_title("Before Cleaning")

sns.boxplot(y=df_cleaned['Value'], ax=ax_after)
ax_after.set_title("After Removing Outliers")

plt.show()


## 4. Converting Categorical Data to Numerical

> Machine learning models require numerical input.
> Methods:
*   Label Encoding (for ordinal categories)
*   One-Hot Encoding (for non-ordinal categories)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Toy column of city names, some of them repeated
df_cat = pd.DataFrame({'City': ['New York', 'London', 'Paris', 'New York', 'Paris']})

# Label encoding: learn the distinct cities and map each row to its integer code
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(df_cat['City'])
df_cat['City_Label'] = city_codes

# One-hot encoding: expand 'City' into one indicator column per distinct city.
# 'City_Label' was added to df_cat above, so it is carried into df_onehot as well.
df_onehot = pd.get_dummies(data=df_cat, columns=['City'])

print("Label Encoding:\n", df_cat)
print("\nOne-Hot Encoding:\n", df_onehot)


## 5. Normalization & Standardization
*   Normalization (Min-Max Scaling): Rescales values between 0 and 1.
*   Standardization (Z-Score Scaling): Centers data around mean 0 with standard deviation 1.


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Two numeric columns on very different scales
df_scale = pd.DataFrame(
    {
        'Age': [25, 30, 35, 40, 45],
        'Salary': [50000, 60000, 70000, 80000, 90000],
    }
)

# Min-max scaling: fit on the frame and rescale it, then restore the column labels
scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(df_scale)
df_scaled = pd.DataFrame(minmax_values, columns=df_scale.columns)

# Standardization: same pattern with the z-score scaler
standard_scaler = StandardScaler()
zscore_values = standard_scaler.fit_transform(df_scale)
df_standardized = pd.DataFrame(zscore_values, columns=df_scale.columns)

print("\nNormalized Data (Min-Max Scaling):\n", df_scaled)
print("\nStandardized Data (Z-Score):\n", df_standardized)



Normalized Data (Min-Max Scaling):
     Age  Salary
0  0.00    0.00
1  0.25    0.25
2  0.50    0.50
3  0.75    0.75
4  1.00    1.00

Standardized Data (Z-Score):
         Age    Salary
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


## 6. Feature Engineering
> Helps improve model performance by creating meaningful new features.
> Examples:
*   Polynomial Features
*   Log Transformation
*   Feature Interaction

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Two small input features
df_features = pd.DataFrame(
    {
        'x1': [1, 2, 3, 4, 5],
        'x2': [2, 3, 4, 5, 6],
    }
)

# Degree-2 expansion without the constant (bias) column
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_values = poly.fit_transform(df_features)

# Hand-written labels for the five generated columns
# NOTE(review): assumes the transformer emits them in this order; confirm with poly.get_feature_names_out()
poly_columns = ['x1', 'x2', 'x1^2', 'x1*x2', 'x2^2']
df_poly = pd.DataFrame(poly_values, columns=poly_columns)

print("\nPolynomial Features:\n", df_poly)