# Data Cleaning And Preprocessing Steps

## Cleaning Steps

In [None]:
1. Reading a CSV file
import pandas as pd
# Load the CSV file into a DataFrame ('filename.csv' is presumably a placeholder path).
df = pd.read_csv('filename.csv')

In [None]:
2. Checking the shape of the DataFrame
# df.shape is a (number_of_rows, number_of_columns) tuple.
print(df.shape)

In [None]:
3. Checking the data types of the columns
# Show the dtype pandas inferred for every column.
print(df.dtypes)

In [None]:
4. Checking the number of missing values in each column
# Count the missing (NaN/None) entries per column.
print(df.isna().sum())

In [None]:
5. Dropping columns
# Remove the listed columns from df in place.
df.drop(columns=['column1', 'column2'], inplace=True)

In [None]:
6. Renaming columns
# Rename columns in place using an {old: new} mapping.
df.rename({'old_name': 'new_name'}, axis='columns', inplace=True)

In [None]:
7. Changing the data type of a column:
# Cast the column to floating point and store it back under the same name.
df['column'] = df['column'].astype(float)

In [None]:
8A. Handling missing data (dropping rows with missing values):
# Drop, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
8B. Handling missing data (imputing missing values with the median)
# Fill missing values with each column's median.
# numeric_only=True restricts the median to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as df contains a non-numeric column.
# Non-numeric columns are therefore left unfilled.
df.fillna(df.median(numeric_only=True), inplace=True)

In [None]:
8C. Handling missing data (imputing missing values with the mean)
# Fill missing values with each column's mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as df contains a non-numeric column.
# Non-numeric columns are therefore left unfilled.
df.fillna(df.mean(numeric_only=True), inplace=True)

In [None]:
8D. Handling missing data (imputing missing values with a constant)
# Replace every missing value in df with the constant 0, in place.
df.fillna(value=0, inplace=True)

In [None]:
9. Handling Outliers

# Box Plot
import seaborn as sns
# Use the same `df_diabetes` name as the IQR detection cell, and pass the
# column explicitly as `x=` so it is not interpreted as the positional
# `data` argument by current seaborn versions.
sns.boxplot(x=df_diabetes['bmi'])



In [None]:
9B. Z-score

import numpy as np
outliers = []
def detect_outliers_zscore(data, thres=3):
    """Return the values in ``data`` whose absolute z-score exceeds ``thres``.

    The z-score of a value is ``(value - mean) / std``, where ``std`` is the
    population standard deviation computed by ``np.std``.

    Args:
        data: Sequence or array of numeric values.
        thres: Z-score cutoff; a value is reported when ``abs(z) > thres``.
            Defaults to 3.

    Returns:
        A new list holding the outlying values in their original order.
        The list is empty when ``data`` is empty or all values are equal.
    """
    if np.size(data) == 0:
        return []
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
        # Identical values: the z-score is undefined, so nothing is an outlier.
        return []
    # Build a fresh list per call so results never accumulate between calls.
    return [value for value in data if np.abs((value - mean) / std) > thres]
sample_outliers = detect_outliers_zscore(sample)
# Remove every detected outlier in a single pass. Deleting inside a loop that
# always starts from the unmodified `sample` would keep only the last removal
# and would leave `a` undefined when no outliers are found.
a = np.asarray(sample)[~np.isin(sample, sample_outliers)]

In [None]:
9C. Interquartile Range (IQR)

outliers = []
def detect_outliers_iqr(data, factor=1.5):
    """Return the values in ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        data: Iterable of numeric values.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5.

    Returns:
        A new list holding the outlying values in ascending order.
        The list is empty when ``data`` is empty.
    """
    ordered = sorted(data)
    if not ordered:
        return []
    q1, q3 = np.percentile(ordered, [25, 75])
    iqr = q3 - q1
    lwr_bound = q1 - factor * iqr
    upr_bound = q3 + factor * iqr
    # Build a fresh list per call so results never accumulate between calls.
    return [value for value in ordered if value < lwr_bound or value > upr_bound]
# Driver code
sample_outliers = detect_outliers_iqr(sample)
# Remove every detected outlier in a single pass. Deleting inside a loop that
# always starts from the unmodified `sample` would keep only the last removal
# and would leave `a` undefined when no outliers are found.
a = np.asarray(sample)[~np.isin(sample, sample_outliers)]

In [None]:
# Detection
# IQR
# Calculate the upper and lower limits
Q1 = df_diabetes['bmi'].quantile(0.25)
Q3 = df_diabetes['bmi'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR

# Collect the index labels of the outlier rows. DataFrame.drop(index=...)
# expects labels, not the positions np.where returns, so selecting from
# df_diabetes.index also works when the index is not the default 0..n-1 range.
upper_array = df_diabetes.index[df_diabetes['bmi'] >= upper]
lower_array = df_diabetes.index[df_diabetes['bmi'] <= lower]

# Removing the outliers in a single drop. The union de-duplicates labels that
# satisfy both conditions (possible when IQR == 0); dropping them in two
# separate calls would raise KeyError on the second call.
df_diabetes.drop(index=upper_array.union(lower_array), inplace=True)

# Print the new shape of the DataFrame
print("New Shape: ", df_diabetes.shape)

## Feature Preprocessing and transformation

## Dealing with categoricals

In [None]:
12. Handling categorical data (creating dummy variables)
# Replace 'categorical_column' with one indicator (dummy) column per category
# and rebind df to the resulting DataFrame.
df = pd.get_dummies(df, columns=['categorical_column'])

In [None]:
13. Handling categorical data (label encoding)
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct categories, then store each row's integer code
# in a new 'encoded_column'.
encoder = LabelEncoder()
encoder.fit(df['categorical_column'])
df['encoded_column'] = encoder.transform(df['categorical_column'])

In [None]:
14. Handling numerical data (binning)
# Split the numeric column into 5 equal-width bins, labelled from lowest
# to highest.
df['binned_column'] = pd.cut(
    x=df['numerical_column'],
    bins=5,
    labels=['very_low', 'low', 'medium', 'high', 'very_high'],
)

In [None]:
15. Handling numerical data (scaling to a range)
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the column (kept 2-D via the double brackets), then write
# the rescaled values to 'scaled_column'.
scaler = MinMaxScaler().fit(df[['numerical_column']])
df['scaled_column'] = scaler.transform(df[['numerical_column']])

In [None]:
16. Handling numerical data (standardization)
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the column (kept 2-D via the double brackets), then write
# the standardized values to 'standardized_column'.
scaler = StandardScaler().fit(df[['numerical_column']])
df['standardized_column'] = scaler.transform(df[['numerical_column']])

In [None]:
17. Handling datetime data (converting to datetime format)
# Parse the column into pandas datetime values, overwriting the original
# column; this is what makes the `.dt` accessor available on it.
df['datetime_column'] = pd.to_datetime(df['datetime_column'])

In [None]:
18. Handling datetime data (extracting year)
# Requires 'datetime_column' to already hold datetime values (see pd.to_datetime).
df['year'] = df['datetime_column'].dt.year

In [None]:
19. Handling datetime data (extracting month)
# Requires 'datetime_column' to already hold datetime values (see pd.to_datetime).
df['month'] = df['datetime_column'].dt.month

In [None]:
20. Handling datetime data (extracting day)
# Requires 'datetime_column' to already hold datetime values (see pd.to_datetime).
df['day'] = df['datetime_column'].dt.day

## Feature Selection

In [None]:
21. Feature Selection
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# The first 8 columns are the input features; the last column ('class') is the target.
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression(solver='lbfgs')
# n_features_to_select must be passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, so the positional form RFE(model, 3) raises TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

In [None]:
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Load the digits dataset
digits = load_digits()
# Flatten each image into a 1-D feature vector (one row per image)
X = digits.images.reshape((len(digits.images), -1))
y = digits.target

# Create the RFE object and rank each pixel
# n_features_to_select=1 with step=1 eliminates one feature per iteration
# until a single feature remains.
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
# Reshape the per-feature ranking back to the shape of a single image
ranking = rfe.ranking_.reshape(digits.images[0].shape)