In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from scipy import stats

# Load your dataset (uncomment and replace with your dataset path)
# df = pd.read_csv("C:\\Users\\SACHIN KUMAR\\OneDrive\\Desktop\\data analytics\\3rd\\20191226-items.csv")

# Sample dataset creation for demonstration
data = {
    'id': [1, 2, 2, 3, 4, 5, 6, None, 8, 9],
    'text': ['Good', 'Bad', 'Bad', None, 'Average', 'Excellent', 'Poor', 'Good', 'Bad', 'Average'],
    'score': [5, None, 3, 4, 2, 5, 1, 3, 4, 5],
    'category': ['A', 'B', 'B', 'C', 'D', 'A', None, 'A', 'B', 'C']
}
df = pd.DataFrame(data)

# 1. Handling Missing Data

# Display missing values before cleaning
print("Missing values before cleaning:\n", df.isnull().sum())

# Advanced imputation using SimpleImputer for numeric columns
numeric_imputer = SimpleImputer(strategy='mean')
df['score'] = numeric_imputer.fit_transform(df[['score']])

# Impute missing values for categorical column using the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
df['category'] = categorical_imputer.fit_transform(df[['category']]).ravel()  # Use ravel() to flatten the array

# Drop rows where 'text' is missing
df.dropna(subset=['text'], inplace=True)

# 2. Duplicate Removal
initial_shape = df.shape
df.drop_duplicates(inplace=True)
print(f"Removed {initial_shape[0] - df.shape[0]} duplicates.")

# 3. Standardization
# Standardize text to lower case
df['text'] = df['text'].str.lower()

# Standardize categorical data: Encode categories to numerical values if necessary
df['category'] = pd.Categorical(df['category']).codes

# 4. Outlier Detection
# Using IQR method to detect outliers in 'score'
Q1 = df['score'].quantile(0.25)
Q3 = df['score'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
outlier_condition = (df['score'] < lower_bound) | (df['score'] > upper_bound)
df = df[~outlier_condition]

# Final cleaned data
print("\nCleaned Data:\n", df)

# Optionally, save the cleaned data to a new CSV file
# df.to_csv('cleaned_dataset.csv', index=False)


Missing values before cleaning:
 id          1
text        1
score       1
category    1
dtype: int64
Removed 0 duplicates.

Cleaned Data:
     id       text     score  category
0  1.0       good  5.000000         0
1  2.0        bad  3.555556         1
2  2.0        bad  3.000000         1
4  4.0    average  2.000000         3
5  5.0  excellent  5.000000         0
6  6.0       poor  1.000000        -1
7  NaN       good  3.000000         0
8  8.0        bad  4.000000         1
9  9.0    average  5.000000         2
