In [40]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [41]:
# Seed the legacy global NumPy RNG so every draw below is reproducible.
# The order of the draws matters: changing it changes the generated values.
np.random.seed(0)

# Base sample of 1000 rows: three continuous columns, two categorical
# columns and one integer column.
data = {
    'A': np.random.normal(loc=0, scale=1, size=1000),       # standard normal
    'B': np.random.exponential(scale=1, size=1000),         # exponential, scale 1
    'C': np.random.uniform(low=0, high=100, size=1000),     # uniform on [0, 100)
    'type': np.random.choice(['X', 'Y', 'Z'], size=1000),   # categorical label
    'class': np.random.choice(['Alpha', 'Beta', 'Gamma'], size=1000),  # categorical label
    'D': np.random.randint(low=1, high=100, size=1000),     # integers from 1 to 99
}

# Two extra rows whose 'A' values (-100 and -200) lie far outside the
# normal sample; the other columns are drawn the same way as above.
new_data = {
    'A': [-100, -200],
    'B': np.random.exponential(scale=1, size=2),
    'C': np.random.uniform(low=0, high=100, size=2),
    'type': np.random.choice(['X', 'Y', 'Z'], size=2),
    'class': np.random.choice(['Alpha', 'Beta', 'Gamma'], size=2),
    'D': np.random.randint(low=1, high=100, size=2),
}
new_df = pd.DataFrame(new_data)

# Stack the extra rows beneath the base sample and renumber the index from 0.
df = pd.concat([pd.DataFrame(data), new_df], ignore_index=True)

In [42]:
# Count the missing entries in each column and display the per-column totals.
missing_values = df.isna().sum()
missing_values

A        0
B        0
C        0
type     0
class    0
D        0
dtype: int64

In [43]:
# Keep only complete rows: a row is dropped if any of its columns is missing.
df = df.dropna(axis=0, how='any')

In [44]:
# Standardise column 'A', then select the rows whose z-score is more than
# 3 standard deviations away from the mean in either direction.
z_scores = stats.zscore(df['A'])
outliers = df[np.abs(z_scores) > 3]
outliers

Unnamed: 0,A,B,C,type,class,D
1000,-100.0,1.32977,85.434882,Y,Alpha,9
1001,-200.0,0.113921,53.17634,Z,Alpha,18


In [45]:
# (rows, columns) of df; the outlier rows selected above have not been removed from it.
df.shape

(1002, 6)

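Z-score: $z = (X - \mu) / \sigma$, where $\mu$ is the mean and $\sigma$ the standard deviation of the column. Rows with $|z| > 3$ are treated as outliers.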

In [46]:
# Keep the rows whose z-score for 'A' lies within [-3, 3]. This reuses the
# z_scores array computed in the earlier outlier-detection cell.
df_no_outliers = df[np.abs(z_scores) <= 3]
df_no_outliers.shape

(1000, 6)

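IQR (interquartile range) method: $\mathrm{IQR} = Q_3 - Q_1$. Values outside $[Q_1 - 1.5\,\mathrm{IQR},\; Q_3 + 1.5\,\mathrm{IQR}]$ are treated as outliers.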

In [47]:
# First and third quartiles of column 'A', and the interquartile range between them.
Q1, Q3 = (df['A'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below the first quartile and above the third.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose 'A' value falls inside the fences (both ends inclusive).
# Note: this rebinds df_no_outliers, replacing any earlier value of that name.
df_no_outliers = df[df['A'].between(lower_bound, upper_bound)]
df_no_outliers.shape

(993, 6)