In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
np.random.seed(0)
data = {
    'A': np.random.normal(0, 1, 1000),  # Normal distribution
    'B': np.random.exponential(1, 1000),  # Exponential distribution
    'C': np.random.uniform(0, 100, 1000),  # Uniform distribution
    'type': np.random.choice(['X', 'Y', 'Z'], size=1000),  # Categorical data
    'class': np.random.choice(['Alpha', 'Beta', 'Gamma'], size=1000),
    'D': np.random.randint(1, 100, size=1000)  # Integer range
}

df = pd.DataFrame(data)

# Create a new row with the desired values, including NaN values
new_data = {
    'A': [-100, -200, np.nan, np.nan],
    'B': np.random.exponential(1, 4),
    'C': np.random.uniform(0, 100, 4),
    'type': np.random.choice(['X', 'Y', 'Z'], size=4),
    'class': np.random.choice(['Alpha', 'Beta', 'Gamma'], size=4),
    'D': np.random.randint(1, 100, size=4)
}

new_df = pd.DataFrame(new_data)

# Concatenate the new DataFrame to the original DataFrame
df = pd.concat([df, new_df], ignore_index=True)

In [3]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values

A        2
B        0
C        0
type     0
class    0
D        0
dtype: int64

In [4]:
# Remove rows with missing values
df = df.dropna()

In [5]:
df

Unnamed: 0,A,B,C,type,class,D
0,1.764052,1.725432,98.097940,X,Gamma,72
1,0.400157,1.205736,24.584930,X,Beta,58
2,0.978738,2.146245,71.050533,Z,Gamma,76
3,2.240893,3.398454,50.511344,Y,Gamma,58
4,1.867558,1.490534,47.877264,Z,Beta,11
...,...,...,...,...,...,...
997,0.094192,1.789692,16.035315,Y,Beta,10
998,-1.147611,0.831483,96.410094,Z,Alpha,25
999,-0.358114,0.095376,45.125196,Y,Beta,73
1000,-100.000000,1.329770,61.668505,Y,Alpha,80


In [6]:
df.describe

<bound method NDFrame.describe of                A         B          C type  class   D
0       1.764052  1.725432  98.097940    X  Gamma  72
1       0.400157  1.205736  24.584930    X   Beta  58
2       0.978738  2.146245  71.050533    Z  Gamma  76
3       2.240893  3.398454  50.511344    Y  Gamma  58
4       1.867558  1.490534  47.877264    Z   Beta  11
...          ...       ...        ...  ...    ...  ..
997     0.094192  1.789692  16.035315    Y   Beta  10
998    -1.147611  0.831483  96.410094    Z  Alpha  25
999    -0.358114  0.095376  45.125196    Y   Beta  73
1000 -100.000000  1.329770  61.668505    Y  Alpha  80
1001 -200.000000  0.113921  82.885771    Y  Gamma   8

[1002 rows x 6 columns]>

In [7]:
# Validate data types
df['Y'] = pd.to_numeric(df['B'], errors='coerce')

# Remove duplicates
df = df.drop_duplicates()

# Summary of the cleaned data
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1001
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       1002 non-null   float64
 1   B       1002 non-null   float64
 2   C       1002 non-null   float64
 3   type    1002 non-null   object 
 4   class   1002 non-null   object 
 5   D       1002 non-null   int32  
 6   Y       1002 non-null   float64
dtypes: float64(4), int32(1), object(2)
memory usage: 58.7+ KB
None


In [8]:
# Find outliers using z-score
z_scores = stats.zscore(df['A'])
outliers = df[(z_scores > 3) | (z_scores < -3)] # data point is more than 3 standard deviations above the mean.
outliers

Unnamed: 0,A,B,C,type,class,D,Y
1000,-100.0,1.32977,61.668505,Y,Alpha,80,1.32977
1001,-200.0,0.113921,82.885771,Y,Gamma,8,0.113921


In [9]:
df.shape

(1002, 7)

Z_Score, z = (X - μ) / σ

In [10]:
# Remove outliers and create a new DataFrame
df_no_outliers = df[(z_scores <= 3) & (z_scores >= -3)]
df_no_outliers.shape

(1000, 7)

IQR

In [11]:
# Calculate the Interquartile Range (IQR) for column 'A'
Q1 = df['A'].quantile(0.25)
Q3 = df['A'].quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers and create a new DataFrame
df_no_outliers = df[(df['A'] >= lower_bound) & (df['A'] <= upper_bound)]
df_no_outliers.shape

(993, 7)