In [3]:
# Step 1: Import the required libraries
import pandas as pd
import numpy as np

# Step 2: Create the data
# Ages 20 through 30 in steps of one, followed by a single much larger value (50).
data = pd.DataFrame({'Age': list(range(20, 31)) + [50]})

In [5]:
# Step 3: Calculate the mean and standard deviation
# np.std is called with its default ddof=0, i.e. the population standard deviation.
ages = data['Age']
mean = np.mean(ages)
std = np.std(ages)

# Step 4: Calculate the Z-Score
# Each age's distance from the mean, expressed in standard deviations.
data['Z-Score'] = ages.sub(mean).div(std)
data

Unnamed: 0,Age,Z-Score
0,20,-0.938954
1,21,-0.806396
2,22,-0.673838
3,23,-0.54128
4,24,-0.408721
5,25,-0.276163
6,26,-0.143605
7,27,-0.011047
8,28,0.121512
9,29,0.25407


In [6]:
# Step 5: Print the data
# Emit the whole frame between two separator rules in a single print call.
separator = "----------------------------------------"
print(separator, f"Here is the data with outliers:\n {data}", separator, sep="\n")


----------------------------------------
Here is the data with outliers:
     Age   Z-Score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628
11   50  3.037793
----------------------------------------


In [7]:
# Step 6: Print the outliers
# A point is an outlier when it lies more than 3 standard deviations from the mean
# in EITHER direction, so both tails are tested (a check of `Z-Score > 3` alone
# would silently miss unusually small values).
z_score_outliers = data[(data['Z-Score'] > 3) | (data['Z-Score'] < -3)]
print(f"Here are the outliers based on the z-score threshold, 3:\n {z_score_outliers}")
print("----------------------------------------")


Here are the outliers based on the z-score threshold, 3:
     Age   Z-Score
11   50  3.037793
----------------------------------------


In [8]:
# Step 7: Remove the outliers
# Keep only the rows whose Z-score lies inside [-3, 3]. Both tails are filtered so
# that extreme low values are dropped as well as extreme high ones.
data = data[(data['Z-Score'] >= -3) & (data['Z-Score'] <= 3)]

# Step 8: Print the data without outliers
print(f"Here is the data without outliers:\n {data}")

Here is the data without outliers:
     Age   Z-Score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628


In [9]:
# Import libraries
import numpy as np
from scipy import stats

# Sample data
data = [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 110.0]

# Absolute Z-score of every data point
z_scores = np.abs(stats.zscore(data))

# Points whose absolute Z-score exceeds the threshold are treated as outliers
threshold = 2.5
is_outlier = z_scores > threshold
outliers = np.where(is_outlier)[0]

separator = "----------------------------------------"

# Show the original data
print(separator)
print("Data:", data)
print(separator)

# Report the flagged positions and the values found there
print("Indices of Outliers:", outliers)
print("Outliers:", [data[idx] for idx in outliers])

# Drop the flagged points, keeping the remaining values in their original order
data = [value for value, flagged in zip(data, is_outlier) if not flagged]
print(separator)
print("Data without outliers:", data)

----------------------------------------
Data: [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 110.0]
----------------------------------------
Indices of Outliers: [9]
Outliers: [110.0]
----------------------------------------
Data without outliers: [2.5, 2.7, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0]


In [10]:
# Step 1: Import the required libraries
import pandas as pd
import numpy as np

# Step 2: Create the data
data = pd.DataFrame({'Age': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 50]})

# Step 3: Calculate the first and third quartile
# Series.quantile is used rather than np.percentile(..., interpolation=...):
# NumPy deprecated the `interpolation` keyword in 1.22 in favour of `method`,
# whereas pandas accepts `interpolation` regardless of the NumPy version.
# 'midpoint' averages the two data points that surround the requested quantile.
Q1 = data['Age'].quantile(0.25, interpolation='midpoint')
Q3 = data['Age'].quantile(0.75, interpolation='midpoint')

# Step 4: Calculate the IQR
IQR = Q3 - Q1

# Step 5: Calculate the lower and upper bound
# Fences sit 1.5 * IQR outside the quartiles; values beyond them are outliers.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Step 6: Print the data
print("----------------------------------------")
print(f"Here is the data with outliers:\n {data}")
print("----------------------------------------")
# Step 7: Print the outliers
print(f"Here are the outliers based on the IQR threshold:\n {data[(data['Age'] < lower_bound) | (data['Age'] > upper_bound)]}")
print("----------------------------------------")
# Step 8: Remove the outliers
# Values exactly on a fence are kept (inclusive comparison on both sides).
data = data[(data['Age'] >= lower_bound) & (data['Age'] <= upper_bound)]

# Step 9: Print the data without outliers
print(f"Here is the data without outliers:\n {data}")

----------------------------------------
Here is the data with outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
11   50
----------------------------------------
Here are the outliers based on the IQR threshold:
     Age
11   50
----------------------------------------
Here is the data without outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30


In [11]:
# Import library
from sklearn.cluster import KMeans

# Sample data
data = [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]

# Create a K-means model with two clusters (normal and outlier).
# random_state pins the centroid initialisation so every run is reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(data)

# Predict cluster labels
labels = kmeans.predict(data)

# Identify the outlier cluster.
# K-means numbers its clusters arbitrarily, so "label == 1" is not guaranteed to
# be the outlier group. Choose it by a rule instead: the cluster with the fewest
# members, and on a tie in size (as in this sample, 3 vs 3) the cluster whose
# centroid lies farthest from the origin.
# NOTE(review): the tie-break encodes the assumption that the larger-valued group
# is the anomalous one in this demo - confirm for real data.
outlier_label = min(
    range(kmeans.n_clusters),
    key=lambda cluster: (
        (labels == cluster).sum(),
        -(kmeans.cluster_centers_[cluster] ** 2).sum(),
    ),
)

# Identify outliers based on cluster labels
outliers = [point for point, label in zip(data, labels) if label == outlier_label]

# print data
print("Data:", data)
print("Outliers:", outliers)
# Remove outliers
data = [point for point, label in zip(data, labels) if label != outlier_label]
print("Data without outliers:", data)

Data: [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]
Outliers: [[30, 30], [31, 31], [32, 32]]
Data without outliers: [[2, 2], [3, 3], [3, 4]]


### What are Outliers?

Outliers are data points that significantly differ from other observations in a dataset. They can either be extremely high or extremely low values that don't fit into the general pattern of the data. Outliers can distort statistical analyses, affect machine learning models, and lead to inaccurate conclusions if not treated properly.

In practice, outliers are identified based on different rules, such as:
- **Statistical distance**: Data points that fall outside a certain range of the majority of the data (e.g., values greater than 3 standard deviations from the mean).
- **Visual analysis**: Outliers are often easy to spot in plots like box plots or scatter plots.
- **Domain knowledge**: Sometimes, a value may seem extreme statistically, but based on domain knowledge, it is valid (or vice versa).

### Types of Outliers

1. **Univariate Outliers**: 
   - Outliers identified by looking at a single variable. For example, if most sales are between 100 and 500, and suddenly there's a value of 5000, it’s likely an outlier in that particular column.
   
2. **Multivariate Outliers**: 
   - These outliers are detected when looking at multiple variables together. A combination of values that doesn’t fit the overall pattern across multiple variables can indicate a multivariate outlier. For example, a customer buying 100 laptops may be usual if that customer is a company, but if they are an individual consumer, this could be a multivariate outlier.

3. **Global vs. Contextual Outliers**:
   - **Global Outliers**: Data points that are extreme compared to the entire dataset.
   - **Contextual Outliers**: Data points that look normal in one context but are unusual in another. For example, a temperature of 35°C is normal in summer but could be an outlier in winter.

### Causes of Outliers

1. **Measurement Error**: The value is incorrect due to mistakes in data collection, entry, or sensor errors (e.g., recording a person's height as 1700 cm instead of 170 cm).
   
2. **Natural Variation**: Some outliers may occur due to the natural variability in the system (e.g., extreme weather conditions, rare events).
   
3. **Sampling Error**: Sometimes, outliers occur because the sample selected isn't representative of the overall population, leading to some extreme values.

4. **Human Error**: Mistakes during data entry or transcription (e.g., typing an extra zero or putting a decimal in the wrong place).

### Detecting Outliers

#### 1. **Visual Methods**:
   - **Box Plots**: A box plot visualizes the distribution of data and marks any points that fall outside the "whiskers" as potential outliers.
   - **Scatter Plots**: Scatter plots can show outliers, especially when comparing two variables.
   - **Histograms**: Outliers can sometimes be visible in histograms as isolated bars away from the rest of the data.
   
   Example using a box plot:
   ```python
   import pandas as pd
   import matplotlib.pyplot as plt

   # Sample data
   data = {'values': [10, 12, 14, 15, 16, 17, 100]}
   df = pd.DataFrame(data)

   # Create a boxplot
   plt.boxplot(df['values'])
   plt.show()
   ```

#### 2. **Statistical Methods**:
   - **Z-Score**: Measures how far away a data point is from the mean, in terms of standard deviations.
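     - Any value with a Z-score greater than 3 or less than -3 is often considered an outlier.
     - In very small samples the Z-score is bounded: with the population standard deviation (the `stats.zscore` default) it cannot exceed √(n − 1) in magnitude. For the 7-value example above, the value 100 only reaches |z| ≈ 2.44, so the `z > 3` check below flags nothing; use a lower threshold or the IQR rule for data this small.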
   
   ```python
   from scipy import stats
   import numpy as np

   # Calculate Z-scores
   z = np.abs(stats.zscore(df['values']))

   # Identify outliers (z > 3)
   print(df[z > 3])
   ```

   - **IQR (Interquartile Range)**: The range between the 1st quartile (Q1, the 25th percentile) and the 3rd quartile (Q3, the 75th percentile). Any point below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR is considered an outlier.

   ```python
   Q1 = df['values'].quantile(0.25)
   Q3 = df['values'].quantile(0.75)
   IQR = Q3 - Q1

   # Define outlier limits
   lower_bound = Q1 - 1.5 * IQR
   upper_bound = Q3 + 1.5 * IQR

   # Identify outliers
   outliers = df[(df['values'] < lower_bound) | (df['values'] > upper_bound)]
   print(outliers)
   ```

#### 3. **Machine Learning Methods**:
   - **Isolation Forest**: An unsupervised learning algorithm that identifies outliers by randomly isolating data points.
   
   - **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**: A clustering method that can find points that don’t belong to any dense cluster (outliers).
   
   - **Autoencoders**: A neural network approach that can be trained to learn normal data patterns and identify deviations (outliers).

### Handling Outliers

1. **Remove Outliers**: 
   - If the outliers are due to errors or are irrelevant, you can remove them from the dataset. Be cautious, as removing legitimate data could bias your results.

   ```python
   # Remove outliers based on IQR
   df_clean = df[(df['values'] >= lower_bound) & (df['values'] <= upper_bound)]
   ```

2. **Cap/Trim Outliers**: 
   - Limit outliers to a fixed boundary: any value above the upper threshold or below the lower threshold is set to that threshold. This is capping (also known as winsorization); trimming, by contrast, drops the extreme values altogether.

   ```python
   # Cap values at the lower and upper bound
   df['values'] = np.where(df['values'] < lower_bound, lower_bound, df['values'])
   df['values'] = np.where(df['values'] > upper_bound, upper_bound, df['values'])
   ```

3. **Impute with Statistical Values**: 
   - Replace outliers with more reasonable values, like the mean or median of the rest of the data.

   ```python
   # Replace outliers with median
   median_value = df['values'].median()
   df['values'] = np.where(z > 3, median_value, df['values'])
   ```

4. **Use Robust Models**: 
   - Some machine learning models, like tree-based models (e.g., Random Forests, XGBoost), are more robust to outliers than others like linear regression.

5. **Transform Data**: 
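   - Apply transformations like log, square root, or Box-Cox to reduce the effect of outliers by compressing the range of large values. The log is only defined for positive inputs; the `+ 1` in the example below keeps zero values valid.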

   ```python
   df['log_values'] = np.log(df['values'] + 1)
   ```

### When to Handle Outliers?

- **Model Sensitivity**: If your model is highly sensitive to extreme values (e.g., linear regression, k-means), you should address outliers.
  
- **Data Integrity**: If outliers reflect measurement errors or invalid data, removing them is justified. However, if outliers represent real but rare events (like fraudulent transactions), they may be valuable to retain.

- **Business Requirements**: Depending on the problem you're solving, outliers may or may not be relevant. In financial analysis, for instance, large deviations may be meaningful and shouldn’t be removed.

### Summary

Outliers can have a significant impact on data analysis and machine learning models. It’s important to understand why outliers exist, how to detect them, and how to handle them in a way that maintains the integrity of your analysis. The methods you choose depend on the context of the data, the problem you're solving, and the sensitivity of your model.