In [1]:
#Q1.

In [2]:
# Missing values in a dataset:

# Are absent or undefined data points for one or more variables or observations.
# Can compromise data integrity
# Reduce statistical power and bias results.
# Challenge ML algorithms that cannot handle missing values, leading to errors during model training and evaluation.
# Require proper handling.

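# Algorithms that can tolerate missing values (depending on the implementation; none are truly unaffected):

# Decision Trees
# Random Forest
# K-Nearest Neighbors (only when a NaN-aware distance metric is used)
# Naive Bayes
# XGBoost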

In [3]:
# Q2.

# Handling missing data is crucial in data preprocessing. Here are some common techniques and Python examples for each:

In [8]:
# Deletion - Removing rows or columns with missing values.
import numpy as np
import pandas as pd

In [9]:
# Toy frame for the deletion demo: column A is missing row 2, column B is missing row 1.
data = dict(A=[1, 2, np.nan, 4], B=[5, np.nan, 7, 8])
df = pd.DataFrame(data=data)

In [10]:
# Listwise deletion: drop every row that contains at least one NaN,
# leaving only the fully observed rows (index 0 and 3 for this frame).
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)

     A    B
0  1.0  5.0
3  4.0  8.0


In [11]:
# Imputation - Replacing missing values with statistical measures.

from sklearn.impute import SimpleImputer

In [12]:
# Rebuild the same toy frame so the imputation demo starts from the raw NaNs.
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8],
}
df = pd.DataFrame.from_dict(data)

In [15]:
# Mean imputation: every NaN is replaced by the mean of the observed values
# in its own column. fit_transform returns a bare array, so it is wrapped
# back into a DataFrame with the original column labels.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(df_imputed)

          A         B
0  1.000000  5.000000
1  2.000000  6.666667
2  2.333333  7.000000
3  4.000000  8.000000


In [16]:
# Forward Fill and Backward Fill - Propagating the last valid observation forward or using the next valid observation backward.

In [17]:
# Fresh copy of the toy frame for the forward/backward fill demo.
data = dict(zip(('A', 'B'), ([1, 2, np.nan, 4], [5, np.nan, 7, 8])))
df = pd.DataFrame(data)

In [18]:
# Forward fill copies the last valid observation down each column; backward
# fill copies the next valid observation up. The dedicated DataFrame.ffill() /
# DataFrame.bfill() methods are used because fillna(method=...) is deprecated
# since pandas 2.1 and emits a FutureWarning; the filled values are the same.
df_ffill = df.ffill()
df_bfill = df.bfill()

In [19]:
# Show the forward-filled frame first, then the backward-filled one.
print(df_ffill, df_bfill, sep='\n')

     A    B
0  1.0  5.0
1  2.0  5.0
2  2.0  7.0
3  4.0  8.0
     A    B
0  1.0  5.0
1  2.0  7.0
2  4.0  7.0
3  4.0  8.0


In [20]:
# Interpolation - Estimating missing values based on existing data points.

# Same toy frame again: one gap in A (row 2) and one gap in B (row 1).
data = {}
data['A'] = [1, 2, np.nan, 4]
data['B'] = [5, np.nan, 7, 8]
df = pd.DataFrame(data)

In [21]:
# Linear interpolation (the pandas default) fills each gap from the values
# on either side of it within the same column.
df_interpolated = df.interpolate(method='linear')
print(df_interpolated)

     A    B
0  1.0  5.0
1  2.0  6.0
2  3.0  7.0
3  4.0  8.0


In [22]:
# Q3.

In [23]:
# Imbalanced Data: Refers to unequal class distribution in a classification problem.
# Bias in Model: Imbalanced data leads to model bias towards the majority class.
# Misclassification: Minority classes suffer higher misclassification rates.
# Ineffective Learning: Models struggle to learn patterns in minority classes.
# Loss of information: Valuable information in minority classes is neglected.
# Unfair Model: Can result in unfair or biased predictions in applications like fraud detection or medical diagnosis.
# Poor Generalization: Models may have trouble generalizing to new data.

In [24]:
#Q4.

# Up-sampling: Increasing the number of instances in the minority class by duplicating or creating synthetic examples.

# Down-sampling: Reducing the number of instances in the majority class by randomly removing examples.

In [25]:
# Up-sampling Example:

from sklearn.utils import resample

In [26]:
# Toy imbalanced dataset: three rows with target == 1 against six with target == 0.
data = pd.DataFrame(
    {
        'feature1': list(range(1, 10)),
        'target': [0, 1, 0, 0, 1, 0, 0, 1, 0],
    }
)

# Partition the rows by label.
minority_class = data.query('target == 1')
majority_class = data.query('target == 0')

# Draw minority rows with replacement until there are as many as majority rows.
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=majority_class.shape[0],
    random_state=42,
)

# Stack the untouched majority rows on top of the re-sampled minority rows.
balanced_data = pd.concat([majority_class, minority_upsampled], axis=0)
print(balanced_data)

   feature1  target
0         1       0
2         3       0
3         4       0
5         6       0
6         7       0
8         9       0
7         8       1
1         2       1
7         8       1
7         8       1
1         2       1
1         2       1


In [27]:
# Down-sampling Example:

# Toy imbalanced dataset: three rows with target == 1 against six with target == 0.
feature_values = [1, 2, 3, 4, 5, 6, 7, 8, 9]
target_values = [0, 1, 0, 0, 1, 0, 0, 1, 0]
data = pd.DataFrame({'feature1': feature_values, 'target': target_values})

# Partition the rows by label.
minority_class = data.loc[data['target'].eq(1)]
majority_class = data.loc[data['target'].eq(0)]

# Draw, without replacement, as many majority rows as there are minority rows.
majority_downsampled = resample(
    majority_class,
    replace=False,
    n_samples=minority_class.shape[0],
    random_state=42,
)

# Minority rows come first, followed by the sampled majority rows.
balanced_data = pd.concat([minority_class, majority_downsampled], axis=0)

print(balanced_data)

   feature1  target
1         2       1
4         5       1
7         8       1
0         1       0
2         3       0
8         9       0


In [28]:
# Q5.

# Data Augmentation: Technique to expand a dataset by applying transformations to existing data.

# SMOTE: Synthetically generates samples for minority class in imbalanced datasets.

# Purpose: Balancing class distribution in classification problems.

# How It Works: Interpolates between minority class samples and their nearest neighbors.

# Example: With 100 minority and 900 majority samples, SMOTE generates 800 synthetic minority samples to reach a 1:1 ratio.

# Benefits: Mitigates class imbalance, reduces overfitting to majority class, improves generalization for minority class.

# Considerations: Use with caution, as aggressive oversampling can introduce noise.

In [29]:
#Q6.

# Outliers: Data points significantly different from other observations.
# Importance of Handling:
# Affect statistical measures and assumptions.
# Distort model accuracy and generalization.
# May indicate errors or rare events.

# Methods to Handle:
# Identify and remove outliers.
# Transform data to be less sensitive to outliers.
# Use robust statistical techniques.
# Treat outliers as a separate class

# Impact on Analysis:
# Outliers can skew results, lead to incorrect conclusions, and affect model performance.

In [30]:
#Q7.
# Here are the techniques to handle missing data in customer data analysis with concise explanations:
# Data Imputation: Fill missing values with mean, median, or mode.
# Forward or Backward Fill: Use neighboring values for time-series data.
# Interpolation: Estimate missing values based on data trends.
# Multiple Imputation: Generate and analyze multiple imputed datasets.
# Deletion: Listwise - remove rows with any missing values; Pairwise - keep all rows and ignore missing values only in the specific calculations that involve them.
# Predictive Modeling: Use ML to predict missing values.
# Domain-Specific Methods: Apply industry knowledge.
# Missing Data Indicator: Add a binary column to flag missing data.

In [31]:
#Q8.

# Visual Inspection: Use data visualizations like heatmaps to spot patterns in missing data.
# Statistical Testing: Apply statistical tests (e.g., chi-square, t-tests) to identify relationships with other variables.
# Domain Expertise: Seek insights from domain experts who may understand the reasons behind missing data.
# Exploratory Data Analysis (EDA): Conduct comprehensive EDA to uncover potential patterns or trends.
# Imputation Methods: Test various imputation techniques, observing which ones work best, indicating data patterns.
# Machine Learning Models: Train models to predict missing values; good performance suggests patterns.
# Time-Series Consideration: Analyze time-related or event-based patterns in time-series data.
# Surveys or Interviews: Gather additional information or insights from data sources or surveys to explain missing data.
# Correlation Analysis: Investigate correlations between variables with missing data and others; strong correlations may reveal patterns.
# Missing Data Indicators: Create binary indicators (1 for missing, 0 for not) and study their relationships with other variables.

In [33]:
#Q9.

# Here are a few effective strategies for building and evaluating a machine learning model on an imbalanced dataset in a medical diagnosis project (judge performance with precision, recall, F1-score and ROC-AUC / PR-AUC rather than raw accuracy):

# Ensemble Models: Employ ensemble methods like Random Forests or Gradient Boosting, which handle class imbalance well.

# Resampling: Either oversample the minority class (up-sampling) or undersample the majority class (down-sampling) to balance the class distribution.

# Synthetic Data Generation: Use techniques like Synthetic Minority Over-sampling Technique (SMOTE) to create synthetic samples of the minority class.

# Anomaly Detection: Treat the minority class as anomalies and apply anomaly detection algorithms.

# Cost-sensitive Learning: Adjust the misclassification costs to penalize errors on the minority class more heavily.

# Cross-Validation: Perform cross-validation with stratified sampling to ensure representative splits during training and evaluation.

# Algorithm Selection: Choose algorithms that are less sensitive to class imbalance, such as Support Vector Machines (SVM) with class weights.

# Collect More Data: If possible, collect more data for the minority class to balance the dataset.

# Domain Knowledge: Leverage domain knowledge to inform feature engineering or model selection.

In [34]:
#Q10.

# To balance an unbalanced dataset with a majority class (e.g., satisfied customers) and a minority class (e.g., unsatisfied customers) through down-sampling, you can employ several methods:

# Random Down-Sampling: Randomly select a subset of the majority class samples to match the size of the minority class.

# Cluster-Based Down-Sampling: Use clustering techniques to group similar samples from the majority class, then randomly select one sample from each cluster.

# Edited Nearest Neighbors (ENN): Remove majority class samples whose class label differs from most of their k-nearest neighbors.

# NearMiss: Select majority class samples that are closest to the minority class samples based on specific distance metrics.

# CNN (Condensed Nearest Neighbors): Iteratively identify and remove redundant majority class samples while preserving the boundary information.

In [None]:
#Q11.

