# Data Exploration and Preprocessing:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/content/adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [5]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

24
0


In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0
mean,38.585549,189780.8,10.081815,1078.443741,87.368227,40.440329
std,13.637984,105556.5,2.571633,7387.957424,403.101833,12.346889
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,236993.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Identify numerical features
numerical_features = df.select_dtypes(include=np.number).columns

# Apply Standard Scaling
scaler_standard = StandardScaler()
df_scaled_standard = df.copy()
df_scaled_standard[numerical_features] = scaler_standard.fit_transform(df[numerical_features])

print("DataFrame after Standard Scaling:")
print(df_scaled_standard.head())

# Apply Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_scaled_minmax = df.copy()
df_scaled_minmax[numerical_features] = scaler_minmax.fit_transform(df[numerical_features])

print("\nDataFrame after Min-Max Scaling:")
print(df_scaled_minmax.head())


DataFrame after Standard Scaling:
        age          workclass    fnlwgt   education  education_num  \
0  0.030390          State-gov -1.063569   Bachelors       1.134777   
1  0.836973   Self-emp-not-inc -1.008668   Bachelors       1.134777   
2 -0.042936            Private  0.245040     HS-grad      -0.420679   
3  1.056950            Private  0.425752        11th      -1.198407   
4 -0.776193            Private  1.408066   Bachelors       1.134777   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  i

- Standard Scaling (Z-score normalization)
- Best for: Data that follows a normal distribution.
- How it works: Centers data around zero with a standard deviation of one.
- Why use it? Ideal for algorithms like logistic regression, SVM, and neural networks that assume normally distributed features.
- Sensitive to outliers? Yes, because extreme values affect the mean and standard deviation.


- Min-Max Scaling (Normalization)
- Best for: Data that does not follow a normal distribution.
- How it works: Rescales values to a fixed range, typically [0,1] or [-1,1].
- Why use it? Works well for decision trees, k-nearest neighbors, and SVM when feature ranges vary.
- Sensitive to outliers? Yes, because extreme values can distort scaling.


# 2. Encoding Techniques:

**•	Apply One-Hot Encoding to categorical variables with less than 5 categories. **

In [8]:
# prompt: •	Apply One-Hot Encoding to categorical variables with less than 5 categories. with d type int

# Identify categorical features
categorical_features = df.select_dtypes(include='object').columns

# Apply One-Hot Encoding to categorical variables with less than 5 categories
for col in categorical_features:
    if df[col].nunique() < 5:
        df = pd.get_dummies(df, columns=[col], prefix=col, dtype=int)

print("\nDataFrame after One-Hot Encoding:")
print(df.head())


DataFrame after One-Hot Encoding:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race  \
0        Never-married        Adm-clerical   Not-in-family   White   
1   Married-civ-spouse     Exec-managerial         Husband   White   
2             Divorced   Handlers-cleaners   Not-in-family   White   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black   
4   Married-civ-spouse      Prof-specialty            Wife   Black   

   capital_gain  capital_loss  hours_per_week  native_country  sex_ Female  \
0          2174             0              40   United-States            0   
1  

**•	Use Label Encoding for categorical variables with more than 5 categories**

In [9]:
# prompt: **•	Use Label Encoding for categorical variables with more than 5 categories**

from sklearn.preprocessing import LabelEncoder

# Apply Label Encoding to categorical variables with more than 5 categories
for col in categorical_features:
    # Check if the column still exists after one-hot encoding
    if col in df.columns and df[col].nunique() >= 5:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

print("\nDataFrame after Label Encoding:")
print(df.head())
print(df.info())


DataFrame after Label Encoding:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Female  sex_ Male  income_ <=

**•	Discuss the pros and cons of One-Hot Encoding and Label Encoding.**

Both **One-Hot Encoding** and **Label Encoding** are techniques for converting categorical data into numerical format, but they have distinct advantages and drawbacks.

### **One-Hot Encoding**
 **Pros:**  
- Works well for **nominal** categorical data (no inherent order).  
- Prevents models from assuming a false ordinal relationship between categories.  
- Ideal for algorithms like **logistic regression, neural networks, and KNN**.  

**Cons:**  
- Increases dimensionality, especially with high-cardinality features.  
- Can lead to the **curse of dimensionality**, making models slower and memory-intensive.  
- Requires careful handling to avoid the **dummy variable trap** (dropping one column to prevent multicollinearity).  

### **Label Encoding**
**Pros:**  
- Efficient for **ordinal** categorical data (where order matters).  
- Works well with **tree-based models** like Decision Trees and Random Forests.  
- Keeps feature space compact, avoiding excessive memory usage.  

**Cons:**  
- Can mislead models by introducing a **false ordinal relationship** in nominal data.  
- May cause bias in algorithms that interpret numerical values as continuous.  

# 3. Feature Engineering:

**•	Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.**

In [10]:
# Feature 1: Interaction between 'capital-gain' and 'capital-loss'
# Create a new feature that represents the net capital change.
df['net_capital_change'] = df['capital_gain'] - df['capital_loss']

# Feature 2: Polynomial feature for 'age'
# Create a new feature by squaring the 'age' column. This can capture non-linear relationships.
df['age_squared'] = df['age']**2

You're creating two new features in your dataset to enhance your model's ability to capture relationships within the data. Let's break them down:

### **Feature 1: Net Capital Change (`net-capital-change`)**
- This feature is calculated as the difference between `'capital-gain'` and `'capital-loss'`, representing the overall net capital change for each individual.
- Why is this useful? Instead of treating `'capital-gain'` and `'capital-loss'` as separate features, this transformation allows the model to learn the **combined financial impact**, which may be more meaningful in predicting income-related outcomes.

### **Feature 2: Polynomial Feature for Age (`age_squared`)**
- You're squaring the `'age'` column to introduce a **non-linear relationship** in the dataset.
- Why is this useful? Age may have a **non-linear effect** on income or other target variables. For example, income growth may plateau after a certain age, and squaring age helps capture such patterns.


**•	Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.**

In [11]:
# Check skewness of numerical features
numerical_features = df.select_dtypes(include=np.number).columns
skewness = df[numerical_features].skew().sort_values(ascending=False)
print("\nSkewness of numerical features:")
print(skewness)

# Apply log transformation to 'capital-gain'
# Add a small constant (1) to avoid log(0)
df['capital_gain_log'] = np.log1p(df['capital_gain'])

# Check skewness after transformation
print("\nSkewness of 'capital-gain' after log transformation:", df['capital_gain_log'].skew())



Skewness of numerical features:
capital_gain          11.949403
net_capital_change    11.867499
capital_loss           4.592702
fnlwgt                 1.447703
age_squared            1.351414
income_ >50K           1.211687
relationship           0.786548
sex_ Female            0.719449
age                    0.557663
hours_per_week         0.228759
occupation             0.114586
marital_status        -0.012753
education_num         -0.309500
sex_ Male             -0.719449
workclass             -0.751598
education             -0.934160
income_ <=50K         -1.211687
race                  -2.435338
native_country        -3.661128
dtype: float64

Skewness of 'capital-gain' after log transformation: 3.094666793136126


if we check the skewness of the variables capital gain has highest skew so i used log transformation on it after transformation if we see the skew it reduced

# 4. Feature Selection:

**•	Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.**

In [12]:
from sklearn.ensemble import IsolationForest

# Identify numerical features for outlier detection
numerical_features_for_outliers = df.select_dtypes(include=np.number).columns

# Apply Isolation Forest to the numerical features
# contamination='auto' lets the algorithm decide the proportion of outliers
iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
outlier_predictions = iso_forest.fit_predict(df[numerical_features_for_outliers])

# outlier_predictions will be 1 for inliers and -1 for outliers
# Filter out the outliers
df_no_outliers = df[outlier_predictions == 1].copy()

print(f"\nOriginal dataset size: {len(df)}")
print(f"Dataset size after removing outliers: {len(df_no_outliers)}")
print(f"Number of outliers removed: {len(df) - len(df_no_outliers)}")



Original dataset size: 32537
Dataset size after removing outliers: 27519
Number of outliers removed: 5018


Outliers are data points that are significantly different from other observations.
They can negatively impact model performance in several ways:

1. Sensitivity of Algorithms: Many algorithms, particularly those that rely on
   distance metrics (like K-Nearest Neighbors, K-Means) or assume normality
   (like Linear Regression, Logistic Regression), are highly sensitive to outliers.
   Outliers can skew the data distribution, affecting the mean, standard deviation,
   and variance, which in turn can distort the model's learning process.

2. Skewed Model Parameters: In linear models, outliers can pull the regression
   line towards them, leading to incorrect parameter estimates. This results in
   a model that doesn't accurately represent the underlying relationship in the
   majority of the data.

3. Increased Variance: Outliers can inflate the variance of the data, leading
   to models that are less stable and generalize poorly to new, unseen data.

4. Reduced Accuracy: Models trained on data with significant outliers may have
   lower accuracy, higher error rates (like Mean Squared Error), and poorer
   predictive performance, especially for predicting values near the typical range.

5. Difficulty in Convergence: Some optimization algorithms used in model training
   can struggle to converge when outliers are present.

Removing outliers can help to create a more stable, robust, and accurate model
by allowing the algorithm to learn the true patterns in the majority of the data
without being unduly influenced by extreme values. However, it's important to
investigate outliers before removing them, as they might represent valid but rare
observations or indicate data collection errors.

•	Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.

In [None]:
# prompt: •	Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.
'''
!pip install ppscore
import ppscore as pps

# Calculate the PPS matrix
pps_matrix = pps.matrix(df_no_outliers)

# Display the PPS matrix (showing only positive scores for clarity)
print("\nPredictive Power Score (PPS) Matrix:")
# Filter for relevant scores and pivot for better visualization
pps_pivot = pps_matrix.pivot(index='x', columns='y', values='ppscore')
# Optional: Use a heatmap for better visualization
plt.figure(figsize=(12, 10))
sns.heatmap(pps_pivot, annot=True, cmap="Blues", fmt=".2f")
plt.title("Predictive Power Score (PPS) Heatmap")
plt.show()

# Calculate the Correlation matrix
correlation_matrix = df_no_outliers.corr(numeric_only=True)

# Display the Correlation matrix
print("\nCorrelation Matrix:")
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()
'''

**the above code is used to check pps but due to verson difference it was not running i tried multiple ways like installing old versons of pandas etc but failed to get it please give some advice**