## Daily Challenge: Web Scraping And Data Visualization

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import zscore
import numpy as np

In [7]:
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                "hours-per-week", "native-country", "income"]

In [8]:
df = pd.read_csv(r'/content/adult.data',names=column_names, na_values=" ?", skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# Initial exploration
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [10]:
# Data Normalization
numerical_features = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

In [13]:
# Min-Max scaling
scaler_minmax = MinMaxScaler()
df[numerical_features] = scaler_minmax.fit_transform(df[numerical_features])


In [14]:
# Z-score normalization
scaler_zscore = StandardScaler()
df[numerical_features] = scaler_zscore.fit_transform(df[numerical_features])

# Changes in data distribution post-normalization
print(df.describe())

                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  3.256100e+04  3.256100e+04   3.256100e+04  3.256100e+04  3.256100e+04   
mean  -1.309314e-16 -1.330045e-16   3.447859e-17 -7.201225e-18 -6.655678e-17   
std    1.000015e+00  1.000015e+00   1.000015e+00  1.000015e+00  1.000015e+00   
min   -1.582206e+00 -1.681631e+00  -3.529656e+00 -1.459205e-01 -2.166595e-01   
25%   -7.757679e-01 -6.816910e-01  -4.200596e-01 -1.459205e-01 -2.166595e-01   
50%   -1.159546e-01 -1.082193e-01  -3.136003e-02 -1.459205e-01 -2.166595e-01   
75%    6.904838e-01  4.478765e-01   7.460392e-01 -1.459205e-01 -2.166595e-01   
max    3.769612e+00  1.226856e+01   2.300838e+00  1.339458e+01  1.059351e+01   

       hours-per-week  
count    3.256100e+04  
mean    -6.142864e-16  
std      1.000015e+00  
min     -3.194030e+00  
25%     -3.542945e-02  
50%     -3.542945e-02  
75%      3.695194e-01  
max      4.742967e+00  


In [15]:
# Handling Outliers using Z-score
z_scores = zscore(df[numerical_features])
outliers = (z_scores > 3) | (z_scores < -3)

In [22]:
# Apply log transformation to handle outliers
# Log transformation
for feature in numerical_features:
    if np.any(outliers[feature]):
        df[feature] = np.log1p(df[feature])

df[feature]

  result = getattr(ufunc, method)(*inputs, **kwargs)


0       -0.038149
1             NaN
2       -0.038149
3       -0.038149
4       -0.038149
           ...   
32556   -0.336143
32557   -0.038149
32558   -0.038149
32559         NaN
32560   -0.038149
Name: hours-per-week, Length: 32561, dtype: float64