# Feature Engineering 2

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

We'll continue to explore:
- Feature Extraction
- Data Scaling
- Additional Data Transformation

## Feature Extraction

- Data Binning
- Data Grouping
- Deriving Information

### Data Binning
- Data binning or data bucketing is a data preprocessing technique that involves grouping continuous data points into smaller, discrete intervals (bins)
- In some cases, binning can improve the performance of machine learning models. e.g. Decision Trees
- Another advantage is it reduces noise and outliers

In [57]:
df = pd.read_csv('/Users/bassel_instructor/Documents/Datasets/HousePrices.csv')
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [58]:
df['yr_built'].value_counts(bins=10) #deciles

yr_built
(2002.6, 2014.0]      848
(1979.8, 1991.2]      635
(1957.0, 1968.4]      607
(1945.6, 1957.0]      549
(1991.2, 2002.6]      534
(1968.4, 1979.8]      511
(1934.2, 1945.6]      273
(1922.8, 1934.2]      259
(1899.885, 1911.4]    207
(1911.4, 1922.8]      177
Name: count, dtype: int64

Using `cut()` 
- very sophisticated function for building bins
- cut does not summarize the data. Instead, it applies a range label for each data point

In [59]:
df['yr_built_bin'] = pd.cut(df['yr_built'], 10)
df[['yr_built', 'yr_built_bin']].head(10)

Unnamed: 0,yr_built,yr_built_bin
0,1955,"(1945.6, 1957.0]"
1,1921,"(1911.4, 1922.8]"
2,1966,"(1957.0, 1968.4]"
3,1963,"(1957.0, 1968.4]"
4,1976,"(1968.4, 1979.8]"
5,1938,"(1934.2, 1945.6]"
6,1976,"(1968.4, 1979.8]"
7,1989,"(1979.8, 1991.2]"
8,1985,"(1979.8, 1991.2]"
9,1945,"(1934.2, 1945.6]"


Build text categories instead of bin ranges. e.g. define if the house is old, average, or new

In [60]:
df['house_age'] = pd.cut(df['yr_built'], 4, labels=['very old', 'old', 'average', 'new'])
df[['yr_built', 'yr_built_bin', 'house_age']].head(10)

Unnamed: 0,yr_built,yr_built_bin,house_age
0,1955,"(1945.6, 1957.0]",old
1,1921,"(1911.4, 1922.8]",very old
2,1966,"(1957.0, 1968.4]",average
3,1963,"(1957.0, 1968.4]",average
4,1976,"(1968.4, 1979.8]",average
5,1938,"(1934.2, 1945.6]",old
6,1976,"(1968.4, 1979.8]",average
7,1989,"(1979.8, 1991.2]",new
8,1985,"(1979.8, 1991.2]",average
9,1945,"(1934.2, 1945.6]",old


Define Custom Ranges with Variable Lengths

In [61]:
year_bins = [1900, 1970, 1980, 1990, 2000, np.inf]

df['yr_built_bin_v2'] = pd.cut(df['yr_built'], year_bins)

df[['yr_built','house_age','yr_built_bin_v2']].sample(15)

Unnamed: 0,yr_built,house_age,yr_built_bin_v2
321,2005,new,"(2000.0, inf]"
1379,1925,very old,"(1900.0, 1970.0]"
3271,2005,new,"(2000.0, inf]"
2366,1908,very old,"(1900.0, 1970.0]"
3751,1922,very old,"(1900.0, 1970.0]"
2375,1967,average,"(1900.0, 1970.0]"
2660,1991,new,"(1990.0, 2000.0]"
1201,1979,average,"(1970.0, 1980.0]"
2736,1983,average,"(1980.0, 1990.0]"
1183,2008,new,"(2000.0, inf]"


### Deriving Features (From Existing Data)
It requires domain expertise and familiarity in the data elements.

Total Bedrooms

In [62]:
df['total_rooms'] = df['bedrooms'] + df['bathrooms']
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_built,yr_renovated,street,city,statezip,country,yr_built_bin,house_age,yr_built_bin_v2,total_rooms
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,...,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA,"(1945.6, 1957.0]",old,"(1900.0, 1970.0]",4.5
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,...,1921,0,709 W Blaine St,Seattle,WA 98119,USA,"(1911.4, 1922.8]",very old,"(1900.0, 1970.0]",7.5
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,...,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA,"(1957.0, 1968.4]",average,"(1900.0, 1970.0]",5.0
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,...,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA,"(1957.0, 1968.4]",average,"(1900.0, 1970.0]",5.25
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,...,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA,"(1968.4, 1979.8]",average,"(1970.0, 1980.0]",6.5


Cost per Square-foot

In [63]:
df['price_per_sqft'] = round(df['price'] / df['sqft_living'])
df[['price','sqft_living','price_per_sqft']].head()

Unnamed: 0,price,sqft_living,price_per_sqft
0,313000.0,1340,234.0
1,2384000.0,3650,653.0
2,342000.0,1930,177.0
3,420000.0,2000,210.0
4,550000.0,1940,284.0


In [64]:
df.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country', 'yr_built_bin', 'house_age', 'yr_built_bin_v2',
       'total_rooms', 'price_per_sqft'],
      dtype='object')

Get street name

In [65]:
df['street_name'] = df['street'].str.split().str[1]+' '+df['street'].str.split().str[2]
df['street_name'].value_counts()

street_name
8th Ave      33
30th Ave     30
33rd Ave     28
4th Ave      28
11th Ave     26
             ..
NE 33rd       1
SE 234th      1
8th Pl        1
279th Pl      1
S Creston     1
Name: count, Length: 1943, dtype: int64

**Bonus Example**

In [66]:
data={'Candy Variety' :['Chocolate Hearts', 'Sour Jelly', 'Candy Canes', 'Sour Jelly' , 'Fruit Drops'], 'Date and Time': ['09-02-2020 14:05' , '24-10-2020 18:00', '18-12-2020 20:13', '25-10-2020 10:00', '18-10-2020 15:46'],
'Day': ['Sunday', 'Saturday', 'Friday', 'Sunday', 'Monday']
    , 'Length': [3, 3.5, 3.5, 3.5, 5.0]
    , 'Breadth': [2,2,2.5,2,3]
    , 'Price' : [7.5, 7.6, 8, 7.6, 9]}

df_r = pd.DataFrame(data)
df_r

Unnamed: 0,Candy Variety,Date and Time,Day,Length,Breadth,Price
0,Chocolate Hearts,09-02-2020 14:05,Sunday,3.0,2.0,7.5
1,Sour Jelly,24-10-2020 18:00,Saturday,3.5,2.0,7.6
2,Candy Canes,18-12-2020 20:13,Friday,3.5,2.5,8.0
3,Sour Jelly,25-10-2020 10:00,Sunday,3.5,2.0,7.6
4,Fruit Drops,18-10-2020 15:46,Monday,5.0,3.0,9.0


In [67]:
df_r['Type of Day'] = np.where(df_r['Day'].isin(['Saturday', 'Sunday']), 'Weekend', 'Weekday')
df_r

Unnamed: 0,Candy Variety,Date and Time,Day,Length,Breadth,Price,Type of Day
0,Chocolate Hearts,09-02-2020 14:05,Sunday,3.0,2.0,7.5,Weekend
1,Sour Jelly,24-10-2020 18:00,Saturday,3.5,2.0,7.6,Weekend
2,Candy Canes,18-12-2020 20:13,Friday,3.5,2.5,8.0,Weekday
3,Sour Jelly,25-10-2020 10:00,Sunday,3.5,2.0,7.6,Weekend
4,Fruit Drops,18-10-2020 15:46,Monday,5.0,3.0,9.0,Weekday


## Normalization and Standardization

- Data scaling is the process of transforming features ina  dataset so they can have comparable magnitudes. 
- It's crucial for many machine learning algorithms and it impacts its overall accuracy and performance 
- We have 2 main methods:
    - Normalization (`MinMaxScaler`):
        - Scaling the data to a fixed range from 0 to 1 (min:0 to max:1)
        - Usage: suitable when the data is not normally distributed 
        - It's very popular for deep learning algorithms
    - Standardization (`StandardScaler`)
        - Transforming the data into a scale with a mean of 0 and standard deviation of 1
        - Usage: when the data is normally distributed 

In [68]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [69]:
df.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country', 'yr_built_bin', 'house_age', 'yr_built_bin_v2',
       'total_rooms', 'price_per_sqft', 'street_name'],
      dtype='object')

In [70]:
X = df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition']]

In [71]:
# normalization
MM_scaler = MinMaxScaler()

X_scaled = MM_scaler.fit_transform(X)
X_scaled

array([[0.01177134, 0.33333333, 0.1875    , ..., 0.        , 0.        ,
        0.5       ],
       [0.08965777, 0.55555556, 0.3125    , ..., 0.        , 1.        ,
        1.        ],
       [0.01286198, 0.33333333, 0.25      , ..., 0.        , 0.        ,
        0.75      ],
       ...,
       [0.01567898, 0.33333333, 0.3125    , ..., 0.        , 0.        ,
        0.5       ],
       [0.00764949, 0.44444444, 0.25      , ..., 0.        , 0.        ,
        0.5       ],
       [0.00829635, 0.33333333, 0.3125    , ..., 0.        , 0.        ,
        0.75      ]])

In [72]:
np.max(X_scaled), np.min(X_scaled)

(1.0, 0.0)

In [73]:
#standardization
stn_scaler = StandardScaler()

X_scaled = stn_scaler.fit_transform(X)
X_scaled

array([[-0.42386353, -0.44112227, -0.84320364, ..., -0.08500441,
        -0.30919434, -0.6671122 ],
       [ 3.2495981 ,  1.75970468,  0.43280154, ..., -0.08500441,
         4.83007931,  2.28641631],
       [-0.37242442, -0.44112227, -0.20520105, ..., -0.08500441,
        -0.30919434,  0.80965205],
       ...,
       [-0.23956224, -0.44112227,  0.43280154, ..., -0.08500441,
        -0.30919434, -0.6671122 ],
       [-0.61826787,  0.6592912 , -0.20520105, ..., -0.08500441,
        -0.30919434, -0.6671122 ],
       [-0.58775916, -0.44112227,  0.43280154, ..., -0.08500441,
        -0.30919434,  0.80965205]])

![sc](https://media.geeksforgeeks.org/wp-content/uploads/20200519001052/2020-05-18-21.png)


Here is a table that summarizes the key differences between MinMaxScaler and StandardScaler:

| Feature | MinMaxScaler | StandardScaler |
|---|---|---|
| Range | Scales the data to a fixed range, typically between 0 and 1 | Scales the data to have a mean of 0 and a standard deviation of 1 |
| Distribution | Not sensitive to the distribution of the data | Sensitive to the distribution of the data |
| Outliers | Sensitive to outliers | Not sensitive to outliers |
| Use cases | Good for datasets with a large range of values or neural networks| Good for datasets with a normal distribution or regression models|


**Robust Scaling:**

Robust scaling is a method used in statistics and machine learning to scale features by removing the median and scaling data based on the interquartile range (IQR). It is robust to outliers, meaning that extreme values in the data do not unduly influence the scaling. Robust scaling is particularly useful when dealing with datasets that contain outliers.

## Additional Data Transformation

### Log Transformation
- Log transformation is useful for handling skewed data or reducing the impact of outliers. 
- It applies the natural logarithm to the variable values and makes highly skewed distributions less skewed.
- Stabilize variance and make data more normal.


In [53]:
# Logarithmic transformation of the 'price' column
df['log_price'] = np.log(df['price'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


### Square Transformation
Square root transformation, like log transformation, effectively stabilizes variance and addresses skewed distributions. Although it's gentler than log transformation, it achieves the same objective.

In [54]:
# Square root transforming the 'price' variable
df['SquareRoot_price'] = np.sqrt(df['price'])

## Feature Hashing

Hashing is a technique that combines more than one category of a categorical variable into one single category. 

Feature hasing is a important technique for handling sparse and high-dimensional features in machine learning. 

- It is fast, simple, memory-efficient, and well-suited to online learning sceanrios. 
- It converts unique tokens into integers. 
- It operates on the exact strings that you provide as input and does not perform any linguistic analysis or preprocessing.

**Example1**: combining movies into categories for Netflix recommendation

**Example2**: Representing a text into a vector:
_Mark has a fun hoby. He goes fishing every weekend. Fishing is fun and relaxing._
 
Bag of words: 
assume an array of all words in a dictionary:
Vector has 1 entry per word in dictionary:
 
(0,1,0,0,....,2,0,1)
- 1 is occurence of Mark
- 2 is occurance of fishing
- 0 is occurance of bike

### **Business Logic**

Consider a column in the dataset corresponds to "zip codes". There are 182 zip codes in New York state and it is impractical to use each zip code as a separate category. 
So, to tackle this situation we can merge the zip codes according to localities.
This helps to reduce the number of categories and results in meaningful aggregation of zip code.

### **Frequency**

- It is not possible to apply business logic every time. In such cases, perform hashing using the frequency of occurrence.
- To combine levels using their frequency, we first look at the frequency distribution of each level and combine levels having frequency say less than 5% of total observation (can be changed based on distribution).
- This is an effective method to deal with rare levels.
- We can also combine levels by considering the response rate of each level. We can simply combine levels having similar response rates into the same group.

Exercise

In [11]:
from sklearn.feature_extraction import FeatureHasher

In [12]:
#Select the cell and click on run icon
import pandas as pd
game_df = pd.read_csv("datasets/vgsales.csv", encoding="utf-8")
game_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [13]:
#Select the cell and click on run icon
game_df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='object')

**Observations from the above output:**
>The column names such as Rank, Name, Platform, Year, Genre, Publisher, NA_Sales,
       EU_Sales, JP_Sales, Other_Sales, and Global_Sales present in the **`game_df`** dataframe.

In [14]:
game_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo
5,Tetris,GB,1989.0,Puzzle,Nintendo
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo


In [19]:
#let's get unique values and length of Genre
u_generes = game_df["Genre"].unique()
print("Total game generes:", len(u_generes))
print(u_generes)

Total game generes: 12
['Sports' 'Platform' 'Racing' 'Role-Playing' 'Puzzle' 'Misc' 'Shooter'
 'Simulation' 'Action' 'Fighting' 'Adventure' 'Strategy']


In [26]:
fh = FeatureHasher(n_features=12, input_type='string')


In [29]:
hashed_features = fh.fit_transform(game_df["Genre"])

ValueError: Samples can not be a single string. The input must be an iterable over iterables of strings.

In [None]:
hashed_features = hashed_features.toarray()
new_game_df = pd.concat([game_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1)

new_game_df.head()

In [25]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Sample data
data = pd.DataFrame({'category': ['A', 'B', 'C', 'A', 'B', 'A']}) 

# Create the hasher 
hasher = FeatureHasher(n_features=10)

# Hash the categorical column  
hashed_features = hasher.transform(data['category'])

# Convert to DataFrame    
hashed_df = pd.DataFrame(hashed_features.toarray())

print(hashed_df)

AttributeError: 'str' object has no attribute 'items'

In [30]:
# Import the pandas and sklearn libraries
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Create a dataframe with some dummy data
df = pd.DataFrame({
    'color': ['red', 'green', 'blue', 'red', 'green', 'blue'],
    'shape': ['circle', 'square', 'triangle', 'circle', 'square', 'triangle']
})

# Create a FeatureHasher object with 10 features and input type as string
h = FeatureHasher(n_features=10, input_type='string')

# Transform the dataframe into a sparse matrix of hashed features
f = h.transform(df.values)

# Convert the sparse matrix into a dense array
f = f.toarray()

# Print the array
print(f)


[[ 1.  0.  0.  0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1. -1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1. -1.  0.  0.  0.  0.]]
