In [20]:
import pandas as pd
import numpy as np

### Handling Outliers

##### Methods of Detecting Outliers

##### Example

In [30]:
import numpy as np

# We create a simple dataset with an outlier in the salary column
data = {
    'Name': ['Alice','Bob','Charlie','David','Eve'],
    'Salary': [50000,60000,70000, 1000000,75000] # David has a much larger salary (Outlier)
}

df_outliers = pd.DataFrame(data) # Converting the dictionary into a pandas DataFrame

In [31]:
df_outliers

Unnamed: 0,Name,Salary
0,Alice,50000
1,Bob,60000
2,Charlie,70000
3,David,1000000
4,Eve,75000


#### DETECTING OUTLIERS USING INTERQUATILE RANGE (IQR)

#### What are percentiles ?

#### 1. Interquartile Range (IQR)

##### interquatile Range (IQR) = Q3 - Q1

##### To get Q1: We find the position;

##### Position = 25 / 100 x (n + 1)

#### 2. Outlier Detection using IQR

##### Example

In [37]:
# Calculating Q1 (25th percentile) and Q3 (75th percentile)

Q1 = df_outliers['Salary'].quantile(0.25)   # Calculating for the First quartile.

Q3 = df_outliers['Salary'].quantile(0.75)   # Caculating for the Third quatile.

IQR = Q3 - Q1 # Checking for interquatile range

# Define the Lower and Upper bounds for detecting outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [38]:
Q1

np.float64(60000.0)

In [39]:
Q3

np.float64(75000.0)

In [40]:
IQR

np.float64(15000.0)

#### Decting Outliers Using the IQR Method.

In [35]:
# Detect outlier values outside the lower and upper bounds
outliers_iqr = df_outliers[(df_outliers['Salary'] < lower_bound) | (df_outliers['Salary'] > upper_bound)]
print(outliers_iqr)

    Name   Salary
3  David  1000000


#### Decting Outliers Using Standard Deviation

In [42]:
# calculate the mean and standard deviations of the salary column 
mean_salary = df_outliers['Salary'].mean()
std_salary = df_outliers['Salary'].std()

# Defining the threshold for outliers (which is 3 standard deviation from the mean)
std_threshold = 1

# Having established our condition for determining our outlier, which is the 'mean', 'standard deviation' and setting our threshood at '1'.
# we can now proceed to checking out for the outlier in the 'salary' data set.

In [44]:
# Detecting Outliers using values that are more than 3 standard deviation away from the mean

outliers_std = df_outliers[np.abs(df_outliers['Salary'] - mean_salary) > std_threshold * std_salary]

print("\nOutliers detected using Standard Deviation:")
print(outliers_std)


Outliers detected using Standard Deviation:
    Name   Salary
3  David  1000000


#### Handling Outliers Detected by IQR

In [51]:
# capping the outliers by replacing values that are above the upper bounds or below the lower bound
df_outliers['Salary_IQR_capped'] = np.where(df_outliers['Salary'] > upper_bound, upper_bound,
                                            np.where(df_outliers['Salary'] < lower_bound, lower_bound, df_outliers['Salary']))

In [52]:
df_outliers['Salary_IQR_capped']

0    50000.0
1    60000.0
2    70000.0
3    97500.0
4    75000.0
Name: Salary_IQR_capped, dtype: float64

#### Calculating Bounds

## FEATURE ENGINEERING

In [89]:
nigeria_houses = pd.read_csv("nigeria_houses_data.csv")
nigeria_houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24326 entries, 0 to 24325
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       24326 non-null  int64  
 1   bathrooms      24326 non-null  int64  
 2   toilets        24326 non-null  int64  
 3   parking_space  24326 non-null  int64  
 4   title          24325 non-null  object 
 5   town           24325 non-null  object 
 6   state          24325 non-null  object 
 7   price          24318 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 1.5+ MB


#### Feature Engineering
##### Definition

Feature engineering is the  process of transforming raw data into meaningful input features that improves the performance of Machine learning models.

##### Explanation:

##### It involves creating, modifying, or selecting the most relevant features from the dataset. common techniques includes:

1. Handling missing values (e.g imputation or Deletion).
2. Encoding categorical variables (e.g one-hot encoding or get_dummies method).
3. Scaling numerical values (e.g normalization).
4. creating new features (e.g aggregation, ratios).
5. Removing irrelivant or redundant features.

##### Good feature engineering can significantly enhance model accuracy and efficency.


#### Spliting Features and Target Variable

When preparing data for machine learning, it's important to the features (independent variables) from the target variable (dependent variable).
This step ensures that the model has the correct input data (features) and the correct output to predict (target).

**Features:** These are the input variables that the model uses to learn patterns and make predictions. In the code , x = nigeria_houses.drop("price", axis=1) removes the "price" column from the nigeria_houses DataFrame, leaving the remaining column as features, stored in x.

**Target Variable:** This is the output variables that the model aims to predict. In the code, y = nigeria_houses["price"] selects the "price" column from the DataFrame and stores it in y, making it the target variable. 

#### Code Explanation

In [90]:
x = nigeria_houses.drop("price", axis=1) # Saparate the features from the target variable by removing the "price" column.
y = nigeria_houses["price"] # selects the "price" column as the target varaible

In [91]:
# Calling the x variable which is the "Features" excluding the "price" column
x

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state
0,6,5,5,4,Detached Duplex,Mabushi,Abuja
1,4,5,5,4,Terraced Duplexes,Katampe,Abuja
2,4,5,5,4,Detached Duplex,Lekki,Lagos
3,4,4,5,6,Detached Duplex,Ajah,Lagos
4,4,4,5,2,Semi Detached Duplex,Lekki,Lagos
...,...,...,...,...,...,...,...
24321,2,2,2,4,Block of Flats,Kabusa,Abuja
24322,4,5,5,4,Block of Flats,Ado-Odo/Ota,Ogun
24323,4,5,5,4,Detached Duplex,Lekki,Lagos
24324,3,4,4,3,Block of Flats,Victoria Island (VI),Lagos


In [92]:
x = nigeria_houses.drop("price", axis=1)
y = nigeria_houses["price"]

In [93]:
# Calling the y variable which is the "Target variable" which is the "price" column.
y.head()

0    450000000.0
1    800000000.0
2    120000000.0
3     40000000.0
4     75000000.0
Name: price, dtype: float64

In [82]:
nigeria_houses = pd.read_csv("nigeria_houses_data.csv")

In [94]:
# converting the price column from "price" to "price" divided by 1 exponential 6 and storing the result in a new variable (column) 'price_millions'. 
nigeria_houses['price_millions'] = nigeria_houses['price'] / 1e6

# dropping the old "price" column in replacement for the new "price_millions" column 
nigeria_houses.drop(columns=['price'], inplace=True)

In [95]:
# Saparate the features from the target variable by removing the "price_millions" column.
x = nigeria_houses.drop('price_millions', axis=1)

# selecting the "price_millions" column as the target varaible.
y = nigeria_houses['price_millions']

In [96]:
# Calling the x variable which is now the "Features" excluding the "price_million" column
x.head()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state
0,6,5,5,4,Detached Duplex,Mabushi,Abuja
1,4,5,5,4,Terraced Duplexes,Katampe,Abuja
2,4,5,5,4,Detached Duplex,Lekki,Lagos
3,4,4,5,6,Detached Duplex,Ajah,Lagos
4,4,4,5,2,Semi Detached Duplex,Lekki,Lagos


In [97]:
# Calling the y variable which is now the "Target variable" which is the "price_million" column.
y.head()

0    450.0
1    800.0
2    120.0
3     40.0
4     75.0
Name: price_millions, dtype: float64

#### Importing Libraries for One-Hot-Encoding
Before we can apply machine learning techniques, we need to ensure that the data is in the right format. One commmon preprocessing step is the "one-hot encoding", which is used to convert categorical data into format that can be feed to the machine learning algorithms.

**sklearn Library:** The sklearn (or scikit-learn) library is a popular Python library for machine learning. It provides tools for
1. Data Preprocessing
2. Model Selection
3. And various algorithms for classification, regression and more

**One-Hot-Encoder:** The OneHotEncoder class from the sklearn.preprocessing is used to convert categorical features into a one-hot numerical format. This allows the model to work with categorical data by representing each category as a binary vector.

#### Code Explanation

In [101]:
# importing the sckit-learn Library
import sklearn as sk

# Imports the OneHotEncoder class for data preprocessing
from sklearn.preprocessing import OneHotEncoder  

In [102]:
from sklearn.compose import ColumnTransformer

In [103]:
# selecting our categorical variables for transformation
categorical_features = nigeria_houses.select_dtypes(include=['object']).columns.tolist()

In [105]:
# calling the variable "categorical_features"
categorical_features

['title', 'town', 'state']

In [106]:
# Applying the "one-hot" encoding method in the transformation of our categorical variable.
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough"
)

In [107]:
transformed_x = transformer.fit_transform(x)  # The "x" here being the name of the name we gave to the variable housing our "Features" above. 

In [108]:
# calling the variable used for storing our Transformed categorical variables
transformed_x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 170282 stored elements and shape (24326, 228)>

In [109]:
# Converting the transformed variable to a data frame
pd.DataFrame(transformed_x)

Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
...,...
24321,<Compressed Sparse Row sparse matrix of dtype ...
24322,<Compressed Sparse Row sparse matrix of dtype ...
24323,<Compressed Sparse Row sparse matrix of dtype ...
24324,<Compressed Sparse Row sparse matrix of dtype ...


#### METHOD 2

The second method of categorical variables into machine learning algorithn format is the **get dummies** methos or format.
this method will be applied on our previous example below:

In [111]:
dummies = pd.get_dummies(nigeria_houses[["title","town","state"]]) # selecting our categorical variables using another method i.e calling the names
                                                                # of the columns housing the categorical variables individually. and subsequently
                                                                # converting then to machine format using the ".get_dummies()" function.

# calling our "dummies" variable with the ".head()" function
dummies.head()


Unnamed: 0,title_Block of Flats,title_Detached Bungalow,title_Detached Duplex,title_Semi Detached Bungalow,title_Semi Detached Duplex,title_Terraced Bungalow,title_Terraced Duplexes,town_Aba,town_Abeokuta North,town_Abeokuta South,...,state_Kogi,state_Kwara,state_Lagos,state_Nasarawa,state_Niger,state_Ogun,state_Osun,state_Oyo,state_Plateau,state_Rivers
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [112]:
dummies = pd.get_dummies(nigeria_houses[["title","town","state"]])

# converting the output from the ".get_dummies()" function, from booleon i.e True, False to 0, 1 inteager
dummies = dummies.astype(int)

In [113]:
# calling our newly converted dummies with the .head() function
dummies.head()

Unnamed: 0,title_Block of Flats,title_Detached Bungalow,title_Detached Duplex,title_Semi Detached Bungalow,title_Semi Detached Duplex,title_Terraced Bungalow,title_Terraced Duplexes,town_Aba,town_Abeokuta North,town_Abeokuta South,...,state_Kogi,state_Kwara,state_Lagos,state_Nasarawa,state_Niger,state_Ogun,state_Osun,state_Oyo,state_Plateau,state_Rivers
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [114]:
# just checking our initia Data Frame.
nigeria_houses

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price_millions
0,6,5,5,4,Detached Duplex,Mabushi,Abuja,450.0
1,4,5,5,4,Terraced Duplexes,Katampe,Abuja,800.0
2,4,5,5,4,Detached Duplex,Lekki,Lagos,120.0
3,4,4,5,6,Detached Duplex,Ajah,Lagos,40.0
4,4,4,5,2,Semi Detached Duplex,Lekki,Lagos,75.0
...,...,...,...,...,...,...,...,...
24321,2,2,2,4,Block of Flats,Kabusa,Abuja,15.0
24322,4,5,5,4,Block of Flats,Ado-Odo/Ota,Ogun,25.0
24323,4,5,5,4,Detached Duplex,Lekki,Lagos,68.0
24324,3,4,4,3,Block of Flats,Victoria Island (VI),Lagos,78.0
