In [1]:
# import necessary algorithms
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load in the required dataset
data_clf = pd.read_csv('survey_lung_cancer.csv')
data_reg = pd.read_csv('housing.csv')

# CLASSIFICATION PROBLEM

In [3]:
data_clf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

## PREPROCESSING

In [4]:
# import the algorithm
from sklearn.preprocessing import LabelBinarizer
# instatiate the algorithm
lb = LabelBinarizer()

In [12]:
data_clf['GENDER'] = lb.fit_transform(data_clf['GENDER'])
data_clf['LUNG_CANCER'] = lb.fit_transform(data_clf['LUNG_CANCER'])

In [13]:
# split the dataset
X = data_clf.drop('LUNG_CANCER', axis = 1)
y = data_clf['LUNG_CANCER']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state= 2022)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(247, 15)
(62, 15)
(247,)
(62,)


## MODEL BUILDING

In [15]:
# import the algorithm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [16]:
data_clf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   GENDER                 309 non-null    int32
 1   AGE                    309 non-null    int64
 2   SMOKING                309 non-null    int64
 3   YELLOW_FINGERS         309 non-null    int64
 4   ANXIETY                309 non-null    int64
 5   PEER_PRESSURE          309 non-null    int64
 6   CHRONIC DISEASE        309 non-null    int64
 7   FATIGUE                309 non-null    int64
 8   ALLERGY                309 non-null    int64
 9   WHEEZING               309 non-null    int64
 10  ALCOHOL CONSUMING      309 non-null    int64
 11  COUGHING               309 non-null    int64
 12  SHORTNESS OF BREATH    309 non-null    int64
 13  SWALLOWING DIFFICULTY  309 non-null    int64
 14  CHEST PAIN             309 non-null    int64
 15  LUNG_CANCER            309 non-null    i

In [17]:
# instantantiate the model
rfc = RandomForestClassifier()
rfg = RandomForestRegressor()
# train the model
model = rfc.fit(X_train, y_train)
model_1 = rfg.fit(X_train, y_train)
# evaluate the model
y_pred = rfc.predict(X_test)
y_pred_1 = rfg.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [19]:
confusion_matrix(y_test, y_pred)

array([[ 5,  2],
       [ 1, 54]], dtype=int64)

In [20]:
accuracy_score(y_test, y_pred)

0.9516129032258065

In [None]:
# MAE,MSE and RMSE

# REGRESSION PROBLEM

In [22]:
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [46]:
data_reg['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

## PREPROCESSING

In [23]:
from sklearn.preprocessing import LabelEncoder
# instantiate the model
lab_enc = LabelEncoder()

In [24]:
data_reg['ocean_proximity'] = lab_enc.fit_transform(data_reg['ocean_proximity'])

In [25]:
data_reg['ocean_proximity'].value_counts()

0    9136
1    6551
4    2658
3    2290
2       5
Name: ocean_proximity, dtype: int64

In [26]:
data_reg.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [27]:
data_reg['total_bedrooms'] = data_reg['total_bedrooms'].fillna(data_reg['total_bedrooms'].mean())

In [28]:
# Evaluate your dataset again
data_reg.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [29]:
data_reg.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3


# MODEL SELECTION

In [30]:
X = data_reg.drop('ocean_proximity', axis = 1)
y = data_reg['ocean_proximity']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2022)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(16512, 9)
(4128, 9)
(16512,)
(4128,)


In [32]:
data_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  int32  
dtypes: float64(9), int32(1)
memory usage: 1.5 MB


## Random Forest Regressor

In [51]:
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
ada = AdaBoostRegressor(n_estimators=200)
bag = BaggingRegressor()
rfg = RandomForestRegressor(n_estimators = 500, criterion= 'mae')
grd = GradientBoostingRegressor()

In [None]:
# train the model here
rfg.fit(X_train, y_train)
# predict
y_pred = rfg.predict(X_test)

In [None]:
# Evaluate 
from sklearn.metrics import mean_squared_error
import math

In [None]:
mse = mean_squared_error(y_test, y_pred)

rmse = math.sqrt(mse)


In [None]:
rmse