# Load Dataset

In [1]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv")

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
#chack missing value in the dataset
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Data Preparation

In [6]:
#fill missing value with 0
df = df.fillna(0)

In [7]:
#create new features
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [8]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [9]:
#copy the data if the data will use for other method 
df_reg = df.copy()

# Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [10]:
import statistics
mode = statistics.mode(df['ocean_proximity'])
mode

'<1H OCEAN'

the most frequent value from ocean_proximity column is '<1H OCEAN'

# Question 2
- Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [11]:
#separate both numerical columns and categorical columns

numerical = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'rooms_per_household',
'bedrooms_per_room',
'population_per_household']

categorical=['ocean_proximity']

In [None]:
#chack correlation between 2 variable

In [12]:
df[numerical].corrwith(df['households'])

latitude                   -0.071035
longitude                   0.055310
housing_median_age         -0.302916
total_rooms                 0.918484
total_bedrooms              0.966507
population                  0.907222
households                  1.000000
median_income               0.013033
median_house_value          0.065843
rooms_per_household        -0.080598
bedrooms_per_room           0.059818
population_per_household   -0.027309
dtype: float64

In [13]:
df[numerical].corrwith(df['total_rooms'])

latitude                   -0.036100
longitude                   0.044568
housing_median_age         -0.361262
total_rooms                 1.000000
total_bedrooms              0.920196
population                  0.857126
households                  0.918484
median_income               0.198050
median_house_value          0.134153
rooms_per_household         0.133798
bedrooms_per_room          -0.174583
population_per_household   -0.024581
dtype: float64

In [None]:
the two features that have the biggest correlation in this dataset is total_bedrooms and households with value by 0.966507 

Make median_house_value binary
We need to turn the median_house_value variable from numeric into binary.
Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.
Split the data
Split your data in train/val/test sets, with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value (median_house_value) is not in your dataframe.

In [14]:
uu = []
above_average=statistics.mean(df['median_house_value'])

for i in range(len(df)):
    if df['median_house_value'][i] > above_average:
        uu.append(1)
    else:
        uu.append(0)
        
df['median_house_value'] = uu

In [15]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [18]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# Question 3
- Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
- What is the value of mutual information?
- Round it to 2 decimal digits using round(score, 2)

Mutual information is a concept from information theory, which measures how much we can 
learn about one variable if we know the value of another

In [19]:
from sklearn.metrics import mutual_info_score

def mutual_info_house_value_score(x):
    return round(mutual_info_score(x, y_train),2)

In [20]:
df_train[categorical].apply(mutual_info_house_value_score)

ocean_proximity    0.1
dtype: float64

mutual informatin score based on the dataset approximately 0.1

# Question 4
- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [21]:
#OneHotEncoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
df_train['ocean_proximity'] = ohe.fit_transform(df_train[categorical].values)

In [22]:
#Training the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(df_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [23]:
#one hot encoding categorical value in the validation dataset
df_val['ocean_proximity'] = ohe.fit_transform(df_val[categorical].values)

In [24]:
y_pred = model.predict_proba(df_val)[:, 1]

In [25]:
accuracy = round(accuracy_score(y_val, y_pred>= 0.5),2)
accuracy

0.82

Accuracy based on my model = 0.82
Because my answer can't appear in the option, i choose for accuracy on the validation dataset with model that get  approximately 0.84

# Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?

options:
- total_rooms
- total_bedrooms
- population
- households

In [26]:
#Question 5

def diff_accuracy(G,i):
    #col = ['total_rooms','total_bedrooms','population','households']
    df_full_train, df_test = train_test_split(G, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
    
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    
    X=df_train
    Y=df_val
    Z=df_test
    
    y_train = X.median_house_value.values
    y_val = Y.median_house_value.values
    y_test = Z.median_house_value.values
    
    del X['median_house_value']
    del Y['median_house_value']
    del Z['median_house_value']
    
    X=X.loc[:, X.columns != i ]
    Y=Y.loc[:, Y.columns != i ]
    Z=Z.loc[:, Z.columns != i ]
    
    categorical=['ocean_proximity']
    
    X['ocean_proximity'] = ohe.fit_transform(X[categorical].values)
    Y['ocean_proximity'] = ohe.fit_transform(Y[categorical].values)
    Z['ocean_proximity'] = ohe.fit_transform(Z[categorical].values)
        
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X, y_train)

    Y['ocean_proximity'] = ohe.fit_transform(df_val[categorical].values)
    y_pred = model.predict_proba(Y)[:, 1]
    accu = accuracy_score(y_val, y_pred>= 0.5)

    return accu

In [27]:
score = []
col = ['total_rooms','total_bedrooms','population','households']
for i in col: 
    score.append(diff_accuracy(df, str(i)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ocean_proximity'] = ohe.fit_transform(X[categorical].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y['ocean_proximity'] = ohe.fit_transform(Y[categorical].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Z['ocean_proximity'] = ohe.fit_transform(Z[categorical].values)
A value is tr

In [28]:
diff_accuracy_without_total_rooms = accuracy - score[0]
diff_accuracy_without_total_bedrooms = accuracy - score[1]
diff_accuracy_without_population = accuracy - score[2]
diff_accuracy_without_households = accuracy - score[3]

In [29]:
print('diff_accuracy_without_total_rooms =',diff_accuracy_without_total_rooms)
print('diff_accuracy_without_total_bedrooms =',diff_accuracy_without_total_bedrooms)
print('diff_accuracy_without_population =',diff_accuracy_without_population)
print('diff_accuracy_without_households =',diff_accuracy_without_households)

diff_accuracy_without_total_rooms = -0.003643410852713247
diff_accuracy_without_total_bedrooms = -0.002189922480620221
diff_accuracy_without_population = -0.0029166666666666785
diff_accuracy_without_households = -0.003643410852713247


As you can see, for all difference value is negative. Based on the results, we can prove that the calculation is right
So,the smallest difference based on above: if we don't use total rooms column or households column

# Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [None]:
#Build Function rmse and ridge model

In [30]:
#rmse
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [31]:
#ridge model
def ridge(a):
    
    #load a old data before transformation binary in the median_house_value column
    df= df_reg
    
    from sklearn.model_selection import train_test_split
    df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
    
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = np.log1p(df_train['median_house_value'].values)
    y_val = np.log1p(df_val['median_house_value'].values)
    y_test = np.log1p(df_test['median_house_value'].values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    df_train['ocean_proximity'] = ohe.fit_transform(df_train[categorical].values)
    from sklearn.linear_model import Ridge
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(df_train, y_train)
    
    #one hot encoding for validation dataset
    df_val['ocean_proximity'] = ohe.fit_transform(df_val[categorical].values)
    y_pred = model.predict(df_val)
    
    #Round the RMSE scores to 3 decimal digits.
    rmse_score = round(rmse(y_val, y_pred),3)
    
    return rmse_score

In [32]:
for r in [0, 0.01, 0.1, 1, 10]:
    print('alpha with %3s' %r, ':', ridge(r))

alpha with   0 : 0.524
alpha with 0.01 : 0.524
alpha with 0.1 : 0.524
alpha with   1 : 0.524
alpha with  10 : 0.524


Based on the result, you can see with particular alpha, rmse score is similar. That's why, we will choose the smallest alpha is 0