# Step 1: Install Libraries

In [None]:
!pip install matplotlib seaborn



# Step 2: Import Libraries

In [None]:
import pandas as pd
import numpy as np

import math

from io import StringIO

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KernelDensity

import matplotlib.pyplot as plt
import seaborn as sns

# Step 3: Get raw Dataset

In [None]:
# Placeholder for the training data: paste the CSV text (header row included)
# between the triple quotes. Step 4 parses this string with pd.read_csv.
raw_train_set = """"""

In [None]:
# Placeholder for the test data: paste the CSV text (header row included)
# between the triple quotes. Step 4 parses this string with pd.read_csv.
raw_test_set = """"""

# Step 4: Convert Raw Data to Pandas DataFrames

In [None]:
# Lift the row-display limit so DataFrames are shown without truncation.
pd.set_option("display.max_rows", None)

# Parse each in-memory CSV text into its own DataFrame (train first, then test).
train_set, test_set = (
    pd.read_csv(StringIO(csv_text)) for csv_text in (raw_train_set, raw_test_set)
)

# Step 5: Divide Dataset into Train_set and Validation_set

In [None]:
# Split a freshly parsed copy of the raw training data instead of `train_set`
# itself: reassigning `train_set` from its own split would shrink it again on
# every re-run of this cell. On a first run the result is unchanged.
# 80% of the rows go to training, 20% to validation; the seed fixes the shuffle.
train_set, validation_set = train_test_split(
    pd.read_csv(StringIO(raw_train_set)), test_size=0.2, random_state=42
)

# Step 6: Calculate Pearson Correlation

In [None]:
def calculate_pearson_correlation(feature_list, target_list):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Computes ``sum(dx * dy) / sqrt(sum(dx**2) * sum(dy**2))`` where ``dx`` and
    ``dy`` are the deviations of each value from its sequence mean.

    Args:
        feature_list: Sequence of numbers (the feature values).
        target_list: Sequence of numbers with the same length as
            ``feature_list`` (the target values).

    Returns:
        The correlation coefficient as a float, or ``nan`` when the
        denominator is zero (for example when either sequence is constant),
        because the coefficient is undefined in that case.

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    sample_count = len(feature_list)
    # Pairing values positionally only makes sense for equal-length inputs;
    # reject a mismatch instead of silently ignoring the surplus values.
    if sample_count != len(target_list):
        raise ValueError(
            "feature_list and target_list must have the same length "
            f"(got {sample_count} and {len(target_list)})"
        )
    if sample_count == 0:
        raise ValueError("cannot compute a correlation from empty sequences")

    feature_mean = sum(feature_list) / sample_count
    target_mean = sum(target_list) / sample_count

    feature_diff = [value - feature_mean for value in feature_list]
    target_diff = [value - target_mean for value in target_list]

    sum_multiply_diff = sum(f * t for f, t in zip(feature_diff, target_diff))
    sum_square_feature_diff = sum(f * f for f in feature_diff)
    sum_square_target_diff = sum(t * t for t in target_diff)

    denominator = math.sqrt(sum_square_feature_diff * sum_square_target_diff)
    if denominator == 0:
        # Zero variance in at least one sequence: the coefficient is undefined.
        return float("nan")
    return sum_multiply_diff / denominator

In [None]:
# One plain Python list per DataFrame column, in column order.
all_cols = [train_set[col].tolist() for col in train_set.columns]

# Correlate every column except the last with the last column (the target).
pearson_correlation = [
    calculate_pearson_correlation(feature_values, all_cols[-1])
    for feature_values in all_cols[:-1]
]

In [None]:
pearson_correlation

[0.6754163075637131,
 0.1275110626317921,
 0.1354915336600122,
 -0.0402437579970883,
 -0.027798655756735374,
 -0.03254710712812415,
 -0.12915768317543397,
 -0.04409562601510458]

# Step 7: Calculate Mutual Information with KDE

In [None]:
def mutual_information_kde(x, y, bandwidth=0.1):
    """Estimate the mutual information of ``x`` and ``y`` from kernel densities.

    Fits three ``KernelDensity`` models with the same ``bandwidth`` (one on the
    paired samples, one on ``x`` alone, one on ``y`` alone) and averages
    ``log p(x, y) - log p(x) - log p(y)`` over the observed samples.

    Args:
        x: Array-like of numeric samples (list, pandas Series, ndarray, ...).
        y: Array-like of numeric samples with as many values as ``x``.
        bandwidth: Kernel bandwidth passed to every ``KernelDensity`` model.
            It is applied to the raw values, so the estimate depends on the
            scale of ``x`` and ``y``; inputs on very different scales are not
            directly comparable unless they are rescaled first.

    Returns:
        The mean of the per-sample log-density differences.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of samples.
    """
    # np.asarray accepts any array-like, so callers are not forced to convert
    # lists or Series to ndarrays before calling; each input becomes one column.
    x = np.asarray(x, dtype=float).reshape(-1, 1)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if x.shape[0] != y.shape[0]:
        raise ValueError(
            "x and y must contain the same number of samples "
            f"(got {x.shape[0]} and {y.shape[0]})"
        )
    xy = np.hstack((x, y))

    # Joint density evaluated at the observed (x, y) pairs.
    kde_joint = KernelDensity(bandwidth=bandwidth).fit(xy)
    log_prob_joint = kde_joint.score_samples(xy)

    # Marginal densities evaluated at the same observations.
    kde_x = KernelDensity(bandwidth=bandwidth).fit(x)
    log_prob_x = kde_x.score_samples(x)

    kde_y = KernelDensity(bandwidth=bandwidth).fit(y)
    log_prob_y = kde_y.score_samples(y)

    mi = np.mean(log_prob_joint - log_prob_x - log_prob_y)
    return mi

In [None]:
# Score every column except the last against the last column (the target).
mutual_information = [
    mutual_information_kde(np.array(feature_values), np.array(all_cols[-1]))
    for feature_values in all_cols[:-1]
]

In [None]:
mutual_information

[0.3825063785343686,
 0.07163783891708123,
 0.13469740882072623,
 0.028059220216915766,
 1.2674582645373336,
 0.10463400815830431,
 0.2302446607972519,
 0.31016845897890644]

# Step 8: Feature Selection

In [None]:
# How many top-ranked features each criterion keeps.
n_selected = 4

# Feature names: every column except the last one (the target).
cols = list(train_set.columns)[0:-1]

# Rank Pearson scores by magnitude: a strong negative correlation is as useful
# to a linear model as a strong positive one, so the sign must not push a
# feature to the bottom of the ranking.
pearson_correlation_copy = [abs(value) for value in pearson_correlation]
mutual_information_copy = mutual_information.copy()


def select_top_features(names, scores, k):
    """Return the ``k`` names with the highest scores, best first.

    Equal scores keep their original relative order. Neither input is
    modified, so the cell can be re-run without changing its result.

    Raises:
        ValueError: If ``names`` and ``scores`` differ in length.
    """
    if len(names) != len(scores):
        raise ValueError(
            f"expected one score per feature (got {len(names)} names "
            f"and {len(scores)} scores)"
        )
    ranked = sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked[:k]]


pearson_correlation_features = select_top_features(cols, pearson_correlation_copy, n_selected)
mutual_information_features = select_top_features(cols, mutual_information_copy, n_selected)

In [None]:
pearson_correlation_features

['MedInc', 'AveRooms', 'HouseAge', 'Population']

In [None]:
mutual_information_features

['Population', 'MedInc', 'Longitude', 'Latitude']

# Step 9: Train Linear Regression Model with Pearson Correlation Features

In [None]:
# Predictors are the Pearson-selected columns; the response is 'MedHouseVal'.
features_train_pc, target_train_pc = (
    train_set[pearson_correlation_features],
    train_set['MedHouseVal'],
)
features_validation_pc, target_validation_pc = (
    validation_set[pearson_correlation_features],
    validation_set['MedHouseVal'],
)

In [None]:
# Fit a linear regression on the Pearson-selected training features.
model_pc = LinearRegression()
model_pc.fit(X=features_train_pc, y=target_train_pc)

In [None]:
# Predictions of the Pearson-feature model on the held-out validation rows.
target_validation_predict_pc = model_pc.predict(X=features_validation_pc)

# Step 10: Train Linear Regression Model with Mutual Information Features

In [None]:
# Predictors are the mutual-information-selected columns; the response is 'MedHouseVal'.
features_train_mi, target_train_mi = (
    train_set[mutual_information_features],
    train_set['MedHouseVal'],
)
features_validation_mi, target_validation_mi = (
    validation_set[mutual_information_features],
    validation_set['MedHouseVal'],
)

In [None]:
# Fit a linear regression on the mutual-information-selected training features.
model_mi = LinearRegression()
model_mi.fit(X=features_train_mi, y=target_train_mi)

In [None]:
# Predictions of the mutual-information-feature model on the held-out validation rows.
target_validation_predict_mi = model_mi.predict(X=features_validation_mi)

# Step 11: Calculate measures for PC and MI

In [None]:
# Validation error and goodness of fit for the Pearson-feature model.
mse_pc = mean_squared_error(
    y_true=target_validation_pc, y_pred=target_validation_predict_pc
)
r2_pc = r2_score(
    y_true=target_validation_pc, y_pred=target_validation_predict_pc
)

In [None]:
# Validation error and goodness of fit for the mutual-information-feature model.
mse_mi = mean_squared_error(
    y_true=target_validation_mi, y_pred=target_validation_predict_mi
)
r2_mi = r2_score(
    y_true=target_validation_mi, y_pred=target_validation_predict_mi
)

In [None]:
mse_pc

0.6977652123033251

In [None]:
r2_pc

0.4794349527562398

In [None]:
mse_mi

0.6043053087589882

In [None]:
r2_mi

0.5491603535731632

# Step 12: Mutual Information Features Work Better (Lower MSE, Higher R²) — Predict on the Test Set

In [None]:
# Score the test rows with the mutual-information model and store the
# predictions next to the inputs in a new 'MedHouseValPredicted' column.
features_test_mi = test_set.loc[:, mutual_information_features]
target_test_predict_mi = model_mi.predict(X=features_test_mi)
test_set['MedHouseValPredicted'] = target_test_predict_mi

In [None]:
test_set.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseValPredicted
0,2.6667,52.0,5.652174,1.038647,604.0,2.917874,37.34,-121.87,2.000043
1,6.6345,52.0,5.603093,0.85567,650.0,3.350515,37.34,-121.87,3.47001
2,2.4306,39.0,4.899209,1.06917,1990.0,3.932806,37.34,-121.87,1.868526
3,2.6312,52.0,3.819383,1.022026,1264.0,5.568282,37.34,-121.88,1.971099
4,2.9222,27.0,3.664921,1.084817,3019.0,3.161257,37.35,-121.87,2.013218
5,2.4968,46.0,4.374622,0.996979,1094.0,3.305136,37.35,-121.86,1.911364
6,2.3277,52.0,3.587558,0.976959,1580.0,3.640553,37.35,-121.87,1.838395
7,2.5947,29.0,3.217816,0.991458,6234.0,3.803539,37.34,-121.86,1.789614
8,3.0089,40.0,5.246544,1.170507,1718.0,3.958525,37.34,-121.86,2.086455
9,2.375,41.0,2.146739,0.891304,549.0,2.983696,37.33,-121.88,1.903743


# Step 13: Export the Predictions to an Excel File

In [None]:
# Write the test rows, including the 'MedHouseValPredicted' column added in
# Step 12, to an .xlsx file in the current working directory; index=False
# leaves the row index out of the sheet.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed
# for this call — confirm it is available in the target environment.
test_set.to_excel('MutualInformationResult.xlsx', index=False)