In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

df_housing_train = pd.read_csv("../ynov-data/train_housing_train.csv")
df_housing_valid = pd.read_csv("../ynov-data/train_housing_valid.csv")
df_housing_test = pd.read_csv('../ynov-data/test_housing.csv')

In [5]:
def fix_null_values(column):
    mean_value = column.mean()
    fixed_column = column.fillna(mean_value)    
    return fixed_column

In [6]:
median_house_value = df_housing_train[['median_house_value']]
median_income = fix_null_values(df_housing_train[['median_income']])
ocean_proximity_encoded = pd.get_dummies(df_housing_train['ocean_proximity'], prefix='ocean')
rooms_per_habitant = fix_null_values(df_housing_train['total_rooms']) / fix_null_values(df_housing_train['population'])
bedrooms_per_households = fix_null_values(df_housing_train['total_bedrooms']) / fix_null_values(df_housing_train['households'])
longitude = df_housing_train[['longitude']]
latitude = df_housing_train[['latitude']]
households_per_habitant = fix_null_values(df_housing_train['population']) / fix_null_values(df_housing_train['households'])
total_rooms = fix_null_values(df_housing_train['total_rooms'])
total_bedrooms = fix_null_values(df_housing_train['total_bedrooms'])

In [7]:
X = np.column_stack((median_income, rooms_per_habitant, 
                     ocean_proximity_encoded, bedrooms_per_households, 
                     longitude, latitude, 
                     households_per_habitant, total_rooms,
                    total_bedrooms))
Y = median_house_value

model = LinearRegression()
model.fit(X, Y)

In [8]:
median_house_pred = model.predict(X)
print("Erreur quadratique moyenne (MSE) :", mean_squared_error(median_house_value, median_house_pred))
print("RMSE :", root_mean_squared_error(median_house_value, median_house_pred))
print("Score R² :", r2_score(median_house_value, median_house_pred))

Erreur quadratique moyenne (MSE) : 4951205507.875488
RMSE : 70364.80304723015
Score R² : 0.6299917085381317


In [9]:
median_house_value = df_housing_valid[['median_house_value']]
median_income = fix_null_values(df_housing_valid[['median_income']])
ocean_proximity_encoded = pd.get_dummies(df_housing_valid['ocean_proximity'], prefix='ocean')
rooms_per_habitant = fix_null_values(df_housing_valid['total_rooms']) / fix_null_values(df_housing_valid['population'])
bedrooms_per_households = fix_null_values(df_housing_valid['total_bedrooms']) / fix_null_values(df_housing_valid['households'])
longitude = df_housing_valid['longitude']
latitude = df_housing_valid['latitude']
households_per_habitant = fix_null_values(df_housing_valid['population']) / fix_null_values(df_housing_valid['households'])
total_rooms = fix_null_values(df_housing_valid['total_rooms'])
total_bedrooms = fix_null_values(df_housing_valid['total_bedrooms'])

In [10]:
X = np.column_stack((median_income, rooms_per_habitant, 
                     ocean_proximity_encoded, bedrooms_per_households, 
                     longitude, latitude, 
                     households_per_habitant, total_rooms,
                    total_bedrooms))
median_house_pred = model.predict(X)
print("Erreur quadratique moyenne (MSE) :", mean_squared_error(median_house_value, median_house_pred))
print("RMSE :", root_mean_squared_error(median_house_value, median_house_pred))
print("Score R² :", r2_score(median_house_value, median_house_pred))

Erreur quadratique moyenne (MSE) : 4754604789.6852
RMSE : 68953.64232355822
Score R² : 0.6457516431289145


In [11]:
median_income = fix_null_values(df_housing_test[['median_income']])
ocean_proximity_encoded = pd.get_dummies(df_housing_test['ocean_proximity'], prefix='ocean')
rooms_per_habitant = fix_null_values(df_housing_test['total_rooms']) / fix_null_values(df_housing_test['population'])
bedrooms_per_households = fix_null_values(df_housing_test['total_bedrooms']) / fix_null_values(df_housing_test['households'])
longitude = df_housing_test['longitude']
latitude = df_housing_test['latitude']
households_per_habitant = fix_null_values(df_housing_test['population']) / fix_null_values(df_housing_test['households'])
total_rooms = fix_null_values(df_housing_test['total_rooms'])
total_bedrooms = fix_null_values(df_housing_test['total_bedrooms'])

X = np.column_stack((median_income, rooms_per_habitant, 
                     ocean_proximity_encoded, bedrooms_per_households, 
                     longitude, latitude, 
                     households_per_habitant, total_rooms,
                    total_bedrooms))

predictions = model.predict(X)

df_submission = pd.read_csv("../ynov-data/submission.csv")
df_submission['median_house_value'] = predictions
df_submission.to_csv('../ynov-data/submission.csv', index=False)