In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:

# Load the raw housing table from a path relative to the notebook.
file_path = "./datasets/housing.csv"
df = pd.read_csv(file_path)

# Preview ten random rows. The fixed seed keeps the preview identical across
# re-runs (same seed value as the train/test splits further down).
random_items = df.sample(n=10, random_state=42)
random_items

In [4]:
# Missing-value handling: the rows containing NaNs are simply dropped.
# (An alternative would be to impute them from the other numerical columns
# within each ocean_proximity category.)
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
df = df.dropna()

# Report fully duplicated rows explicitly. A bare `df[df.duplicated()]` in the
# middle of a cell is evaluated and thrown away, so it never shows anything.
print("Duplicated rows:", int(df.duplicated().sum()))

df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


(20433, 10)

In [None]:



# --- Pairwise relationships, coloured by ocean proximity -------------------
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it (that would only leave an empty figure behind).
aspect_ratio = 1.5
sns.pairplot(data=df, hue='ocean_proximity', aspect=aspect_ratio)
plt.show()

# --- House value per ocean-proximity category ------------------------------
plt.figure(figsize=(10, 6))
sns.boxplot(x='ocean_proximity', y='median_house_value', data=df)
plt.xlabel('Areas')
plt.ylabel('Median House Value')
plt.title("Median House Value-Ocean Proximity")
plt.show()

# Share of rows per category. Printed explicitly because a bare expression in
# the middle of a cell is not displayed.
print(df['ocean_proximity'].value_counts(normalize=True))

# The categories are split into two groups so each plot compares two classes.
grouped_1H_INLAND = df[df['ocean_proximity'].isin(["<1H OCEAN", "INLAND"])]

# NOTE: despite its name, this group holds only NEAR OCEAN and NEAR BAY rows.
grouped_OCEAN_NEAR_BAY_INLAND = df[df['ocean_proximity'].isin(["NEAR OCEAN", "NEAR BAY"])]

category_groups = (grouped_1H_INLAND, grouped_OCEAN_NEAR_BAY_INLAND)

# --- Joint density of house value and income, one figure per group ---------
for group in category_groups:
    plt.figure(figsize=(12, 6))
    sns.kdeplot(data=group, x="median_house_value", y="median_income", hue="ocean_proximity", fill=True)
    plt.show()

# --- Housing age distribution, one figure per group ------------------------
for group in category_groups:
    plt.figure(figsize=(10, 6))
    sns.histplot(x="housing_median_age", hue="ocean_proximity", multiple="dodge", data=group)
    # The x axis shows housing_median_age, not areas.
    plt.xlabel('Housing Median Age')
    plt.title("Median House Age Ocean Proximity")
    plt.show()

# --- Spatial distribution of house values ----------------------------------
# The style must be set before the axes are created, otherwise it is not
# applied to this plot.
sns.set(style='dark')
plt.figure(figsize=(12, 8), dpi=300)
plt.title('Spatial Distribution Median House Values')
norm = plt.Normalize(df['median_house_value'].min(), df['median_house_value'].max())
# legend=False replaces the create-then-remove legend dance; hue_norm ties the
# point colours to the same normalisation the colorbar uses.
ax = sns.scatterplot(x='longitude', y='latitude', hue='median_house_value', palette='RdYlBu',
                     hue_norm=norm, legend=False, data=df)
sm = plt.cm.ScalarMappable(cmap="RdYlBu", norm=norm)
sm.set_array([])
ax.figure.colorbar(sm, ax=ax, label='Median House Value')
plt.show()

In [None]:
# Per-category outlier inspection: for every ocean_proximity class and every
# numeric column, flag values whose z-score (computed within that class)
# exceeds the threshold and plot them in red. Nothing is removed from df.

# Z-score magnitude above which a value is flagged as an outlier.
threshold = 5

# Select columns by dtype rather than by position, so the cell keeps working
# after extra columns have been appended to df.
numeric_columns = df.select_dtypes(include='number').columns

for df_class in df.ocean_proximity.unique():
    # The category filter does not depend on the column, so do it once per class.
    dff = df[df['ocean_proximity'] == df_class]

    for column_name in numeric_columns:
        column_values = dff[column_name].values

        spread = np.std(column_values)
        if spread == 0:
            # A constant column has no outliers; skip the 0/0 division.
            outliers = np.array([], dtype=int)
        else:
            z_scores = (column_values - np.mean(column_values)) / spread
            # Positional indices (within dff) of the flagged values.
            outliers = np.where(np.abs(z_scores) > threshold)[0]

        # Summarise instead of dumping the full value array on every iteration.
        print(f"{df_class} / {column_name}: {outliers.size} outlier(s) out of {column_values.size} rows")

        plt.figure()
        plt.scatter(dff.index, column_values, color='blue', label='All Data')
        plt.scatter(dff.index[outliers], column_values[outliers], color='red', label='Outliers')

        plt.xlabel('Index')
        plt.ylabel('Data Value')
        plt.title(f"Outlier Visualization: {column_name} (category: {df_class})")
        plt.legend()
        plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder

# Encode the categorical ocean_proximity column as integer ids.
# NOTE(review): LabelEncoder imposes an arbitrary ordering on a nominal
# category; one-hot encoding may suit a linear model better.
df['ocean_proximity_id'] = LabelEncoder().fit_transform(df['ocean_proximity'])

# Features: every column except the target and the raw categorical text.
# Built from df (the DataFrame), not from the array left over by the outlier loop.
X = df.drop(['median_house_value', 'ocean_proximity'], axis=1)
Y = df['median_house_value']

# Separate scalers for features and target: refitting a single scaler on Y
# would overwrite the statistics it learned from X.
scaler_x = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_x.fit_transform(X.values)
Y_scaled = scaler_y.fit_transform(Y.values.reshape(-1, 1))

# 80/20 hold-out split; the test part is used for the final model evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y_scaled, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import numpy as np
# Polynomial degrees to compare (0 through 7).
degrees = np.arange(8)
mse_scores = []

# 5-fold cross-validation per degree. Only the training split is used here so
# that the held-out test rows play no part in choosing the degree.
for degree in degrees:
    model = Pipeline(steps=[
        ('poly', PolynomialFeatures(degree=int(degree))),
        ('regressor', LinearRegression())
    ])
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(scores)
    # The scorer returns negated MSE; flip the sign to store a plain MSE.
    mse_scores.append(-np.mean(scores))

plt.figure()
plt.plot(degrees, mse_scores, marker='o')
plt.title('Degree vs. MSE')
plt.xlabel('Degree')
plt.ylabel('MSE')
plt.show()

# Degree with the lowest mean cross-validated MSE.
best_degree = int(degrees[np.argmin(mse_scores)])

# Step f: Train the final model with the best degree and report MSE and R2 on the test data
final_model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=best_degree)),
    ('regressor', LinearRegression())
])

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Final Model Performance:")
print("MSE:", mse)
print("R2 Score:", r2)

In [None]:
# Integer-encode the category and derive per-household ratios.
df['ocean_proximity_id'] = LabelEncoder().fit_transform(df['ocean_proximity'])
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['population_per_household'] = df['population'] / df['households']

# Drop the target, the raw categorical text and the raw counts that the
# per-household ratios replace.
X = df.drop(columns=['median_house_value','ocean_proximity','total_rooms','population','households'] )
Y = df['median_house_value']

scaler_x = StandardScaler()
scaler_y= StandardScaler()
X_scaled = scaler_x.fit_transform(X)
Y_scaled = scaler_y.fit_transform(Y.values.reshape(-1, 1))

# The hold-out part must be named X_test / y_test: the evaluation cell that
# follows reads those names, and would otherwise pick up a stale split built
# from a different feature set (or fail on a fresh kernel).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y_scaled, test_size=0.2, random_state=42)

# Previous names kept as aliases so any code still using them keeps working.
X_temp, y_temp = X_test, y_test

# Print the shapes of the resulting datasets
print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


In [None]:
import numpy as np
# Polynomial degrees to compare (0 through 7).
degrees = np.arange(8)
mse_scores = []

# 5-fold cross-validation per degree. Only the training split is used here so
# that the held-out test rows play no part in choosing the degree.
for degree in degrees:
    model = Pipeline(steps=[
        ('poly', PolynomialFeatures(degree=int(degree))),
        ('regressor', LinearRegression())
    ])
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    print(scores)
    # The scorer returns negated MSE; flip the sign to store a plain MSE.
    mse_scores.append(-np.mean(scores))

plt.figure()
plt.plot(degrees, mse_scores, marker='o')
plt.title('Degree vs. MSE')
plt.xlabel('Degree')
plt.ylabel('MSE')
plt.show()

# Degree with the lowest mean cross-validated MSE.
best_degree = int(degrees[np.argmin(mse_scores)])

# Step f: Train the final model with the best degree and report MSE and R2 on the test data
final_model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=best_degree)),
    ('regressor', LinearRegression())
])

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Final Model Performance:",best_degree)
print("MSE:", mse)
print("R2 Score:", r2)
