In [9]:
# Standard library
import ssl
import sys

# Third-party.
# matplotlib has to be installed in the kernel's own environment (for example
# by running `%pip install matplotlib` once in a separate cell). Appending a
# pip requirement string to sys.path does not make a package importable, so
# that workaround has been removed.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this turns off HTTPS certificate verification for the whole
# process, which exposes every later download to tampering. It is presumably a
# workaround for certificate errors while fetching the dataset -- prefer fixing
# the local certificate store and deleting this line.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the California Housing dataset (downloaded and cached on first use)
data = fetch_california_housing()

# Convert to a pandas DataFrame: one column per feature plus the target
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['Target'] = data['target']


ModuleNotFoundError: No module named 'matplotlib'

In [3]:
# Explore the Data
# Statistical summary of every numeric column
print(df.describe())

# Preview the first rows
print(df.head())

# Get metadata (dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report directly and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

# Preprocess the Data
# Check for missing data
print(df.isnull().sum())

# Categorical Variables
# Simulate a categorical variable by categorizing the 'HouseAge' column.
# The bin edges are expressed in years, so the binning has to run BEFORE the
# features are min-max scaled: after scaling every age lies in [0, 1] and
# would fall into the first bin.
bins = [0, 10, 30, float('inf')]
labels = ['New', 'Old', 'Very Old']
df['HouseAgeCategory'] = pd.cut(
    df['HouseAge'],
    bins=bins,
    labels=labels,
    include_lowest=True,  # keep an age of exactly 0 in 'New' instead of NaN
)

# Normalize continuous variables (each feature column is rescaled to [0, 1])
scaler = MinMaxScaler()
df[data.feature_names] = scaler.fit_transform(df[data.feature_names])

# Apply one-hot encoding to the simulated categorical column
df = pd.get_dummies(df, columns=['HouseAgeCategory'])

# Correlations
# Create a correlation matrix from the numeric and boolean columns only, so a
# non-numeric column (e.g. a binned/categorical one) cannot break the call.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Visualize the correlation matrix, labelling both axes with the column names
# so each cell can be matched to a pair of features.
plt.matshow(correlation_matrix)
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix')
plt.colorbar()
plt.show()

# Feature Engineering
# Interaction feature: element-wise product of the 'MedInc' and 'AveRooms'
# columns as they currently stand in df.
df['MedInc_AveRooms'] = df['MedInc'].mul(df['AveRooms'])

# Part II - Analyze the relationship between Property Age and Price
# Boolean Indexing for Age Groups
# New: Houses aged 10 years or less.
# Old: Houses aged between 11 and 30 years.
# Very Old: Houses aged more than 30 years.
#
# df['HouseAge'] was min-max scaled to [0, 1] during preprocessing, so the
# year thresholds cannot be applied to it (every row would count as "New").
# Rebuild the original ages from the loaded dataset instead; the frame is
# constructed the same way as df, so the row index lines up.
house_age_years = pd.DataFrame(
    data['data'], columns=data['feature_names']
)['HouseAge']

new_houses = df[house_age_years <= 10]
old_houses = df[(house_age_years > 10) & (house_age_years <= 30)]
very_old_houses = df[house_age_years > 30]

# Plotting Histograms
# Use matplotlib to plot a histogram that compares property age with its median value
plt.hist([new_houses['Target'], old_houses['Target'], very_old_houses['Target']],
         bins=20, alpha=0.5, label=['New Houses', 'Old Houses', 'Very Old Houses'])
plt.legend()
plt.xlabel('Median House Value')
plt.ylabel('Frequency')
plt.title('Property Age vs. Median House Value')
plt.show()

# Part III - Analyze the Neighborhood Income Impact on Prices
# (the analysis below uses median income, not a crime rate)
# Creating Binned Income
# Divide median income into 5 equal-width bins. 'MedInc' has already been
# min-max scaled, so the bin edges are in normalized units.
df['MedIncBins'] = pd.cut(df['MedInc'], bins=5)

# Grouping by Binned Income
# Group by the binned income and find the average Target.
# observed=False is spelled out so every bin is reported regardless of the
# pandas version's default for categorical group keys.
grouped_data = df.groupby('MedIncBins', observed=False)['Target'].mean()

# Plotting Line Plot
# Visualize the grouped data with a line plot
grouped_data.plot(kind='line', marker='o')
plt.xlabel('Median Income Bins')
plt.ylabel('Average House Value')
plt.title('Neighborhood Income Impact on House Prices')
plt.show()


NameError: name 'df' is not defined