# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the Boston housing CSV from its remote location into a DataFrame
boston_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
boston_df = pd.read_csv(filepath_or_buffer=boston_url)

# 1. Boxplot of MEDV (median value of owner-occupied homes)
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=boston_df, y='MEDV')
ax.set_title('Boxplot for Median Value of Owner-Occupied Homes (MEDV)')
ax.set_ylabel('Median Value (in $1000s)')
plt.show()
# Explanation: The box marks the median and interquartile range of MEDV, while
# points drawn beyond the whiskers flag potential outliers in house prices.

# 2. Count of observations per value of the Charles River indicator (CHAS)
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=boston_df, x='CHAS')
ax.set_title('Homes near Charles River (CHAS)')
ax.set_xlabel('CHAS (1: near river, 0: otherwise)')
ax.set_ylabel('Count')
plt.show()
# Explanation: Each bar is the number of rows with that CHAS value, so the plot
# shows how many observations are near the river versus not, a factor that may matter for pricing.

# 3. Boxplot for MEDV vs AGE (AGE discretized into three groups)
# AGE is a continuous column; used directly as the categorical axis it would
# produce one box per distinct value. Bin it first so that each box summarizes
# a meaningful group. Open-ended outer edges keep every non-null value in a bin.
age_group = pd.cut(
    boston_df['AGE'],
    bins=[-float('inf'), 35, 70, float('inf')],
    labels=['AGE <= 35', '35 < AGE <= 70', 'AGE > 70'],
)
plt.figure(figsize=(8, 6))
sns.boxplot(x=age_group, y=boston_df['MEDV'])
plt.title('MEDV vs. Age of Buildings')
plt.xlabel('AGE group (proportion of buildings built before 1940)')
plt.ylabel('Median Value of Homes (MEDV)')
plt.show()
# Explanation: Each box shows the MEDV distribution within one AGE band, which lets us
# compare prices across areas with a low, medium, and high proportion of pre-1940 buildings.

# 4. Scatter plot: NOX on the x-axis against INDUS on the y-axis
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(data=boston_df, x='NOX', y='INDUS')
ax.set_title('NOX vs Proportion of Non-Retail Business Acres per Town (INDUS)')
ax.set_xlabel('Nitric Oxide Concentrations (NOX)')
ax.set_ylabel('Proportion of Non-Retail Business Acres (INDUS)')
plt.show()
# Explanation: Each point is one observation; the overall pattern of the cloud shows
# how pollution levels (NOX) vary together with industrial land use (INDUS).

# 5. Histogram (30 bins, with KDE overlay) of the pupil-to-teacher ratio (PTRATIO)
plt.figure(figsize=(8, 6))
ax = sns.histplot(data=boston_df, x='PTRATIO', bins=30, kde=True)
ax.set_title('Distribution of Pupil-to-Teacher Ratio (PTRATIO)')
ax.set_xlabel('Pupil-to-Teacher Ratio')
ax.set_ylabel('Frequency')
plt.show()
# Explanation: The bars show how often each range of pupil-to-teacher ratios occurs
# in the data, and the KDE curve gives a smoothed view of the same distribution.

# Simple linear regression: predict MEDV from the single feature RM
y = boston_df['MEDV']
X = boston_df[['RM']]  # double brackets keep X two-dimensional, as the estimator expects

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows: Mean Squared Error, then R-Squared
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, y_pred)
print(f'R-Squared: {r2}')

# 6. Held-out observations with the fitted regression line drawn on top
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test, y_test, color='blue', label='Actual Prices')
ax.plot(X_test, y_pred, color='red', label='Predicted Prices')
ax.set_title('Rooms vs. House Price (Regression Line)')
ax.set_xlabel('Average Number of Rooms (RM)')
ax.set_ylabel('Median House Price (MEDV)')
ax.legend()
plt.show()
# Explanation: Blue points are the actual test-set prices and the red line is the model's
# prediction. An upward-sloping line means predicted price rises with the number of rooms.

# 7. Slope of MEDV regressed on DIS (weighted distance to employment centers)
# This model is fitted on every row of the dataset (no train/test split).
X_DIS = boston_df[['DIS']]
y_MEDV = boston_df['MEDV']
model_dis = LinearRegression().fit(X_DIS, y_MEDV)
print(f'Coefficient for DIS (Weighted distance to employment centers): {model_dis.coef_[0]}')
# Explanation: The coefficient is the change in predicted MEDV per one-unit increase in DIS.
# A positive value means homes farther from employment centers tend to have higher prices;
# a negative value indicates the opposite.

# 8. Null and Alternative Hypothesis
# Null Hypothesis (H₀): There is no significant relationship between the number of rooms and house prices.
# Alternative Hypothesis (H₁): There is a significant relationship between the number of rooms and house prices.

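# Conclusion: The fitted regression shows a positive relationship between the number of rooms and house price,
# which is consistent with the alternative hypothesis. Note that MSE and R-squared alone do not test significance;
# formally rejecting H₀ requires a p-value for the RM slope (e.g., from an OLS regression summary).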
