# Housing Problem Set

In [34]:
# Uncomment to install the plotting dependencies into the kernel's environment:
# %pip install seaborn matplotlib plotly

In [35]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset from the CSV file 'dataset.csv' (path relative to the
# notebook's working directory) into a DataFrame called 'housing_df'.
housing_df = pd.read_csv("dataset.csv")

# Preview only the first rows instead of echoing the whole frame; the full
# DataFrame stays available as `housing_df` for the cells below.
housing_df.head()



In [None]:
# Display the 'SalePrice' column on its own (all rows, single column).
housing_df.loc[:, 'SalePrice']

## Histogram

In [None]:
# Histogram of the 'SalePrice' column of housing_df, drawn through the
# explicit Figure/Axes interface.
# The pyplot import lives in this cell; the later plotting cells use `plt` too.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(housing_df['SalePrice'], bins=30, color='blue', edgecolor='black')
ax.set_title('Distribution of House Sale Prices')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


## Bar Chart


In [None]:
# Bar chart of the mean 'SalePrice' per 'Neighborhood', highest average first.
average_prices = (
    housing_df
    .groupby('Neighborhood')['SalePrice']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
average_prices.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Average Sale Price by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Sale Price')
# Tilt the neighborhood names so they stay legible.
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y')
plt.show()


## Pie Chart

In [None]:
# Pie chart of the share of houses in each 'Neighborhood' of housing_df.
# Only the `top_n` most frequent neighborhoods get their own slice; all
# remaining ones are merged into a single 'Other' slice to keep the chart readable.
top_n = 4

neighborhood_counts = housing_df['Neighborhood'].value_counts()
top_categories = neighborhood_counts.nlargest(top_n)  # the top_n largest groups
other_count = neighborhood_counts.sum() - top_categories.sum()  # houses in all remaining groups

# Append 'Other' only when something was actually left out; otherwise an empty
# wedge labelled '0.0%' would be drawn when there are top_n or fewer neighborhoods.
if other_count > 0:
    neighborhood_counts_reduced = pd.concat([top_categories, pd.Series({'Other': other_count})])
else:
    neighborhood_counts_reduced = top_categories

# Pass the Axes explicitly so the pie lands on the figure created here rather
# than on whatever figure happens to be current.
fig, ax = plt.subplots(figsize=(10, 7))
neighborhood_counts_reduced.plot(
    kind='pie',
    ax=ax,
    autopct='%1.1f%%',
    startangle=140,
    colors=plt.cm.Paired.colors,
)
ax.set_title(f'Proportion of Houses in Each Neighborhood (Top {top_n} + Other)')
ax.set_ylabel('')  # Hide the y-label that pandas derives from the Series name
plt.show()


## Box Plot

In [None]:

# Box plot summarising the distribution of 'SalePrice' across all of housing_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=housing_df, y='SalePrice', ax=ax)
ax.set_title('Distribution of Sale Prices')
ax.set_ylabel('Sale Price')
ax.grid(axis='y')
plt.show()



## Scatter Plot


In [None]:
# Scatter plot of 'SalePrice' (y) against 'GrLivArea' (x) for every row of housing_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=housing_df, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('Scatter Plot of Sale Price vs. Ground Living Area')
ax.set_xlabel('Ground Living Area (sq ft)')
ax.set_ylabel('Sale Price')
ax.grid(True)
plt.show()



In [None]:

# Scatter plot of 'SalePrice' against 'GrLivArea', coloured by 'Neighborhood'.
# To keep the legend short, only the 4 neighborhoods with the most rows are kept.
counts_by_neighborhood = housing_df['Neighborhood'].value_counts()
top_neighborhoods = counts_by_neighborhood.nlargest(4).index
in_top_neighborhoods = housing_df['Neighborhood'].isin(top_neighborhoods)
filtered_df = housing_df[in_top_neighborhoods]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=filtered_df,
    x='GrLivArea',
    y='SalePrice',
    hue='Neighborhood',
    palette='tab10',
    ax=ax,
)
ax.set_title('Scatter Plot of Sale Price vs. Ground Living Area by Top Neighborhoods')
ax.set_xlabel('Ground Living Area (sq ft)')
ax.set_ylabel('Sale Price')
# Anchor the legend just outside the right edge of the axes.
ax.legend(title='Neighborhood', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()



In [None]:

# Dynamic bubble chart: interactive plot of 'GrLivArea' (x) against 'SalePrice' (y)
# where the bubble size encodes 'OverallQual', the colour encodes 'Neighborhood',
# and an animation slider steps through 'YrSold'.

import plotly.express as px


def _padded_range(values, pad_fraction=0.05):
    """Return [min, max] of `values`, widened on both sides by `pad_fraction` of the span."""
    low, high = float(values.min()), float(values.max())
    pad = (high - low) * pad_fraction
    return [low - pad, high + pad]


# Keep only the 4 neighborhoods with the highest average SalePrice
top_neighborhoods = housing_df.groupby('Neighborhood')['SalePrice'].mean().nlargest(4).index
filtered_df = housing_df[housing_df['Neighborhood'].isin(top_neighborhoods)]

# Sort by 'YrSold' so the animation frames are built in ascending year order
filtered_df = filtered_df.sort_values(by='YrSold')

# Fix both axis ranges from the whole filtered data set. Without explicit ranges
# the axes are fitted to the first animation frame only, so points from later
# years can end up outside the visible area.
fig = px.scatter(filtered_df, x='GrLivArea', y='SalePrice',
                 size='OverallQual', color='Neighborhood',
                 hover_name='Neighborhood', size_max=30,  # cap the largest bubble at 30 px
                 animation_frame='YrSold',  # one frame per year sold
                 range_x=_padded_range(filtered_df['GrLivArea']),
                 range_y=_padded_range(filtered_df['SalePrice']),
                 title='Dynamic Bubble Chart: GrLivArea vs SalePrice with OverallQual as Bubble Size and Year Sold')

# Show the interactive plot
fig.show()

## Box Plot by Neighborhood

In [None]:
# Box plot of 'SalePrice' for each 'Neighborhood' category.
# The boxes are ordered by each neighborhood's median SalePrice, highest first.
median_by_neighborhood = housing_df.groupby('Neighborhood')['SalePrice'].median()
median_saleprice = median_by_neighborhood.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=housing_df,
    x='Neighborhood',
    y='SalePrice',
    order=median_saleprice.index,
    ax=ax,
)
ax.set_title('Box Plot of Sale Price by Neighborhood (Sorted by Median in Descending Order)')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Sale Price')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
plt.show()


## Grouped Bar Chart

In [None]:

# Grouped bar chart comparing the average 'SalePrice' per 'BldgType' within a
# reduced set of neighborhoods: only the 3 neighborhoods with the most rows are kept.
top_neighborhoods = housing_df['Neighborhood'].value_counts().nlargest(3).index
is_top_neighborhood = housing_df['Neighborhood'].isin(top_neighborhoods)
filtered_df = housing_df[is_top_neighborhood]

# Mean SalePrice per (Neighborhood, BldgType) pair, pivoted so that each
# building type becomes one column (i.e. one bar per group).
mean_by_pair = filtered_df.groupby(['Neighborhood', 'BldgType'])['SalePrice'].mean()
avg_saleprice = mean_by_pair.unstack(level='BldgType')

# DataFrame.plot creates the figure itself, sized through `figsize`.
ax = avg_saleprice.plot.bar(figsize=(14, 7))
ax.set_title('Average Sale Price by Top Neighborhoods and Building Type')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Sale Price')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Building Type')
ax.grid(True)
plt.show()




## Line Chart

In [None]:

# Line chart of the average 'SalePrice' for each 'YearBuilt' value.
#
# 'YearBuilt' is grouped as-is (assumed to hold plain calendar years - TODO confirm),
# so housing_df is left unmodified: no column is overwritten with datetimes just
# to extract the year again, and re-running any other cell still sees the
# original 'YearBuilt' values.
avg_saleprice_by_year = housing_df.groupby('YearBuilt')['SalePrice'].mean()

# Create the figure only once the data is ready.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(avg_saleprice_by_year.index, avg_saleprice_by_year.values, marker='o')
ax.set_title('Average Sale Price Over Time')
ax.set_xlabel('Year Built')
ax.set_ylabel('Average Sale Price')
ax.grid(True)
plt.show()


