In [2]:
#%%
# ============================================================
# Day 1 Assignment – Statistics for Data Science
# Python version – Dennis Fok, Erasmus University Rotterdam
# ============================================================

# In this exercise we will use three packages:
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

#%%
# ------------------------------------------------------------
# Exercise 1.1 – Data Loading and Exploration
# ------------------------------------------------------------

# (a) Read the house price data from a CSV file located in the current
#     working directory (the path is relative).
# NOTE: no index column is specified, so the file's leading unnamed column
#       is kept as a regular column called "Unnamed: 0".
houseprice = pd.read_csv(filepath_or_buffer="houseprice.csv")

In [3]:
# (b) Explore the dataset
# Rich table view of the data.
display(houseprice)
# Column names. Printed explicitly: a bare expression in the middle of a cell
# is evaluated but never shown (only a cell's last expression is echoed).
print(list(houseprice))
# Column dtypes and non-null counts; .info() prints its own report.
houseprice.info()

Unnamed: 0.1,Unnamed: 0,airco,bathrms,bedrooms,driveway,fullbase,garagepl,gashw,lotsize,prefarea,price,recroom,stories
0,1,0,1,3,1,1,1,0,5850,0,42000,0,2
1,2,0,1,2,1,0,0,0,4000,0,38500,0,1
2,3,0,1,3,1,0,0,0,3060,0,49500,0,1
3,4,0,1,3,1,0,0,0,6650,0,60500,1,2
4,5,0,1,2,1,0,0,0,6360,0,61000,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,542,1,2,3,1,0,0,0,4800,0,91500,1,4
542,543,1,2,3,1,0,0,0,6000,0,94000,0,4
543,544,1,2,3,1,0,1,0,6000,0,103000,1,4
544,545,1,2,3,1,0,1,0,6000,0,105000,1,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  546 non-null    int64
 1   airco       546 non-null    int64
 2   bathrms     546 non-null    int64
 3   bedrooms    546 non-null    int64
 4   driveway    546 non-null    int64
 5   fullbase    546 non-null    int64
 6   garagepl    546 non-null    int64
 7   gashw       546 non-null    int64
 8   lotsize     546 non-null    int64
 9   prefarea    546 non-null    int64
 10  price       546 non-null    int64
 11  recroom     546 non-null    int64
 12  stories     546 non-null    int64
dtypes: int64(13)
memory usage: 55.6 KB


In [None]:




# Questions:
# - What does a single observation represent?
# - Which variables are present?
# - What are the measurement units?
# - What are the variable types?

# (c) Dimensions of the dataset
# Printed explicitly: a bare expression that is not the last line of a cell
# produces no output.
print("Shape (rows, columns):", houseprice.shape)

# (d) Basic plots and descriptive statistics
# Line plot of every column against the row index.
houseprice.plot()
plt.show()

# Pairwise scatter plots of all columns (histograms on the diagonal).
pd.plotting.scatter_matrix(houseprice)
plt.show()

# Count, mean, std, min, quartiles and max for every numeric column.
display(houseprice.describe())

# Note key points about ranges, scales, missing values, etc.

#%%
# ------------------------------------------------------------
# Exercise 1.2 – Univariate Summary Statistics
# ------------------------------------------------------------
# All results are printed explicitly: bare expressions in the middle of a
# cell are evaluated but never displayed.

# (a) Mean house price
price_mean = houseprice.price.mean()
print(f"Mean price:   {price_mean:.2f}")

# (b) Median price & comparison for skewness
# A mean clearly above the median points to a right-skewed distribution.
price_median = houseprice.price.median()
print(f"Median price: {price_median:.2f}")
print(f"Mean - median: {price_mean - price_median:.2f}")

# (c) Cheapest and most expensive house
price_min = houseprice.price.min()
price_max = houseprice.price.max()
print(f"Cheapest: {price_min}, most expensive: {price_max}")

# (d) Range and standard deviation
price_range = price_max - price_min
price_std = houseprice.price.std()
print(f"Range: {price_range}, standard deviation: {price_std:.2f}")

# (e) Frequency tables for categorical variables
for col in ["airco", "driveway", "fullbase", "gashw", "prefarea", "recroom"]:
    print(f"\n{col}")
    # Ordered by frequency (most common value first) ...
    print(houseprice[col].value_counts())
    # ... and ordered by the category value itself.
    print(houseprice[col].value_counts().sort_index())

# (f) Number of houses with 5 or 6 bedrooms
# Use the vectorised Series.sum() rather than Python's built-in sum(), and
# report the combined count that the question asks for.
n_five_bedrooms = (houseprice.bedrooms == 5).sum()
n_six_bedrooms = (houseprice.bedrooms == 6).sum()
n_five_or_six = houseprice.bedrooms.isin([5, 6]).sum()
print(f"\nHouses with 5 bedrooms: {n_five_bedrooms}")
print(f"Houses with 6 bedrooms: {n_six_bedrooms}")
print(f"Houses with 5 or 6 bedrooms: {n_five_or_six}")

#%%
# ------------------------------------------------------------
# Exercise 1.3 – Univariate Plots
# ------------------------------------------------------------

# (a) Distribution of house prices as a 20-bin histogram
houseprice["price"].hist(bins=20)
plt.gca().set(
    title="Histogram of House Prices",
    xlabel="Price",
    ylabel="Frequency",
)
plt.show()

# (b) Number of stories: first drawn as a histogram ...
houseprice["stories"].hist()
plt.gca().set(
    title="Histogram of Stories",
    xlabel="Stories",
    ylabel="Frequency",
)
plt.show()

# ... then as a bar plot of the counts per value, ordered by number of stories
houseprice["stories"].value_counts().sort_index().plot(kind="bar")
plt.gca().set(
    title="Bar Plot of Stories (sorted by index)",
    xlabel="Stories",
    ylabel="Count",
)
plt.show()

# (c) Is lotsize approximately normal? Kernel density estimate first ...
houseprice["lotsize"].plot(kind="density")
plt.gca().set_title("Density Plot of Lotsize")
plt.show()

# ... then a probability plot against the normal distribution; the default
# title set by probplot is replaced afterwards.
stats.probplot(houseprice["lotsize"], dist="norm", plot=plt)
plt.gca().set_title("QQ-Plot of Lotsize")
plt.show()

#%%
# ------------------------------------------------------------
# Exercise 1.4 – Bivariate Relationships
# ------------------------------------------------------------

# (a) Scatterplot: price vs lotsize
houseprice.plot(x="lotsize", y="price", kind="scatter")
plt.title("Price vs Lot Size")
plt.show()

# (b) Cross-tab: bedrooms vs bathrooms
# The bathroom column in this dataset is named "bathrms" (not "bathrooms").
# The table is passed to display() because a bare expression in the middle
# of a cell produces no output.
display(pd.crosstab(houseprice.bedrooms, houseprice.bathrms))

# (c) Bedrooms vs preferred area
# normalize="index" turns every bedroom row into proportions that sum to 1;
# margins=True additionally reports the overall distribution.
display(
    pd.crosstab(
        houseprice.bedrooms,
        houseprice.prefarea,
        margins=True,
        normalize="index",
    )
)

# (d) Conditional boxplots: lotsize by garage places
houseprice.boxplot(column="lotsize", by="garagepl")
plt.title("Lot Size by Garage Places")
plt.suptitle("")  # Remove automatic pandas subtitle
plt.show()

#%%
# ------------------------------------------------------------
# Exercise 2.1 – Volkswagen Prices
# ------------------------------------------------------------
# Results are displayed/printed explicitly: bare expressions in the middle of
# a cell are evaluated but never shown.

# Load data (path is relative to the current working directory)
vwgolf = pd.read_csv("vwgolf.csv")

# (b) Summaries
vwgolf.info()  # prints its own report
display(vwgolf.describe())
print("Missing values per column:")
print(vwgolf.isnull().sum())

# (c) Scatterplot: Mileage vs AskingPrice
vwgolf.plot(x="Mileage", y="AskingPrice", kind="scatter")
plt.title("Mileage vs Asking Price")
plt.show()

# (d) Scatterplot: Mileage vs (PriceNew - AskingPrice)
# PriceDiff is stored as a new column because it is reused in (h) below.
vwgolf["PriceDiff"] = vwgolf["PriceNew"] - vwgolf["AskingPrice"]
vwgolf.plot(x="Mileage", y="PriceDiff", kind="scatter")
plt.title("Mileage vs Price Difference (New - Asking)")
plt.show()

# (e) Histogram and density plot of Mileage
vwgolf.Mileage.hist(bins=20)
plt.title("Histogram of Mileage")
plt.show()

vwgolf.Mileage.plot.density()
plt.title("Density Plot of Mileage")
plt.show()

# (f) Boxplots of Mileage by Fuel type
vwgolf.boxplot(column="Mileage", by="Fuel")
plt.title("Mileage by Fuel Type")
plt.suptitle("")  # Remove automatic pandas subtitle
plt.show()

# (g) Min, median, max number of owners
owners = vwgolf["Owners"]
print(f"Owners - min: {owners.min()}, median: {owners.median()}, max: {owners.max()}")

# (h) Quantiles of PriceNew - AskingPrice
print("Quartiles of PriceDiff:")
print(vwgolf["PriceDiff"].quantile([0.25, 0.5, 0.75]))

# (i) How many diesel cars have automatic transmission?
# NOTE(review): assumes the category labels are exactly "Diesel" and
# "Automatic" — confirm with value_counts() on Fuel and Transmission.
is_diesel_automatic = (vwgolf["Fuel"] == "Diesel") & (vwgolf["Transmission"] == "Automatic")
print(f"Diesel cars with automatic transmission: {is_diesel_automatic.sum()}")

# (j) Min and max of Mileage (ignores missing values)
print(f"Mileage - min: {vwgolf.Mileage.min()}, max: {vwgolf.Mileage.max()}")

#%%
# ------------------------------------------------------------
# Exercise 2.2 – Creative Questions
# ------------------------------------------------------------

# Feel free to explore relationships, trends, or outliers.
# Example ideas:
# - Correlation matrix between numeric variables
# - Distribution of AskingPrice by Transmission
# - Scatterplot of Age vs PriceDiff

# Example:
# Pairwise correlations between the numeric columns. Passed to display()
# because a bare expression followed by more code produces no output.
display(vwgolf.corr(numeric_only=True))

vwgolf.boxplot(column="AskingPrice", by="Transmission")
plt.title("Asking Price by Transmission Type")
plt.suptitle("")  # Remove automatic pandas subtitle
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'houseprice.csv'