# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

plt.rcParams['figure.figsize'] = [10, 8]

%matplotlib inline

## House Prices dataset

In [None]:
!gdown "1686zWPyz45ZY9hw-qpTfNKeHeNjGH2aV"

hp = pd.read_csv('house-prices.csv')
hp

### Bar plots

In [None]:
# You can find all the available color palettes at https://matplotlib.org/stable/tutorials/colors/colormaps.html
# One bar per Neighborhood value, its height being the number of rows with that value.
# The returned object is an Axes (not a Figure) and is not needed afterwards, so it is
# not bound to a name; the figure is rendered explicitly like in the other cells.
sns.countplot(data=hp, x='Neighborhood', palette='Paired')
plt.show()

In [None]:
sns.kdeplot?

In [None]:
# Horizontal bars: map the categorical variable to the y axis instead of x
sns.countplot(
    y='Neighborhood',
    palette='Pastel1',
    data=hp,
)
plt.show()

In [None]:
# Counts per Neighborhood, split into one bar per value of a second variable (`hue`)
sns.countplot(
    x='Neighborhood',
    hue='Brick',
    data=hp,
)
plt.show()

### Analyze Distribution with Histogram and KDE

In [None]:
# Univariate distribution of Price as a 10-bin histogram
grid = sns.displot(data=hp, x="Price", bins=10, kind="hist")
grid.set(title='House prices')
plt.show()

In [None]:
# Same univariate histogram with finer bins (50) and explicit axis labels
grid = sns.displot(data=hp, x="Price", bins=50, kind="hist")
grid.set(title='House prices', xlabel='Price USD', ylabel='Samples count')
plt.show()

In [None]:
# Kernel density estimate of Price, with the area under the curve filled
grid = sns.displot(data=hp, x="Price", fill=True, kind="kde")
grid.set(title='House prices')
plt.show()

In [None]:
# Histogram conditioned on another variable: one color per Neighborhood
sns.displot(
    data=hp,
    kind="hist",
    x="Price",
    hue="Neighborhood",
)
plt.show()

In [None]:
# KDE conditioned on another variable: one filled curve per Neighborhood
sns.displot(
    data=hp,
    kind='kde',
    x="Price",
    hue="Neighborhood",
    fill=True,
)
plt.show()

### Box plot / Violin plot

In [None]:
# Box plot of the Price column (single vertical box)
sns.boxplot(
    y='Price',
    data=hp,
)
plt.show()

In [None]:
# Box plot of the SqFt column (single vertical box)
sns.boxplot(
    y='SqFt',
    data=hp,
)
plt.show()

In [None]:
# Violin plot of the Price column
sns.violinplot(
    y='Price',
    data=hp,
)
plt.show()

### Scatter Plot

In [None]:
# Price (x axis) against SqFt (y axis), one point per row
sns.scatterplot(
    x='Price',
    y='SqFt',
    data=hp,
)
plt.show()

In [None]:
# Price against SqFt again, with points colored by Neighborhood
sns.scatterplot(data=hp, x='Price', y='SqFt', hue='Neighborhood')
# Render explicitly, like the other plotting cells in this section, so the cell
# output is the figure rather than the Axes repr.
plt.show()

In [None]:
# Price (x axis) against Offers (y axis)
sns.scatterplot(
    x='Price',
    y='Offers',
    data=hp,
)
plt.show()

In [None]:
# Bedrooms (x axis) against Offers (y axis)
sns.scatterplot(
    x='Bedrooms',
    y='Offers',
    data=hp,
)
plt.show()

### Correlation and Heatmaps

In [None]:
# Pairwise Pearson correlation between the selected columns
cols = ['Price', 'SqFt', 'Bedrooms', 'Bathrooms', 'Offers']
corr = hp.loc[:, cols].corr(method='pearson')
corr

In [None]:
# Correlation matrix as a heatmap, using seaborn's default color scale
sns.heatmap(data=corr)
plt.show()

In [None]:
# Same heatmap with the color scale pinned to [-1, 1] and the "coolwarm" colormap
sns.heatmap(
    corr,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
plt.show()

## Titanic Dataset

In [None]:
!gdown "1ZMqBMnK0ZsOT-pBoiB-QIpT6THVr27XH"

titanic = pd.read_csv('titanic.csv')
titanic

In [None]:
# Summary statistics of the dataset (pandas' default `describe()` output)
titanic.describe()

In [None]:
# Passenger count for each value of Sex
sns.countplot(data=titanic, x='Sex')

In [None]:
# Passenger count for each value of Survived
sns.countplot(data=titanic, x='Survived')

In [None]:
# Compare survivors (Survived == 1) by gender and age
survived = titanic.loc[titanic["Survived"] == 1]

# Tall, narrow figure: ages run along the y axis, bars are split by Sex
plt.figure(figsize=(6, 24))
sns.countplot(
    data=survived,
    y='Age',
    hue='Sex',
)


In [None]:
# Same survivor counts on a wide figure, with ages along the x axis
plt.figure(figsize=(24, 6))
sns.countplot(
    data=survived,
    x='Age',
    hue='Sex',
)

**TODO:** A count plot is not a good choice for a variable like Age, which has many possible values. Can you suggest and code a better approach?


In [None]:
# Put your solution here
# Binned histogram of survivors' Age, colored by Sex
sns.displot(
    data=survived,
    x='Age',
    hue='Sex',
    kind='hist',
)

In [None]:
# Now the passengers who did not survive (Survived == 0)
no_survived = titanic.loc[titanic["Survived"] == 0]

# Tall, narrow figure: ages run along the y axis, bars are split by Sex
plt.figure(figsize=(6, 24))
sns.countplot(
    data=no_survived,
    y='Age',
    hue='Sex',
)

In [None]:
# Age distribution by gender (rows with a missing Age are dropped)
male_ages = titanic.loc[titanic['Sex'] == 'male', 'Age'].dropna()
female_ages = titanic.loc[titanic['Sex'] == 'female', 'Age'].dropna()

# Both histograms are drawn on the same axes, so they must share one y scale.
# Using stat="density" for both keeps them comparable; mixing the default
# stat="count" with a density histogram puts the two on incompatible scales.
sns.histplot(male_ages, label='male', kde=True, stat="density", linewidth=0, color='green')
sns.histplot(female_ages, label='female', kde=True, stat="density", linewidth=0, color='orange')
plt.legend()

In [None]:
# Kernel density estimate of the male ages (male_ages comes from the previous cell)
sns.kdeplot(male_ages, label='male')
# Draw the legend; without it the `label` passed above is never shown.
plt.legend()


In [None]:
# Box plots are another way to compare distributions: Age for each value of Sex
sns.boxplot(
    data=titanic,
    x='Sex',
    y='Age',
    palette='rainbow',
)

In [None]:
# Age distribution for each passenger class (Pclass)
sns.boxplot(
    data=titanic,
    x='Pclass',
    y='Age',
    palette='rainbow',
)


In [None]:
# Age per passenger class, with each class split by Survived
sns.boxplot(
    data=titanic,
    x="Pclass",
    y="Age",
    hue="Survived",
    palette='pastel',
)

In [None]:
# Same comparison grouped by Sex instead of passenger class
sns.boxplot(
    data=titanic,
    x="Sex",
    y="Age",
    hue="Survived",
    palette='Pastel1',
)

In [None]:
# Same plot with the axes swapped: Age on x, Sex on y (horizontal boxes)
sns.boxplot(
    data=titanic,
    x="Age",
    y="Sex",
    hue="Survived",
    palette='Pastel1',
)

In [None]:
# Grid of pairwise relationships between the dataset's columns (seaborn defaults)
sns.pairplot(data=titanic)