In [None]:
pip install plotly

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns #nice statics plots
import matplotlib.pyplot as plt #traditional plots
import plotly.express as px #dynamic plots
#introduce matplotlib while coding in python notebook
%matplotlib inline 

## Bivariate analysis

If we analyze data by taking two variables/columns into consideration from a dataset, it is known as Bivariate Analysis.

### Numeric-Numeric Analysis:
Analyzing two numeric variables from a dataset together is known as numeric-numeric analysis. 
We can analyze it in three different ways.
 - Scatter Plot
 - Pair Plot
 - Correlation Matrix

#### Scatter Plot

Let’s take the numeric columns *Alcohol*, *Mg* and *Proline*, together with the categorical column *Category*, from our dataset and see what we can infer from scatter plots of *Alcohol* vs. *Mg* and *Proline* vs. *Alcohol*, using *Category* to distinguish the points.

In [None]:
# Read the wines data set from a semicolon-separated .csv file.
# Row 0 provides the column names; column 0 is used as the row index.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
data = pd.read_csv(
    "C:/Users/Eugenio_Py/Desktop/Notebooks/datasets/wines.csv",
    index_col=0,
    header=0,
    sep=";",
)

In [None]:
# Preview the first rows instead of displaying the whole DataFrame.
data.head()

In [None]:
# Scatter plot of Alcohol (x-axis) against Mg (y-axis) with matplotlib.
plt.scatter(data["Alcohol"], data["Mg"])
# Label the axes so the figure can be read on its own.
plt.xlabel("Alcohol")
plt.ylabel("Mg")
plt.show()

In [None]:
# Scatter plot of Proline (x-axis) against Alcohol (y-axis) with matplotlib.
plt.scatter(data.Proline, data.Alcohol)
# Label the axes so the figure can be read on its own.
plt.xlabel("Proline")
plt.ylabel("Alcohol")
plt.show()

In [None]:
# Same Proline vs. Alcohol scatter plot, drawn with seaborn.
# Exercise: try "Ash" instead of "Alcohol" on the y-axis.
sns.scatterplot(x="Proline", y="Alcohol", data=data);

In [None]:
# Seaborn scatter plot of Proline vs. Alcohol, coloured by the Category column.
sns.scatterplot(
    x="Proline",
    y="Alcohol",
    hue="Category",
    data=data,
);

In [None]:
# Filled kernel-density curves of Proline, one curve per Category value.
sns.displot(
    data=data,
    x="Proline",
    kind="kde",
    hue="Category",
    fill=True,
);

In [None]:
# Filled kernel-density curves of Alcohol, one curve per Category value.
sns.displot(
    data=data,
    x="Alcohol",
    kind="kde",
    hue="Category",
    fill=True,
);

In [None]:
# Interactive version of the Proline vs. Alcohol scatter plot, built with plotly express.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
)
fig.show()

In [None]:
# Colour the markers by the Category column.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
    color="Category",
)
fig.show()

In [None]:
# Bubble chart: marker size is taken from the "Hue" column,
# marker colour from the Category column.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
    size="Hue",
    color="Category",
)
fig.show()

In [None]:
# Colour the markers by the "Hue" column.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
    color="Hue",
)
fig.show()

In [None]:
# Encode Category twice: once as marker colour and once as marker symbol.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
    symbol="Category",
    color="Category",
)
fig.show()

In [None]:
# Same colour/symbol encoding, with the DataFrame index used as the hover title.
fig = px.scatter(
    data_frame=data,
    x="Proline",
    y="Alcohol",
    symbol="Category",
    color="Category",
    hover_name=data.index,
)
fig.show()

In [None]:
# Cosmetic changes applied to `fig`, which must already exist from an earlier cell.

# y-axis: append "%" to every tick label and draw the grid lines.
fig.update_yaxes(showgrid=True, ticksuffix="%")

# Layout: set the font family, give the legend a title, lay it out horizontally
# and anchor it by its bottom-centre point at (x=0.5, y=1).
fig.update_layout(
    legend={
        "title": "My wines",
        "orientation": "h",
        "x": 0.5,
        "xanchor": "center",
        "y": 1,
        "yanchor": "bottom",
    },
    font_family="Sans",
)
fig.show()

#### Density maps

In [None]:
# 2D density heatmap of Proline (x-axis) against Alcohol (y-axis).
fig = px.density_heatmap(
    data_frame=data,
    x="Proline",
    y="Alcohol",
)
fig.show()

#### Pair Plot
Now, let’s plot Pair Plots for the three columns we used in plotting Scatter plots. 
We’ll use the seaborn and plotly libraries for plotting Pair Plots.

In [None]:
# Pair plot of the Alcohol, Mg and Proline columns of the data DataFrame.
sns.pairplot(data=data, vars=['Alcohol', 'Mg', 'Proline'])
plt.show()

In [None]:
# Display only the three numeric columns used in the pair plots.
data[["Alcohol", "Mg", "Proline"]]

In [None]:
# Scatter matrix (plotly's counterpart of a pair plot) for Alcohol, Mg and Proline.
# plotly.express is already available as `px` from the imports cell at the top,
# so it is not imported again here.
fig = px.scatter_matrix(data.loc[:, ['Alcohol', 'Mg', 'Proline']])
fig.show()

In [None]:
# Scatter matrix over four columns, coloured by Category,
# with the DataFrame index used as the hover title.
fig = px.scatter_matrix(
    data_frame=data,
    dimensions=["Alcohol", "Mg", "Proline", "Ash"],
    hover_name=data.index,
    color="Category",
)
fig.show()

In [None]:
# Titled scatter matrix with Category encoded as both colour and symbol.
fig = px.scatter_matrix(
    data_frame=data,
    dimensions=["Alcohol", "Mg", "Proline", "Ash"],
    title="Scatter matrix of wines data set",
    hover_name=data.index,
    symbol="Category",
    color="Category",
)
# Hide the diagonal panels.
fig.update_traces(diagonal_visible=False)
fig.show()

#### Correlation Matrix
Scatter and pair plots show only two variables at a time (one on the x-axis and one on the y-axis), so it is difficult to see the relationships among three or more numerical variables in a single graph. 
In those cases, we’ll use the correlation matrix.

In [None]:
# Correlation matrix of the numeric columns.
# Non-numeric columns (e.g. a text Category column) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently in DataFrame.corr() and raises instead.
data.select_dtypes(include="number").corr()

In [None]:
# Pairwise correlations restricted to Alcohol, Mg and Proline.
data.loc[:, ["Alcohol", "Mg", "Proline"]].corr()

In [None]:
# Heatmap of the correlation matrix of Alcohol, Mg, Proline and Phenols.
# The matrix is computed once and passed to the heatmap; annot=True writes
# each coefficient inside its cell.
corr_matrix = data[['Alcohol', 'Mg', 'Proline', 'Phenols']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Reds')
plt.show()

In [None]:
# Heatmap of the correlation matrix of Alcohol, Mg, Proline and Phenols,
# with the colour scale pinned to [-1, 1] via vmin/vmax.
corr_matrix = data[['Alcohol', 'Mg', 'Proline', 'Phenols']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Reds', vmin=-1, vmax=1)
plt.show()

In [None]:
# Heatmap of the correlation matrix of all numeric columns.
# Non-numeric columns (e.g. a text Category column) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently in DataFrame.corr() and raises instead.
corr_matrix = data.select_dtypes(include="number").corr()
sns.heatmap(corr_matrix, annot=True, cmap='Reds')
plt.show()

In [None]:
# Heatmap of the correlation matrix of all numeric columns, on a larger figure.
# Non-numeric columns (e.g. a text Category column) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently in DataFrame.corr() and raises instead.
corr_matrix = data.select_dtypes(include="number").corr()

# Create the figure first so the heatmap is drawn at the requested size;
# plt is already imported, so no extra import of `figure` is needed.
plt.figure(figsize=(8, 6), dpi=80)  # try dpi = 120

sns.heatmap(corr_matrix, annot=True, cmap='Reds')
plt.show()

### Numeric - Categorical Analysis

Analyzing one numeric variable together with one categorical variable from a dataset is known as numeric-categorical analysis. 

We analyze them mainly using mean, median, and boxplots.
Let’s take *Alcohol* and *Category* columns from our dataset.

First check for mean value using **groupby**

In [None]:
# Mean Alcohol value for each Category group.
data.groupby(by="Category")["Alcohol"].agg("mean")

There is not much of a difference between the wines based on the alcohol content.

Let’s calculate the median

In [None]:
# Median Alcohol value for each Category group.
data.groupby(by="Category")["Alcohol"].agg("median")

Judging by both the mean and the median, the alcohol content differs between some of the wine categories. 
Let’s plot the boxplot for them and check the behavior.

In [None]:
# Box plot of Alcohol for each Category.
# Alcohol is the variable summarised by the mean/median cells and discussed in
# the surrounding text, so it is the one plotted here.
sns.boxplot(data=data, x='Category', y='Alcohol')
plt.show()

In [None]:
# Interactive plotly box plot of Alcohol per Category, one colour per category.
# Alcohol is the variable summarised by the mean/median cells and discussed in
# the surrounding text, so it is the one plotted here.
fig = px.box(data, x='Category', y='Alcohol', color="Category")
fig.show()

As we can see, when we plot the Box Plot, it paints a very different picture compared to mean and median. 

The interquartile range (IQR) of Grignolino lies below both the Barolo median and the Barbera box.

This is how we analyze numeric-categorical variables: we use the mean, the median, and box plots to draw conclusions.

## Multivariate Analysis

Making a 3D plot

### 3D Scatter plot

In [None]:
# 3D scatter plot of Alcohol (x), Mg (y) and Proline (z), coloured by Category,
# with the DataFrame index used as the hover title.
# plotly.express is already available as `px` from the imports cell at the top,
# so it is not imported again here.
fig = px.scatter_3d(
    data,
    x='Alcohol',
    y='Mg',
    z='Proline',
    hover_name=data.index,
    color='Category',
)
fig.show()