## REGPLOT : Regression plot

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
diamonds = sns.load_dataset('diamonds')

sample = diamonds.sample(n=200)

sample.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
24011,2.08,Ideal,J,SI2,62.3,55.0,12179,8.23,8.17,5.11
27539,2.02,Ideal,G,SI2,62.0,55.0,18296,8.16,8.1,5.04
7331,0.91,Very Good,G,VS1,63.4,54.0,4209,6.24,6.07,3.9
49224,0.58,Very Good,E,VS1,62.9,54.0,2082,5.31,5.34,3.35
14580,1.13,Premium,H,VS2,60.6,58.0,5885,6.81,6.72,4.1


In [5]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price')
plt.show()

<img src='./plots/reg-plot-1.png'>

### fit_reg = false

In [7]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', fit_reg=False)

plt.show()

<img src='./plots/reg-plot-2.png'>

### scatter = False

In [9]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', scatter=False)

plt.show()

<img src='./plots/reg-plot-3.png'>

### turn off ci

In [11]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', ci=False)

plt.show()

<img src='./plots/reg-plot-4.png'>

### Discrete variable

In [22]:
sample.cut.cat.codes

48317    2
3800     0
1776     3
18504    3
42554    1
        ..
4653     1
17117    3
32646    0
52929    2
3719     0
Length: 200, dtype: int8

In [13]:
temp = sample.copy()
temp.cut = sample.cut.cat.codes

In [15]:

with plt.style.context('fivethirtyeight'):
    sns.regplot(data=temp, x='cut', y='price',)

plt.show()

<img src='./plots/reg-plot-5.png'>

#### Improve the visual using jitter

In [17]:

with plt.style.context('fivethirtyeight'):
    sns.regplot(data=temp, x='cut', y='price', x_jitter=0.2)

plt.show()

<img src='./plots/reg-plot-6.png'>

## Estimate Agg | discrete var

* group all the discrete points
* calculate the agg [mean, median ...] and confidence



In [20]:
import numpy as np

with plt.style.context('fivethirtyeight'):
    sns.regplot(data=temp, x='cut', y='price', x_estimator=np.mean)

plt.show()

<img src='./plots/reg-plot-7.png'>

In [22]:
import numpy as np

with plt.style.context('ggplot'):
    sns.regplot(data=temp, x='cut', y='price', x_estimator=np.mean)

plt.show()

<img src='./plots/reg-plot-8.png'>

### poly reg | order=2

In [24]:
import numpy as np

with plt.style.context('fivethirtyeight'):
    sns.regplot(data=temp, x='carat', y='price', order=2)

plt.show()

<img src='./plots/reg-plot-9.png'>

In [28]:
import numpy as np
with plt.style.context('fivethirtyeight'):
    fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(15,6));
    ax[0].set(title="linear reg");
    ax[1].set(title="Robust reg");
    
    sns.regplot(data=temp, x='carat', y='price', ax=ax[0]);

    # robust reg
    sns.regplot(data=temp, x='carat', y='price', robust=True, ax=ax[1]);



    plt.tight_layout();
    plt.show();

<img src='./plots/reg-plot-10.png'>

### Modeling

* order -> polynomial reg
* robust -> robust reg
* logistic
* lowess


In [30]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', lowess=True)

plt.show()

<img src='./plots/reg-plot-11.png'>

### Styling | marker

In [32]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', marker='d')

plt.show()

<img src='./plots/reg-plot-12.png'>

### styling | scatter_kws

In [36]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=sample, x='carat', y='price', marker='d', scatter_kws={'s':100})

plt.show()

<img src='./plots/reg-plot-13.png'>

In [38]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(
        data=sample, x='carat', y='price', marker='o', 
        scatter_kws={'s':100, 'color':'seagreen', 'alpha':0.5})

plt.show()

<img src='./plots/reg-plot-14.png'>

### styling | line_kws

In [40]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(
        data=sample, x='carat', y='price', marker='.', 
        scatter_kws={'s':100, 'alpha':0.8, 'color':'lightgray', 'edgecolor':'k'},
        line_kws={'color':'salmon', 'linewidth':2, 'linestyle':'-.'}
        )

plt.show()

In [53]:
sample.cut.value_counts()

Ideal        84
Very Good    52
Premium      47
Good         16
Fair          1
Name: cut, dtype: int64

In [45]:
classif = sample.query("cut in ['Ideal', 'Premium']")
classif.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
24011,2.08,Ideal,J,SI2,62.3,55.0,12179,8.23,8.17,5.11
27539,2.02,Ideal,G,SI2,62.0,55.0,18296,8.16,8.1,5.04
14580,1.13,Premium,H,VS2,60.6,58.0,5885,6.81,6.72,4.1
22494,2.0,Premium,I,SI2,62.3,57.0,10528,8.12,8.05,5.03
53677,0.71,Premium,E,VS2,59.0,59.0,2711,5.78,5.88,3.44


In [46]:
classif.cut = classif.cut.cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classif.cut = classif.cut.cat.codes


In [48]:
with plt.style.context('fivethirtyeight'):
    sns.regplot(data=classif, y='cut', x='price', logistic=True)

plt.show()

<img src='./plots/reg-plot-16.png'>

In [51]:
with plt.style.context('fivethirtyeight'):
    sns.ecdfplot(data=classif, x='price', hue='cut')

plt.show()

<img src='./plots/reg-plot-17.png'>

In [52]:
with plt.style.context('fivethirtyeight'):
    sns.histplot(data=classif, x='price', hue='cut', element='poly')

plt.show()

<img src='./plots/reg-plot-18.png'>