### import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

### 1. datasets from pydataset

In [2]:
from pydataset import data

In [3]:
# Calling data() with no arguments returns the catalog of available datasets
# as a DataFrame with `dataset_id` and `title` columns (one row per dataset).
dataset_name = data()

In [4]:
dataset_name

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


In [5]:
dataset_name.iloc[4]

dataset_id                                 HairEyeColor
title         Hair and Eye Color of Statistics Students
Name: 4, dtype: object

In [6]:
# Load one dataset by its `dataset_id` from the catalog; the result is a DataFrame
# with the columns Hair, Eye, Sex and Freq.
hair_eye_color = data("HairEyeColor")

In [7]:
hair_eye_color

Unnamed: 0,Hair,Eye,Sex,Freq
1,Black,Brown,Male,32
2,Brown,Brown,Male,53
3,Red,Brown,Male,10
4,Blond,Brown,Male,3
5,Black,Blue,Male,11
6,Brown,Blue,Male,50
7,Red,Blue,Male,10
8,Blond,Blue,Male,30
9,Black,Hazel,Male,10
10,Brown,Hazel,Male,25


In [8]:
hair_eye_color.shape

(32, 4)

In [9]:
hair_eye_color.head()

Unnamed: 0,Hair,Eye,Sex,Freq
1,Black,Brown,Male,32
2,Brown,Brown,Male,53
3,Red,Brown,Male,10
4,Blond,Brown,Male,3
5,Black,Blue,Male,11


#### one-hot encoding

In [10]:
# Encode Sex as a single indicator column: drop_first removes the first category,
# leaving only "Male".
# dtype=int pins the 0/1 integer encoding; pandas >= 2.0 would otherwise return booleans.
sex_dummy = pd.get_dummies(hair_eye_color["Sex"], drop_first=True, dtype=int)

In [11]:
sex_dummy

Unnamed: 0,Male
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1
10,1


In [12]:
hair_eye_color.Eye.unique()

array(['Brown', 'Blue', 'Hazel', 'Green'], dtype=object)

In [13]:
hair_eye_color.Hair.unique()

array(['Black', 'Brown', 'Red', 'Blond'], dtype=object)

In [14]:
# Remove the raw Sex column; its encoded counterpart is joined back in a later cell.
# Rebinding instead of inplace=True, together with errors="ignore", keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the missing column.
hair_eye_color = hair_eye_color.drop(columns=["Sex"], errors="ignore")

In [15]:
hair_eye_color.head()

Unnamed: 0,Hair,Eye,Freq
1,Black,Brown,32
2,Brown,Brown,53
3,Red,Brown,10
4,Blond,Brown,3
5,Black,Blue,11


In [16]:
data = hair_eye_color.join(sex_dummy)

In [17]:
data.head()

Unnamed: 0,Hair,Eye,Freq,Male
1,Black,Brown,32,1
2,Brown,Brown,53,1
3,Red,Brown,10,1
4,Blond,Brown,3,1
5,Black,Blue,11,1


In [18]:
column_replace = data.pop("Male")

In [19]:
data.insert(2, "Sex", column_replace)

In [20]:
data.head()

Unnamed: 0,Hair,Eye,Sex,Freq
1,Black,Brown,1,32
2,Brown,Brown,1,53
3,Red,Brown,1,10
4,Blond,Brown,1,3
5,Black,Blue,1,11


In [21]:
data.isnull().mean() * 100

Hair    0.0
Eye     0.0
Sex     0.0
Freq    0.0
dtype: float64

### 2. datasets from seaborn library

In [22]:
# Names of the example datasets seaborn exposes; any of them can be passed to
# sns.load_dataset (e.g. "iris").
datasets = sns.get_dataset_names()

In [23]:
datasets

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [24]:
data = sns.load_dataset("iris")

In [26]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### 3. datasets from scikit-learn 

In [27]:
from sklearn.datasets import fetch_california_housing

In [28]:
# as_frame=True returns the features (housing["data"]) and the target
# (housing["target"]) as pandas objects, so they can be joined directly below.
housing = fetch_california_housing(as_frame = True)

In [29]:
housing

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude 

In [30]:
data = housing["data"].join(housing["target"])

In [31]:
data.shape

(20640, 9)

In [32]:
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [33]:
data.isnull().sum() * 100 / len(data)

MedInc         0.0
HouseAge       0.0
AveRooms       0.0
AveBedrms      0.0
Population     0.0
AveOccup       0.0
Latitude       0.0
Longitude      0.0
MedHouseVal    0.0
dtype: float64