In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# EDA for the first time

### Literatúry
- [Pandas documentation](https://pandas.pydata.org/docs/)
- [Pandas tutorials](https://pandas.pydata.org/docs/getting_started/tutorials.html)
- https://github.com/FIIT-IAU
- https://github.com/FIIT-IAU/IAU-2019-2020

### Dnes sa budeme venovať tomu, ako...
- načítať dáta
- analyzovať jednotlivé atribúty
- analyzovať vzťahy medzi atribútmi
- vizualizovať dáta (vhodné typy vizualizácií, vlastnosti dobrých vizualizácií, ako neklamať vizualizáciou)

### Predtým, ako začneme analyzovať dáta, by sme si mali ujasniť...
- Aké otázky máme analýzou zodpovedať
- Akú úlohu máme riešiť

# Country dataset - Reading from a file using Pandas

In [None]:
import matplotlib
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [None]:
# First attempt: load the countries CSV with pandas' default parsing options
# and preview the first rows.
filename = "data/countries.csv"
df = pd.read_csv(filename)
df.head()

**Reading the file again with the proper `decimal` separator for the numbers in this file**

In [None]:
# decimal=',' tells pandas to treat a comma as the decimal separator while
# parsing numbers; df is replaced by the re-parsed frame.
df = pd.read_csv(filename, decimal=',')
df.head()

In [None]:
# Column overview: dtype and non-null count per column.
df.info()

In [None]:
# List the distinct values stored in the Region column.
df['Region'].unique()

**String cleaning - remove leading and trailing whitespace**

In [None]:
# Trim leading/trailing whitespace from both text columns, then list the
# distinct regions again to verify the clean-up.
df['Country'] = df['Country'].str.strip()
df['Region'] = df['Region'].str.strip()
df['Region'].unique()

## Missing values?
**In how many rows?**

In [None]:
# Number of rows that contain at least one missing value:
# all rows minus the rows that survive dropna().
len(df) - len(df.dropna())

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis=1)]

## Visualization

In [None]:
# Recap of the available columns and their dtypes before plotting.
df.info()

In [None]:
# Make every following figure 10 x 5 inches.
plt.rcParams["figure.figsize"] = (10, 5)

# Distribution of GDP per capita within each region.
# x and y are passed by keyword: since seaborn 0.12 `data` is the only
# parameter that may be given positionally, so the positional form fails.
sns.boxplot(x='Region',
            y='GDP ($ per capita)',
            data=df)

# Region names are long; rotate the tick labels so they stay readable.
plt.xticks(rotation=90);

In [None]:
# Bar chart of how many countries fall into each region.
df['Region'].value_counts().plot.bar()

In [None]:
# Pairwise relationships between a handful of numeric indicators.
# The columns are selected first and incomplete rows dropped afterwards, so a
# missing value in a column that is not plotted no longer discards a row that
# is complete for the plotted indicators.
sns.pairplot(df[['Pop. Density (per sq. mi.)',
                 'GDP ($ per capita)',
                 'Birthrate',
                 'Net migration',
                 'Literacy (%)',
                 'Phones (per 1000)',
                 'Deathrate']].dropna())

In [None]:
# Pearson correlation coefficient between GDP per capita and birthrate.
df['GDP ($ per capita)'].corr(df['Birthrate'], method='pearson')

In [None]:
# Birthrate against GDP per capita, one point per country.
# x and y are passed by keyword: since seaborn 0.12 `data` is the only
# parameter that may be given positionally.
sns.scatterplot(x='GDP ($ per capita)',
                y='Birthrate',
                data=df)

In [None]:
# Pearson correlation coefficient between phones per 1000 people and birthrate.
df['Phones (per 1000)'].corr(df['Birthrate'], method='pearson')

In [None]:
# Birthrate against phones per 1000 people, one point per country.
# x and y are passed by keyword: since seaborn 0.12 `data` is the only
# parameter that may be given positionally.
sns.scatterplot(x='Phones (per 1000)',
                y='Birthrate',
                data=df)

### Na tomto predmete sa budeme zaoberať len supervised ML (učenia s učiteľom)
- Regresie $Y \in \mathbb{R}$
- Klasifikácie $Y \in \{C_1, C_2, \dots, C_N\}$

**Snažíme sa nájsť funkciu $f$ atribútov $X$, ktorá bude predikovať hodnotu závislej premennej $Y$**

# Obese dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [None]:
# Load the obesity dataset (this replaces the previous contents of df) and
# list its columns, dtypes and non-null counts.
df = pd.read_csv("data/obese.csv")
df.info()

I don't like one column's name: it is too long, so I'm going to shorten it.

In [None]:
# Give the long obesity column a short, attribute-friendly name, then list
# the entities present in the data.
df = df.rename(columns={'Share of adults who are obese (%)': 'obesity_rate'})
df['Entity'].unique()

**Visualization**

In [None]:
# Obesity rate over time, with a fitted regression line, for two entities.
af = df[df.Entity == 'Africa']
usa = df[df.Entity == 'United States']

# x and y are passed by keyword (required since seaborn 0.12).
# Each series carries its own label, so the legend is built from explicitly
# labelled artists. Passing only labels=[...] to plt.legend matches labels to
# artists purely by creation order, and each regplot adds several artists, so
# the two labels did not map one-to-one onto the two entities.
sns.regplot(x=af.Year, y=af.obesity_rate, label='Afrika')
sns.regplot(x=usa.Year, y=usa.obesity_rate, label='United States')

plt.legend();

**Simple Y=f(X)**

In [None]:
def _fit_and_plot(years, rates):
    """Fit a least-squares line to ``rates`` over ``years`` and draw it.

    The observations are drawn as markers and the fitted line is drawn over
    them on the current axes.

    Args:
        years: x values of the observations.
        rates: y values of the observations.

    Returns:
        tuple: ``(fit, fitted)`` where ``fit`` is the result returned by
        ``stats.linregress`` and ``fitted`` is the line evaluated at ``years``.
    """
    fit = stats.linregress(years, rates)
    fitted = fit.slope * years + fit.intercept
    plt.plot(years, rates, 'o', years, fitted)
    return fit, fitted


# Africa
(slope, intercept, r_value, p_value, std_err), line = _fit_and_plot(af.Year, af.obesity_rate)

# United States. This fit runs last, so slope, intercept, r_value, p_value,
# std_err and line end up holding the United States values.
(slope, intercept, r_value, p_value, std_err), line = _fit_and_plot(usa.Year, usa.obesity_rate)

In [None]:
# Evaluate the fitted line at the year 2300. slope and intercept hold the
# values of the most recent linregress call in the previous cell, i.e. the
# United States fit.
# NOTE(review): 2300 is presumably far outside the observed years, so this
# looks like an illustration of the risk of extrapolation - confirm intent.
x = 2300
y = slope * x + intercept
y

# Diamond dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

# Load the diamonds dataset (this replaces the previous contents of df) and
# show summary statistics of its numeric columns.
df = pd.read_csv('data/diamonds.csv')
df.describe()

In [None]:
# Frequency of each value in the color column.
df['color'].value_counts()

In [None]:
# The same color frequencies drawn as a bar chart.
df['color'].value_counts().plot.bar()

In [None]:
# The same color frequencies drawn as a pie chart, for comparison with the bars.
df['color'].value_counts().plot.pie()

**Your code:**

# Monitoring dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

# First attempt: read the monitoring file with pandas' default options and
# summarise whatever was parsed.
# NOTE(review): a later cell re-reads this file with sep='\t' and header=None,
# so this default parse is presumably shown only to illustrate the problem.
filename = 'data/monitoring.csv'
df = pd.read_csv(filename)
df.describe()

In [None]:
%%bash
# Print the first 10 lines of the raw file to inspect its actual layout.
head -n 10 data/monitoring.csv

In [None]:
# Re-read the file as tab-separated text without a header row. The values
# -999 and -9999 are treated as missing and the first column becomes the index.
data = pd.read_table(
    filename,
    header=None,
    index_col=0,
    na_values=[-999, -9999],
)
data.head()

**Your code:**