In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# EDA for the first time

### URLs
- [Pandas documentation](https://pandas.pydata.org/docs/)
- [Pandas tutorials](https://pandas.pydata.org/pandas-docs/version/0.15/tutorials.html)

### Training on ...
- how to read data
- how to analyze every attribute
- how to analyze relationships between attributes
- how to visualize data (which kinds of visualizations and their characteristics, ...)

### EDA questions to answer by analyzing
- Which questions?
- What is the goal of our work?

# Country dataset - Reading from a file using Pandas

In [1]:
import matplotlib
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
# Relative path of the countries CSV file.
filename = "data/countries.csv"
# First attempt: parse the file with pandas' default settings.
df = pd.read_csv(filename)
# Preview the first rows to check how the file was parsed.
df.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,0,2306,16307,700.0,360,32,1213,22,8765,1,466,2034,38.0,24.0,38.0
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,232.0,188.0,579.0
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,4,-39,31,6000.0,700,781,322,25,9653,1,1714,461,101.0,6.0,298.0
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,0,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,


In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Country                                object
Region                                 object
Population                              int64
Area (sq. mi.)                          int64
Pop. Density (per sq. mi.)             object
Coastline (coast/area ratio)           object
Net migration                          object
Infant mortality (per 1000 births)     object
GDP ($ per capita)                    float64
Literacy (%)                           object
Phones (per 1000)                      object
Arable (%)                             object
Crops (%)                              object
Other (%)                              object
Climate                                object
Birthrate                              object
Deathrate                              object
Agriculture                            object
Industry                               object
Service                                object
dtype: object

**Reading the file again with the proper `decimal` separator for the numbers in this file**

In [None]:
# Re-read the same file, telling pandas that ',' is the decimal mark.
df = pd.read_csv(filename, decimal=',')
# Preview the first rows of the re-parsed frame.
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# List the distinct Region values exactly as they were read from the file.
df.Region.unique()

**String cleaning - remove blank spaces**

In [None]:
# Trim leading and trailing whitespace from the two text columns.
for text_column in ('Region', 'Country'):
    df[text_column] = df[text_column].str.strip()

# Show the distinct Region values after trimming.
df['Region'].unique()

## Missing values?
**In how many rows?**

In [None]:
# Rows with at least one missing value = all rows minus fully populated rows.
len(df) - len(df.dropna())

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Boolean mask: True for rows that contain at least one missing value.
has_missing = df.isna().any(axis=1)
# Display only those rows.
df[has_missing]

## Visualization

In [None]:
# Print the frame summary again as a reminder of the available columns before plotting.
df.info()

In [None]:
# Default figure size; rcParams is global, so it also applies to later plots.
plt.rcParams["figure.figsize"] = (10, 5)

# Distribution of GDP per capita within each region.
sns.boxplot(x='Region',
            y='GDP ($ per capita)',
            data=df)

# Rotate the region names so the tick labels do not overlap.
# Uses pyplot rather than the discouraged legacy `matplotlib.pylab` namespace;
# the trailing semicolon suppresses the (locations, labels) repr in the output.
plt.xticks(rotation=90);

In [None]:
# Bar chart of how many rows fall into each Region value.
df['Region'].value_counts().plot.bar()

In [None]:
# Columns compared pairwise in the scatter-plot matrix.
pairplot_columns = ['Pop. Density (per sq. mi.)',
                    'GDP ($ per capita)',
                    'Birthrate',
                    'Net migration',
                    'Literacy (%)',
                    'Phones (per 1000)',
                    'Deathrate']

# Select the columns first and only then drop incomplete rows, so a row is
# discarded only when one of the plotted attributes is missing, not when an
# unrelated column (e.g. Agriculture) happens to be empty.
sns.pairplot(df[pairplot_columns].dropna())

In [None]:
# Pearson correlation coefficient between GDP per capita and birth rate.
gdp_per_capita = df['GDP ($ per capita)']
gdp_per_capita.corr(df['Birthrate'], method='pearson')

In [None]:
# Birth rate plotted against GDP per capita, one point per row of df.
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Birthrate')

In [None]:
# Pearson correlation coefficient between phones per 1000 and birth rate.
phones_per_1000 = df['Phones (per 1000)']
phones_per_1000.corr(df['Birthrate'], method='pearson')

In [None]:
# Birth rate plotted against phones per 1000, one point per row of df.
sns.scatterplot(data=df, x='Phones (per 1000)', y='Birthrate')

### In IAU, we will deal with supervised learning (učenia s učiteľom)
- Regression  𝑌∈𝑅
- Classification  𝑌∈{𝐶1, 𝐶2,…, 𝐶𝑁}

# Obese dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [None]:
# Load the obesity data; from here on `df` refers to this frame, not the countries data.
df = pd.read_csv("data/obese.csv")
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

I don't like one column's name: it is too long, so I'm going to shorten it.

In [None]:
# Shorten the long column name. Reassigning the result (rather than using
# inplace=True) keeps the frame's lineage explicit, following the usual pandas idiom.
df = df.rename(columns={'Share of adults who are obese (%)': 'obesity_rate'})

# List the entities (regions / countries) present in the data.
df.Entity.unique()

**Visualization**

In [None]:
# Obesity rate over time for Africa, with a fitted regression line.
af = df[df.Entity == 'Africa']
sns.regplot(x=af.Year, y=af.obesity_rate, label='Africa')

# The same for the United States, drawn on the same axes.
usa = df[df.Entity == 'United States']
sns.regplot(x=usa.Year, y=usa.obesity_rate, label='United States')

# Labels are attached to each regplot call above. Passing a bare list of labels
# to legend() pairs them with artists by drawing order, which would put both
# labels on the first regplot's scatter and fit line instead of one per entity.
plt.legend()

**Simple Y=f(X)**

In [None]:
# Africa: least-squares line of obesity rate against year, plotted over the raw points.
af_years, af_rates = af.Year, af.obesity_rate
slope, intercept, r_value, p_value, std_err = stats.linregress(af_years, af_rates)
line = intercept + slope * af_years
plt.plot(af_years, af_rates, 'o', af_years, line)

# United States: same procedure. The fit variables are reused, so after this
# cell `slope` and `intercept` describe the United States line.
usa_years, usa_rates = usa.Year, usa.obesity_rate
slope, intercept, r_value, p_value, std_err = stats.linregress(usa_years, usa_rates)
line = intercept + slope * usa_years
plt.plot(usa_years, usa_rates, 'o', usa_years, line)

In [None]:
# Extrapolate the fitted line to the year 2300.
# NOTE(review): `slope` and `intercept` come from the most recent linregress
# call in an earlier cell (the United States fit when cells run in order).
x = 2300
y = intercept + slope * x
y

# Diamond dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

# Load the diamonds data; from here on `df` refers to this frame.
df = pd.read_csv('data/diamonds.csv')
# Descriptive statistics of the frame.
df.describe()

In [None]:
# Frequency of each value in the `color` column.
df['color'].value_counts()

In [None]:
# Bar chart of the `color` value frequencies.
df['color'].value_counts().plot.bar()

In [None]:
# Pie chart of the `color` value frequencies.
df['color'].value_counts().plot.pie()

**Your code:**

# Monitoring dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

# Relative path of the monitoring data file; `filename` is reused by the later re-read.
filename = 'data/monitoring.csv'
# First attempt: parse with pandas' default settings (comma separator, first line as header).
df = pd.read_csv(filename)
# Descriptive statistics of whatever was parsed.
df.describe()

In [None]:
%%bash
# Print the first 10 raw lines of the file to inspect its actual layout.
head -n 10 data/monitoring.csv

In [None]:
# Parsing options for the monitoring file: tab-separated, no header row,
# -999 / -9999 treated as missing values, first column used as the index.
read_options = dict(
    sep='\t',
    header=None,
    na_values=[-999, -9999],
    index_col=0,
)
data = pd.read_csv(filename, **read_options)

# Preview the first rows of the re-parsed frame.
data.head()

**Your code:**