In [1]:
import pandas as pd
import numpy as np

- indexing: come pescare i dati da un dataframe
- grouping: lavorare su sottogruppi di un dataframe -> GROUP BY
- joining: unire diversi dataframe -> JOIN 

### Indexing (per Series e per Dataframe)

In [3]:
# Seed the legacy global NumPy RNG so the two random columns are reproducible.
np.random.seed(123)
# Toy frame: two random numeric columns rounded to 2 decimals, one string
# column and one boolean column, on the default integer index.
df = pd.DataFrame({
    "u": np.random.rand(5).round(2),
    "v": np.random.randn(5).round(2),
    "w": ["spam", "bacon", "spam", "eggs", "sausage"],
    "x": [True, False, True, False, True],
})

In [4]:
df

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True
3,0.55,1.98,eggs,False
4,0.72,-1.62,sausage,True


In [None]:
# loc -> label-based indexing
# iloc -> position-based indexing (use it sparingly)
# logical indexing -> filter with a vector of booleans

In [6]:
type(df.u)

pandas.core.series.Series

In [8]:
df.u.loc[3]

0.55

In [13]:
df.u.iloc[3]

0.55

In [15]:
# Re-seed so the numeric columns repeat the values of the first toy frame.
np.random.seed(123)
# Same layout as `df`, but with unique strings in "w", which then becomes the
# row index instead of a regular column.
df2 = pd.DataFrame({
    "u": np.random.rand(5).round(2),
    "v": np.random.randn(5).round(2),
    "w": ["spam", "bacon", "beans", "eggs", "sausage"],
    "x": [True, False, True, False, True],
}).set_index("w")

In [16]:
df2

Unnamed: 0_level_0,u,v,x
w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
spam,0.7,0.32,True
bacon,0.29,-0.05,False
beans,0.23,-0.2,True
eggs,0.55,1.98,False
sausage,0.72,-1.62,True


In [12]:
df2.u.loc[3]

KeyError: 3

In [17]:
df2.u.iloc[3]

0.55

In [None]:
# Logical indexing

In [18]:
df.u.loc[df.u < 0.5]

1    0.29
2    0.23
Name: u, dtype: float64

In [19]:
df2.u.loc[df2.u < 0.5]

w
bacon    0.29
beans    0.23
Name: u, dtype: float64

In [22]:
df.u[df.u < 0.5]

1    0.29
2    0.23
Name: u, dtype: float64

In [26]:
# Indexing the DataFrame
# .loc slices by label, so the end label (3) is included in the result.
df.loc[0:3]

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True
3,0.55,1.98,eggs,False


In [27]:
df.iloc[0:3]

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True


In [28]:
df.loc[0:3, ["w", "x"]]

Unnamed: 0,w,x
0,spam,True
1,bacon,False
2,spam,True
3,eggs,False


In [31]:
df.iloc[0:4, -2:] # AVOID: this relies on the position of the columns

Unnamed: 0,w,x
0,spam,True
1,bacon,False
2,spam,True
3,eggs,False


In [32]:
df.iloc[0:4, ["w", "x"]]

IndexError: .iloc requires numeric indexers, got ['w' 'x']

In [None]:
df.iloc[0:4, :].loc[:, ["w", "x"]]

### Grouping

In [43]:
df.groupby("w")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1271d7b50>

In [44]:
g1 = df.groupby("w")

In [39]:
df.groupby("w").count().head()

Unnamed: 0_level_0,u,v,x
w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bacon,1,1,1
eggs,1,1,1
sausage,1,1,1
spam,2,2,2


In [48]:
g1.groups.keys()

dict_keys(['bacon', 'eggs', 'sausage', 'spam'])

In [49]:
g1.get_group("bacon")

Unnamed: 0,u,v,w,x
1,0.29,-0.05,bacon,False


In [50]:
df.groupby("w").count().index

Index(['bacon', 'eggs', 'sausage', 'spam'], dtype='object', name='w')

In [51]:
pd.Series(g1.groups.keys())

0      bacon
1       eggs
2    sausage
3       spam
dtype: object

In [54]:
headers = ['name', 'title', 'department', 'salary']
chicago = pd.read_csv('./city-of-chicago-salaries.csv',
                      header=0,
                      names=headers,
                      converters={'salary': lambda x: float(x.replace('$', ''))})
chicago

Unnamed: 0,name,title,department,salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,85512.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,75372.0
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,80916.0
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,99648.0
4,"ABBATACOLA, ROBERT J",ELECTRICAL MECHANIC,AVIATION,89440.0
...,...,...,...,...
32049,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,96553.6
32050,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,78012.0
32051,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,78012.0
32052,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,80724.0


Cerchiamo il dipendente più pagato di ogni dipartimento.
**TODO (da finire):** la query SQL di riferimento è la seguente:
 SELECT *
    FROM chicago AS c
    INNER JOIN (
        SELECT department, max(salary) AS max_salary
        FROM chicago
        GROUP BY department
    ) AS m
    ON c.department = m.department
    AND c.salary = m.max_salary;
 
 
Aggiungiamo al DataFrame originale una colonna con lo stipendio massimo di ogni dipartimento:
 SELECT *
FROM chicago AS c
INNER JOIN (
    SELECT department, max(salary) AS max_salary
    FROM chicago
    GROUP BY department
) AS m
ON c.department = m.department;

In [67]:
# One row per department (used as the index) holding the highest salary in it.
by_dept = (
    chicago
    .groupby("department")
    .agg(max_salary_by_dept=pd.NamedAgg(column='salary', aggfunc='max'))
)

In [68]:
by_dept.shape

(35, 1)

In [69]:
chicago.department.unique().shape

(35,)

In [70]:
by_dept

Unnamed: 0_level_0,max_salary_by_dept
department,Unnamed: 1_level_1
ADMIN HEARNG,156420.0
ANIMAL CONTRL,134124.0
AVIATION,186576.0
BOARD OF ELECTION,124320.0
BOARD OF ETHICS,125532.0
BUDGET & MGMT,169992.0
BUILDINGS,157092.0
BUSINESS AFFAIRS,157092.0
CITY CLERK,133545.0
CITY COUNCIL,160248.0


Like SQL's **JOIN** clause, `pandas.merge` allows two DataFrames to be joined on one or more keys. The function provides a series of parameters `(on, left_on, right_on, left_index, right_index)` allowing you to specify the columns or indexes on which to join.

By default, `pandas.merge` operates as an *inner join*, which can be changed using the `how` parameter.

From the function's docstring:

> how : {'left', 'right', 'outer', 'inner'}, default 'inner'

>    * left: use only keys from left frame (SQL: left outer join)

>    * right: use only keys from right frame (SQL: right outer join)

>    * outer: use union of keys from both frames (SQL: full outer join)

>    * inner: use intersection of keys from both frames (SQL: inner join)


In [None]:
# Call template only: the names are placeholders, so the line is kept as a
# comment and the cell stays runnable under "Restart & Run All".
# Pass `on` and `how` by keyword: positionally, the third argument of
# pd.merge is `how`, not `on`.
# pd.merge(left_frame, right_frame, on="key_column", how="inner")

In [73]:
# Attach each department's maximum salary to every employee row.
# "department" is a column of `chicago` and the index name of `by_dept`.
new_df = chicago.merge(by_dept, how="inner", on="department")

In [74]:
new_df

Unnamed: 0,name,title,department,salary,max_salary_by_dept
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,85512.0,169512.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,75372.0,260004.0
2,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,80916.0,157092.0
3,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,99648.0,169512.0
4,"ABBATACOLA, ROBERT J",ELECTRICAL MECHANIC,AVIATION,89440.0,186576.0
...,...,...,...,...,...
32049,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,96553.6,157092.0
32050,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,78012.0,260004.0
32051,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,78012.0,260004.0
32052,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,80724.0,260004.0


In [2]:
# NOTE: this rebinds `df`; the toy frame built in the indexing section is no
# longer reachable under that name from here on.
df = pd.read_csv('train.csv')

In [16]:
def f(x):
    """Return the elements of ``x`` that are strictly below 30.

    ``x`` is filtered with a boolean mask, so the surviving elements keep
    their original index labels.
    """
    below_thirty = x < 30
    return x[below_thirty]

In [31]:
def get_outliers(x, whisker=1.5):
    """Count the values of ``x`` that fall outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 0.25 and 0.75 quantiles of ``x`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    int
        Number of elements strictly below the lower fence or strictly above
        the upper fence.
    """
    # Compute each quartile once instead of once per fence.
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - iqr * whisker
    upper = q3 + iqr * whisker
    n_out = x[(x < lower) | (x > upper)].shape[0]
    return n_out

In [33]:
df[['Age']].apply(get_outliers)

Age    11
dtype: int64

In [22]:
df['Age'].mean()

29.69911764705882

In [32]:
get_outliers(df['Age'])

11

In [20]:
type(df[['Age']])

pandas.core.frame.DataFrame

In [21]:
type(df['Age'])

pandas.core.series.Series

In [15]:
df['Age'].quantile(0.3)

22.0

In [38]:
# Keep only the numeric columns. Testing for "not object" would also let
# through non-numeric dtypes (e.g. categorical or string dtype), which the
# quantile-based outlier count cannot handle.
lista_var = df.select_dtypes(include='number').columns.to_list()

In [39]:
lista_var

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [41]:
def find_outliers(x, whisker=1.5):
    """Count the values of ``x`` that fall outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 0.25 and 0.75 quantiles of ``x`` and ``IQR = Q3 - Q1``.
    The computation is the same as in ``get_outliers`` defined earlier in
    this notebook.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    int
        Number of elements strictly below the lower fence or strictly above
        the upper fence.
    """
    # Compute each quartile once instead of once per fence.
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - iqr * whisker
    upper = q3 + iqr * whisker
    n_out = x[(x < lower) | (x > upper)].shape[0]
    return n_out

df[lista_var].apply(find_outliers)

# Expected outcome:
# a Series indexed by the variable names, whose values are the results of
# find_outliers, i.e. the number of outliers in each column.

PassengerId      0
Survived         0
Pclass           0
Age             11
SibSp           46
Parch          213
Fare           116
dtype: int64