# Pandas DataFrames

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Build the example DataFrame: one row per country, one column per indicator.
# No index is passed, so rows get the default integer index (0..6).
df = pd.DataFrame(
    data={
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    # Explicit column order for the resulting frame.
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)


In [None]:
df

In [None]:
# Replace the row index with one country label per row.
df.index = pd.Index([
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
])

In [None]:
df

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.info()

In [None]:
df.size

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.dtypes.value_counts()

# Indexing, Selection and Slicing

In [None]:
df

In [None]:
df.loc['Canada']

In [None]:
df.iloc[-1]

In [None]:
df['Population']

Note that the index of the returned Series is the same as the DataFrame's index, and its name is the name of the column. If you're working in a notebook and want to see a more DataFrame-like format, you can use the `to_frame` method:

In [None]:

# Show the selected column as a one-column DataFrame instead of a Series.
df['Population'].to_frame()

Multiple columns can also be selected, similarly to NumPy arrays and Series:

In [None]:
df[['Population','GDP']]

In this case, the result is another DataFrame. Slicing works differently: it acts at "row level", which can be counterintuitive:

In [None]:
df[1:3]

Row-level selection works better with `loc` and `iloc`, which are recommended over regular "direct slicing" (`df[:]`). `loc` selects rows matching the given index labels:

In [None]:
df.loc['Italy']

In [None]:
df.loc['France':'Italy']

As a second "argument", you can pass the column(s) you'd like to select:







In [None]:
# Label-based row slice (both endpoints included) restricted to one column.
df.loc['France':'Italy', 'Population']

In [None]:
# Same label-based row slice, selecting a list of columns.
df.loc['France':'Italy', ['Population', 'GDP']]

In [None]:
df.iloc[0]

In [None]:
df.iloc[[0,1,-1]]

In [None]:
df.iloc[1:3]

In [None]:
# Same positional row slice as the previous cell, with the column selector
# written out explicitly (':' keeps every column). The cells that follow
# replace ':' with specific column positions.
df.iloc[1:3, :]

In [None]:
df.iloc[1:3,3]

In [None]:
df.iloc[1:3,[0,3]]

In [None]:
df.iloc[1:3,1:3]

**Recommended:** always use `loc` and `iloc` to reduce ambiguity, especially with DataFrames that have numeric indexes.

# Conditional selection (boolean arrays)
We saw conditional selection applied to Series and it'll work in the same way for DataFrames. After all, a DataFrame is a collection of Series

In [None]:
df

In [None]:
df['Population'] > 70


In [None]:
df.loc[df['Population'] > 70]

The boolean matching is done at Index level, so you can filter by any row, as long as it contains the right indexes. Column selection still works as expected

In [None]:
df.loc[df['Population'] > 70, 'Population']

In [None]:
df.loc[df['Population'] > 70, ['Population', 'GDP']]


# Dropping stuff
As opposed to the concept of selection, we have "dropping". Instead of pointing out which values you'd like to select, you can point out which ones you'd like to drop:

In [None]:
df.drop('Canada')

In [None]:
df.drop(['Canada', 'Japan'])


In [None]:
df.drop(['Italy', 'Canada'], axis=0)

In [None]:
df.drop(['Population', 'HDI'], axis=1)

In [None]:
df.drop(['Population', 'HDI'], axis='columns')

In [None]:
df.drop(['Canada', 'Germany'], axis='rows')

All these `drop` calls return a new DataFrame. If you'd like to modify it "in place", you can use the `inplace` parameter (there's an example below).

# Operations

In [None]:
df[['Population', 'GDP']]

In [None]:
df[['Population', 'GDP']] / 100

Operations with Series work at a column level, broadcasting down the rows (which can be counter intuitive).

In [None]:
# Per-column adjustments, keyed by the column label each value applies to.
crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})
crisis

In [None]:
df[['GDP', 'HDI']]

In [None]:
df[['GDP', 'HDI']] + crisis

# Modifying DataFrames

You can add columns, or replace values for columns without issues

## Adding a new column

In [None]:
# Languages for a subset of countries; the Series is labelled by country
# name and carries the name 'Language'.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [None]:
langs

In [None]:
df['Language'] = langs

In [None]:
df

## Replacing values per column

In [None]:
df['Language'] = 'English'

In [None]:
df

## Renaming Columns

In [None]:
# rename() returns a new DataFrame; the result is only displayed here and
# df is not reassigned.
# Mapping keys that match no existing label ('Anual Popcorn Consumption',
# 'Argentina') are ignored rather than raising, because rename() defaults to
# errors='ignore'.
df.rename(
    index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR',
    },
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC',
    },
)

In [None]:
df.rename(index=str.upper)

In [None]:
df.rename(index=lambda x: x.lower())