[Reference](https://levelup.gitconnected.com/introducing-pandas-e0581a6683c9)

# Installing Pandas

In [None]:
# jupyter cell
!pip install pandas

Terminal
pip install pandas

# Importing

In [None]:
import numpy as np
import pandas as pd

# Reading Data Files

## CSV

In [None]:
# Template: path of the CSV file and the character that separates its columns.
df = pd.read_csv('file_path.csv', sep='separator character')
# Example: a semicolon-delimited file.
df = pd.read_csv('sales_202005.csv', sep=';')

## Excel

In [None]:
# Template: path of the workbook and the name of the sheet to load.
df = pd.read_excel('file_path.xlsx', sheet_name='')
# Example: load the sheet called 'Jan'.
df = pd.read_excel('sales_202005.xlsx', sheet_name='Jan')

# Show Data

## Head

In [None]:
# Preview the first rows of the DataFrame (5 by default).
df.head()

## T (Transposition)

In [None]:
# Transposed view: rows become columns and columns become rows.
df.T

## Dimensions

In [None]:
# (number of rows, number of columns) as a tuple.
df.shape

## Information

In [None]:
# Print a summary of the DataFrame: index, columns, non-null counts and dtypes.
df.info()

## Data Type

In [21]:
# Data type of every column.
df.dtypes

In [22]:
# Cast the 'Date' column to datetime. pandas 2.0+ rejects the unit-less
# "datetime64" dtype, so the resolution ("ns") is spelled out explicitly.
df['Date'] = df['Date'].astype("datetime64[ns]")
# Confirm the new dtype of 'Date'.
df.dtypes

In [23]:
# Alternative conversion: let pandas parse the 'Date' column into datetimes.
df['Date'] = pd.to_datetime(df['Date'])

## Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Index

In [25]:
# Make the 'Date' column the index, modifying df in place.
df.set_index('Date', inplace=True)

In [26]:
# Restore the default integer index in place.
# NOTE: drop=True discards the current index instead of inserting it back as
# a column, so an index set from 'Date' is lost here.
df.reset_index(drop=True, inplace=True)

## Unique values

In [27]:
# First five distinct values of 'itemDescription', in order of appearance.
df['itemDescription'].unique()[:5]

In [28]:
# Number of distinct values in each column (NaN is not counted by default).
df.nunique()

## Sampling

In [29]:
# Keep the rows whose AmountSpent is below 300, then draw 400 of them at
# random with replacement (the same row may be picked more than once).
df = df.loc[df['AmountSpent'] < 300].sample(replace=True, n=400)
# Shape of the sampled frame.
df.shape

## Datetime

In [30]:
# Extract the calendar year and the month number from the 'Date' column.
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

In [38]:
# Store the month as its name (e.g. 'January') instead of its number.
df['Month'] = df['Date'].dt.month_name()

## Select rows and columns

In [31]:
# Label-based selection: the rows whose index labels are in `toselect`,
# restricted to three named columns.
# NOTE(review): `toselect` is not defined before this cell in file order; it
# is created in the iloc cell further down, which must be run first.
df.loc[toselect,['OwnHome','Location','Children']]

In [33]:
# Position-based selection: first four rows and first three columns.
df.iloc[:4,:3]

In [32]:
# Seven random integer positions drawn from [0, 100).
toselect = np.random.randint(100, size=7)
# Rows at those positions, columns at positions 2, 4 and 6.
# assumes df has at least 100 rows and 7 columns — TODO confirm
df.iloc[toselect, [2,4,6]]

## Replace values

In [37]:
# Mapping from month number to English month name.
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September',
    10: 'October', 11: 'November', 12: 'December',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df.Month is chained assignment: recent pandas versions warn about it, and
# with Copy-on-Write the change no longer reaches df.
df['Month'] = df['Month'].replace(month_names)

## Filter

In [39]:
# Count the rows whose description contains 'milk'.
# Note: in a notebook only the value of the cell's last expression is displayed.
df.itemDescription.str.contains('milk').sum()
# Count the rows whose description contains 'whole milk'.
df.itemDescription.str.contains('whole milk').sum()

In [40]:
# Distinct item descriptions that are longer than 20 characters.
(
    df.loc[df['itemDescription'].str.len() > 20, 'itemDescription']
    .unique()
)

# Working With Columns
## Add new columns

In [None]:
# Template: assigning to a new column name creates that column.
df['column_name'] = value
# Example: month name derived from the 'date' column.
df['month_nm'] = df['date'].dt.month_name()

In [36]:
# Year and month number taken from the 'Date' column.
year = df['Date'].dt.year
month = df['Date'].dt.month
# insert() places the new column at the given position (0-based) and modifies
# df in place; it raises if a column with that name already exists.
df.insert(1, 'Month', month)
df.insert(2, 'Year', year)

## Delete column

In [None]:
# Template: remove a single column in place.
del df['column_name']

In [34]:
# Remove the 'Year' and 'Month' columns from df in place.
df.drop(columns=['Year', 'Month'], inplace=True)

## Filtering Data Frame

In [None]:
# One condition: keep the rows where 'column_name' equals 'XPTO'.
df[df['column_name'] == 'XPTO']

# Multiple conditions (template): wrap each condition in parentheses and
# combine them with & (and) or | (or):
#   df[(condition_1) & (condition_2) & ...]

# Example: rows dated within May 2020 (both bounds inclusive).
df[(df['date'] >= '2020-05-01') & (df['date'] <= '2020-05-31')]

## Tilde operator (~)

In [24]:
# ~ negates the boolean mask: keep the rows whose Member_number is NOT in the
# list, then report the shape of the result.
df[~df.Member_number.isin([3737, 2433, 3915, 2625])].shape

# Crosstab function

In [56]:
# Mean Salary for every Age (rows) x Gender (columns) combination,
# rounded to one decimal place.
pd.crosstab(
    index=marketing['Age'],
    columns=marketing['Gender'],
    values=marketing['Salary'],
    aggfunc='mean',
).round(1)

In [57]:
# Mean Salary by (Age, Married) rows and Gender columns; margins=True adds
# the 'All' row and column with the overall aggregates.
pd.crosstab(
    index=[marketing['Age'], marketing['Married']],
    columns=marketing['Gender'],
    values=marketing['Salary'],
    aggfunc='mean',
    margins=True,
).round(1)

# Pivot or Group By
## Pivot

In [None]:
# Pivot df into a day x month grid holding the mean price.
pd.pivot_table(
    df,                  # source DataFrame
    index="day",         # values that become the rows
    columns="month_nm",  # values that become the columns
    values="price",      # column being aggregated
    aggfunc="mean",      # aggregation function
)

In [55]:
# Mean Salary by (Age, Married) rows and Gender columns, with the 'All'
# totals added by margins=True; rounded to one decimal place.
pd.pivot_table(
    data=marketing,
    index=['Age', 'Married'],
    columns='Gender',
    values='Salary',
    aggfunc='mean',
    margins=True,
).round(1)

## Group By

In [None]:
# Per (month_nm, day) group: mean price and number of non-null order ids.
# The aggregations are given by name so pandas uses its built-in groupby
# implementations instead of calling a Python function once per group.
df.groupby(['month_nm', 'day']).agg(
    {
        'price': 'mean',
        'order_id': 'count',
    }
).reset_index()  # turn the group keys back into ordinary columns

# Visualization

## Plot

In [41]:
# Kernel density estimate of the Salary column.
marketing.Salary.plot(kind='kde', title='Distribution of Salary', figsize=(10,6))

In [42]:
# Histogram of the Salary column.
marketing.Salary.plot(kind='hist', title='Distribution of Salary',
figsize=(10,6))

In [44]:
# Month name of every purchase date.
groceries['month_name'] = groceries['Date'].dt.month_name()
# Count rows per month and plot them as a line. groupby sorts its keys
# alphabetically (April, August, ...), so the counts are reindexed into
# calendar order; a month with no rows gets a count of 0.
(
    groceries[['month_name', 'Date']]
    .groupby('month_name')
    .count()
    .reindex(
        pd.date_range('2000-01-01', periods=12, freq='MS')
        .month_name()
        .rename('month_name'),
        fill_value=0,
    )
    .plot(title="Monthly Sales", figsize=(10,6))
)

## BoxPlot

In [None]:
# Box plot of the 'price' column; the returned plot object is kept in ax.
ax = df.boxplot(column=['price'])

## Bar

In [None]:
# Bar chart of price per month; rot=0 keeps the x tick labels horizontal.
ax = df.plot.bar(x='month', y='price', figsize=(16,5), rot=0)

## Line

In [None]:
# Line chart of price over date with a circle marker on every point.
ax = df.plot.line(x='date', y='price', figsize=(16,5), marker='o', legend=['price'])
# Axis labels and title are set on the returned Axes (each on its own line;
# the first call was previously fused onto the plot statement).
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Day Over Day x Total Sales Price')
ax

## Pie

In [None]:
# Pie chart of price. plot.pie labels the wedges from the index and ignores
# x, so 'month_nm' is moved into the index to get month names as labels.
ax = df.set_index('month_nm').plot.pie(y='price', figsize=(8,8))

## Histogram

In [None]:
# Histogram of the 'price' column.
ax = df['price'].hist(figsize=(10,5))

## Lmplot — Seaborn

In [None]:
# New DataFrame: mean price per (day, month_nm, month); reset_index turns
# the group keys back into ordinary columns.
dfLR = pd.DataFrame(
    df.groupby(['day', 'month_nm', 'month'])
    .agg({'price': pd.Series.mean})
    .reset_index()
)

# Chart: price against day with a fitted regression line.
# Assumes seaborn has been imported as `sns`; no such import is visible here.
ax = sns.lmplot(
    data=dfLR,       # DataFrame to plot
    x="day",         # x axis
    y="price",       # y axis
    hue="month_nm",  # colour of the points
    col="month",     # one chart per value
)

# Splitting strings

In [54]:
# Split 'Date' on '-' into separate columns and keep the second piece.
# NOTE(review): .str only works on string data — presumably 'Date' holds
# text here rather than datetimes; confirm.
groceries['month'] = (
    groceries['Date'].str.split('-', expand=True)[1]
)

# Splitting strings at the character level

In [53]:
# Take the third '-'-separated piece of 'Date' and keep its last two
# characters.
# NOTE(review): .str only works on string data — presumably 'Date' holds
# text here rather than datetimes; confirm.
groceries['year'] = (
    groceries['Date'].str.split('-', expand=True)[2].str[-2:]
)

# Sidetable

In [52]:
pip install sidetable
import sidetable
groceries.stb.freq(['itemDescription'], thresh=25)

# Correlation

In [45]:
# Pairwise correlation of the numeric columns. Since pandas 2.0 corr() no
# longer skips non-numeric columns by default and raises on them, so the
# restriction is requested explicitly.
df.corr(numeric_only=True)

# Handling missing values

In [51]:
# Number of missing values in each column.
groceries.isna().sum()

In [49]:
# Overwrite the cells at row positions 1, 10, 30 and column positions 1, 2
# with NaN so there is something to fill in.
groceries.iloc[[1,10,30], [1,2]] = np.nan
# Missing values per column after the change.
groceries.isna().sum()

In [50]:
# Fill missing item descriptions with the most frequent value (the mode).
# The result is assigned back to the column: fillna(..., inplace=True) on a
# column selection is chained assignment, which recent pandas versions warn
# about and which no longer updates the parent frame under Copy-on-Write.
groceries['itemDescription'] = groceries['itemDescription'].fillna(
    groceries['itemDescription'].mode()[0]
)

# Forward-fill missing dates with the previous row's value. ffill() replaces
# fillna(method='ffill'), which is deprecated.
groceries['Date'] = groceries['Date'].ffill()

# Missing values per column after filling.
groceries.isna().sum()

# Selecting data types

In [48]:
# Names of the columns whose dtype is object.
marketing.select_dtypes(include='object').columns

# Names of all the other (non-object) columns.
marketing.select_dtypes(exclude='object').columns

# Creating dataframes

In [47]:
# Distinct item descriptions found in groceries.
unique_items = groceries.itemDescription.unique()

# One row per distinct item, each with a random integer price in [0, 10).
prices = pd.DataFrame({
    'itemDescription': unique_items,
    'prices':np.random.randint(10, size=len(unique_items))
})

## Merging dataframes

In [46]:
# Join groceries with prices on 'itemDescription' (merge defaults to an inner
# join), adding the 'prices' column to every matching row.
merged_df = groceries.merge(prices, on='itemDescription')