# Pandas

Data manipulation and analysis. 

In [None]:
import pandas as pd

# Raw data as a plain dict of key-value pairs: every key is a field name
# and every value is the list of entries for that field.
dictionary = {
    "Restaurant Name": ["HABITAT COFFEE SHOP", "REILLY'S"],
    "Location": ["Milan", "Los Angeles"],
}

print(dictionary)

### DataFrame
A pandas DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).




In [None]:
ranrest = pd.DataFrame(dictionary)

In [None]:
ranrest

In [None]:
# Replace the row index with short string labels, one per row.
ranrest.index = ["HCS","RS"]
ranrest 

In [None]:
# Typing the data in by hand is not efficient!
# It is better to build the DataFrame by importing it from a CSV file.

## Importing a dataset

In [None]:
import pandas as pd
df = pd.read_csv("randomrestaurants.csv", sep = ";",index_col = 0) # semicolon-separated file; first column used as row labels (try removing index_col = 0 to compare)

# Main read_csv parameters (excerpt from the pandas documentation):
#
#filepath_or_buffer : str, path object or file-like object
#   Any valid string path is acceptable. The string could be a URL.
#
#sep : str, default ','
#   Delimiter to use.
#
#delimiter : str, default None
#   Alias for sep.
#
#header : int, list of int, default 'infer'
#   Row number(s) to use as the column names.
#   Default behavior is to infer the column names from the first line of the file.
#
#names : array-like, optional
#   List of column names to use.
#
#index_col : int, str, sequence of int / str, or False, default None
#   Column(s) to use as the row labels of the DataFrame.

In [None]:
df

In [None]:
# help() prints the documentation itself and returns None, so it must not be
# wrapped in print() (that would add a stray "None" after the help text).
help(pd.read_csv)

In [None]:
df.tail(5)

In [None]:
df.sample(3) 

In [None]:
# A further example of importing a dataset (read with the default separator).
import pandas as pd
df2 = pd.read_csv("glass.csv")
# try to set: index_col=0
# df2.head(2)

# Column legend:
#   RI: refractive index
#   Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
#   Mg: Magnesium
#   Al: Aluminum
#   Si: Silicon
#   K: Potassium
#   Ca: Calcium
#   Ba: Barium
#   Fe: Iron
#   Type of glass: (class attribute)

df2.head(10)

In [None]:
df2

## Index and select data

Three options:
1. using square brackets
2. loc
3. iloc

In [None]:
#square brackets
import pandas as pd
restaurants = pd.read_csv("randomrestaurants.csv", sep = ";",index_col=0)

restaurants

In [None]:
# Double brackets (a list of column names) return a one-column DataFrame.
# Single brackets, restaurants["Location"], would return a pandas Series
# (a 1-D labelled array) instead.
restaurants[["Location"]]


In [None]:
type(restaurants[["Location"]])

In [None]:
restaurants[["Location","Score"]]# data frame
#type(restaurants[["Location"]])

In [None]:
restaurants[["Location","Score"]] #select columns

In [None]:
#select rows
restaurants[1:4] # three rows in position [1,2,3]

In [None]:
# What if I want to select rows and columns together, as in a 2D NumPy array?

# You can use:
# 1) loc  (label-based)
# 2) iloc (position-based)

restaurants.loc["RS"] # a single label returns that row as a pandas Series

In [None]:
restaurants.loc[["RS"]]

In [None]:
restaurants.loc[["HCS","RS","LPQ"]]

In [None]:
#extend selecting columns
restaurants.loc[["HCS","RS","LPQ"],["Location","Score"]]

In [None]:
restaurants.loc[:,["Location","Score"]] #all rows

In [None]:
#iloc based on positions
restaurants.iloc[[1]]

In [None]:
restaurants.iloc[[1,2,3]]

In [None]:
restaurants.iloc[[1,2,3],[1,2]]

In [None]:
restaurants.iloc[:,[1,2]]

In [None]:
restaurants.iloc[:5,[0,1,2]]  

In [None]:
# Importing as a NumPy array (only works for purely numerical datasets)
import numpy as np
filename = "glass.csv"
# skiprows=1 skips the first line of the file (presumably the column header);
# add usecols=[0,2] to load only some of the columns.
data = np.loadtxt(filename, delimiter=",", skiprows=1)
data

## Looping on DataFrames

In [None]:
restaurants

In [None]:
restaurants

In [None]:
# Iterating directly over a DataFrame yields only its column names.
for column_name in restaurants:
    print(column_name)

In [None]:
for label, row in restaurants.iterrows():
    print(label,":",row)


In [None]:
# Print just the ID, the full restaurant name and the score of each row.
# An f-string converts every value to text itself, so this also works when
# the index labels or the cell values are not strings (label + ": " would
# raise a TypeError in that case).
for label, row in restaurants.iterrows():
    print(f"{label}: {row['Restaurant']} {row['Score']}")

In [None]:
# Add a column while iterating: write one cell per row through .loc,
# storing the length of the restaurant name.
for idx, record in restaurants.iterrows():
    restaurants.loc[idx, "name_len"] = len(record["Restaurant"])

restaurants

In [None]:
#or use a more efficient apply **instead of a for loop**
restaurants["name_len2"] = restaurants["Restaurant"].apply(len)

restaurants

In [None]:
restaurants["Half Score"]=restaurants["Score"]/2

restaurants

In [None]:
# Using a custom function: the lambda labels each score "A" when it is
# above 95 and "B" otherwise.


restaurants["class"] = restaurants["Score"].apply(lambda x: "A" if (x>95) else "B")

restaurants

In [None]:
restaurants["ratio"]= restaurants["Score"]/restaurants["name_len"]
restaurants

## MERGING

In [None]:
# First Dataset
restaurants = pd.read_csv("randomrestaurants.csv", sep = ";",index_col=0)
restaurants

In [None]:
# Second Dataset
restaurants2 = pd.read_csv("randomrestaurants_2.csv", sep = ";",index_col=0) # Load a different source
restaurants2

In [None]:
#  We merge the two data frames by common index
pd.merge(restaurants,restaurants2 ,left_index=True,right_index=True)  

In [None]:
 # ..but we can do it by a different column
pd.merge(restaurants,restaurants2,left_on="Restaurant",right_on="Restaurant" )

In [None]:
 # ..or pass the key columns as lists (add more names to each list to merge on multiple columns)
pd.merge(restaurants,restaurants2, left_on=["Restaurant"],right_on=["Restaurant"] )

In [None]:
# `how` chooses which keys are kept: 'left' keeps every row of the first (left) DataFrame
pd.merge(restaurants,restaurants2,left_on="Restaurant",right_on="Restaurant", how='left' ) 
# Try also how='outer', how='right' and how='inner' (the default)


![Join](join.jpg)

## Try by yourself

- Import the iris dataset as a DataFrame.
- Add the column names (sepal length, sepal width, petal length, petal width, type).
- Create a new column *ratio_length* that contains the ratio between the sepal length and the petal length.
- Add a new column named *target* with value 1 if the type is setosa and 0 otherwise.