In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Series: build a one-dimensional pandas Series from a plain Python list
# and display it together with its type.
numbers = [3, 4, 5, 6, 7]
serie = pd.Series(data=numbers)
(serie, type(serie))

(0    3
 1    4
 2    5
 3    6
 4    7
 dtype: int64,
 pandas.core.series.Series)

In [3]:
# Dictionary: each key is a column name and each value is the list of
# entries for that column.
data = {
    "Name": ["Ana", "Juan", "Pedro"],
    "YearsOld": [25, 23, 21],
    "Citys": ["NY", "LONDON", "New Zeland"],
}
(data, type(data))

({'Name': ['Ana', 'Juan', 'Pedro'],
  'YearsOld': [25, 23, 21],
  'Citys': ['NY', 'LONDON', 'New Zeland']},
 dict)

In [4]:
# DataFrame: turn the dictionary of columns into a table.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
1,Juan,23,LONDON
2,Pedro,21,New Zeland


In [5]:
# Export the DataFrame to a CSV file. The row index is written as the
# first (unnamed) column, which is the default behaviour.
df.to_csv("data.csv", index=True)

In [6]:
# Import the CSV again. index_col=0 tells pandas to use the first column of
# the file as the row index instead of loading it as a regular data column.
import_df = pd.read_csv(filepath_or_buffer="data.csv", index_col=0)
import_df

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
1,Juan,23,LONDON
2,Pedro,21,New Zeland


In [7]:
# Selecting a single column by its label gives back a Series.
names = df.loc[:, "Name"]
print(names)


0      Ana
1     Juan
2    Pedro
Name: Name, dtype: object


In [8]:
# Selecting several columns: pass a list of labels to get a DataFrame
# restricted to those columns.
df.loc[:, ["Name", "YearsOld"]]

Unnamed: 0,Name,YearsOld
0,Ana,25
1,Juan,23
2,Pedro,21


In [9]:
# Select one row by its index label; the row comes back as a Series whose
# index is the column names.
fil = df.loc[1, :]
fil

Name          Juan
YearsOld        23
Citys       LONDON
Name: 1, dtype: object

In [10]:
# Filter rows with a boolean condition: keep only the rows where the
# comparison is True.
df.loc[df["YearsOld"] > 22]

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
1,Juan,23,LONDON


In [11]:
# Filter with several conditions combined by & (element-wise AND). Each
# condition needs its own parentheses because & binds tighter than the
# comparison operators.
# The mask is deliberately not called `filter`: that name would shadow the
# built-in filter() for every cell executed afterwards.
mask = (df["YearsOld"] > 21) & (df["Name"].str.startswith("A"))
df[mask]

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY


In [12]:
# Filter with DataFrame.query: the condition is written as an expression
# string that refers to columns by name.
df.query(expr="YearsOld > 22")

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
1,Juan,23,LONDON


In [13]:
# Keep the rows whose "Name" appears in an explicit list of values;
# listed values that are absent from the column simply match nothing.
df.loc[df["Name"].isin(["Calors", "Ana", "Pedro", "Xavi"])]

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
2,Pedro,21,New Zeland


In [14]:
# Filter by a range of values: keep the rows whose age lies between the
# two bounds.
df.loc[df["YearsOld"].between(left=23, right=26)]

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25,NY
1,Juan,23,LONDON


In [15]:
# How to apply a custom function to the data?

def long_5(name):
    """Return True when ``name`` has exactly five characters, else False."""
    expected_length = 5
    return len(name) == expected_length

# Run long_5 on every name and keep only the rows where it returned True.
df.loc[df["Name"].apply(long_5)]

Unnamed: 0,Name,YearsOld,Citys
2,Pedro,21,New Zeland


In [5]:
# DataFrame with missing values: np.nan marks a missing number and None a
# missing string. The numeric column is stored as float because of the NaN.
# NOTE(review): imports conventionally live in the first cell of a notebook.
import numpy as np

data = {
    "Name": ["Ana", "Juan", "Pedro"],
    "YearsOld": [25, np.nan, 21],
    "Citys": ["NY", "New Zeland", None],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY
1,Juan,,New Zeland
2,Pedro,21.0,


In [6]:
# Fill in missing values column by column: a missing age is replaced by the
# mean of the known ages, a missing city by the placeholder "Stranger".
df_fill = df.fillna(
    value={
        "YearsOld": df["YearsOld"].mean(),
        "Citys": "Stranger",
    }
)
df_fill

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY
1,Juan,23.0,New Zeland
2,Pedro,21.0,Stranger


In [7]:
# Drop every row that contains at least one missing value.
df_sin_nan = df.dropna(axis=0, how="any")
df_sin_nan

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY


In [9]:
# Replace specific values in a given column: the outer key is the column,
# the inner mapping is {old value: new value}.
city_replacements = {None: "Stranger"}
df_reem = df.replace(to_replace={"Citys": city_replacements})
df_reem

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY
1,Juan,,New Zeland
2,Pedro,21.0,Stranger


In [11]:
# Interpolate the missing ages from their neighbours. assign() works on a
# copy, so the original df is left untouched.
df_interpolate = df.assign(YearsOld=df["YearsOld"].interpolate())
df_interpolate

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY
1,Juan,23.0,New Zeland
2,Pedro,21.0,


In [12]:
# Remove duplicated rows: the last two records repeat the first two, so only
# the first occurrence of each row is kept.
data_duplicate = {
    "Name": ["Ana", "Juan", "Pedro", "Ana", "Juan"],
    "YearsOld": [25, np.nan, 21, 25, np.nan],
    "Citys": ["NY", "New Zeland", None, "NY", "New Zeland"],
}
df_duplicate = pd.DataFrame(data=data_duplicate)
df_not_duplicate = df_duplicate.drop_duplicates(keep="first")
df_not_duplicate

Unnamed: 0,Name,YearsOld,Citys
0,Ana,25.0,NY
1,Juan,,New Zeland
2,Pedro,21.0,


In [13]:
# Rename the columns through an {old name: new name} mapping; the result is
# a new DataFrame.
df_rename = df.rename(
    columns={
        "Name": "Nombres",
        "YearsOld": "Edad",
        "Citys": "Ciudad",
    }
)
df_rename

Unnamed: 0,Nombres,Edad,Ciudad
0,Ana,25.0,NY
1,Juan,,New Zeland
2,Pedro,21.0,


In [14]:
# Reorder the columns by selecting them in the desired order.
sort_cols = ["Citys", "Name", "YearsOld"]
df_sort = df.loc[:, sort_cols]
df_sort

Unnamed: 0,Citys,Name,YearsOld
0,NY,Ana,25.0
1,New Zeland,Juan,
2,,Pedro,21.0


In [15]:
# Transform data with a custom function.
def square(x):
    """Return ``x`` raised to the power of two."""
    squared = x ** 2
    return squared
# Add a derived column by applying `square` to every age. This writes the
# new column directly into df.
df["square_YearsOld"] = df["YearsOld"].apply(func=square)
df


Unnamed: 0,Name,YearsOld,Citys,square_YearsOld
0,Ana,25.0,NY,625.0
1,Juan,,New Zeland,
2,Pedro,21.0,,441.0


In [17]:
# Sample records; note that the ages in "YearsOlds" are stored as strings.
data = {
    "Names": ["Alice", "Bob", "Carlos", "Diana", "Eva"],
    "Citys": ["New York", "Phoenix", "Chicago", "New York", "Phoenix"],
    "YearsOlds": ["25", "30", "35", "40", "45"],
    "Points": [88, 92, 75, 83, 90],
}
df = pd.DataFrame(data=data)

# Group the rows by city and print which row labels belong to each group.
grouped = df.groupby(by="Citys")
print(grouped.groups)

{'Chicago': [2], 'New York': [0, 3], 'Phoenix': [1, 4]}
