In [2]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal draws, indexed by `dates`, columns A-D.
# NOTE(review): no random seed is set in the visible cells, so these values
# change on every run and will not match the outputs recorded below.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [6]:
# Build a frame from per-column specs. Scalar entries ("A", "B", "F") are
# broadcast to the length of the array-like entries (4 rows here).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [8]:
# df2.A                  df2.bool
# df2.abs                df2.boxplot
# df2.add                df2.C
# df2.add_prefix         df2.clip
# df2.add_suffix         df2.columns
# df2.align              df2.copy
# df2.all                df2.count
# df2.any                df2.combine
# df2.append             df2.D
# df2.apply              df2.describe
# df2.applymap           df2.diff
# df2.B                  df2.duplicated



These are various attributes, methods, and functions associated with a DataFrame object in pandas, a popular library for data manipulation and analysis in Python.

1. `df2.A`: This accesses the column named "A" in the DataFrame `df2`.
2. `df2.bool()`: This is a built-in pandas method that returns the boolean value of a DataFrame containing exactly one boolean element, and raises an error otherwise. It is deprecated in recent pandas 2.x releases, so avoid it in new code.
3. `df2.abs()`: This method returns the absolute numeric value of each element in the DataFrame.
4. `df2.boxplot()`: This method generates a box plot of the numeric columns in the DataFrame.
5. `df2.add()`: This method adds a scalar, array-like, or DataFrame to the DataFrame.
6. `df2.C`: This accesses the column named "C" in the DataFrame `df2`.
7. `df2.add_prefix()`: This method adds a prefix to the column labels.
8. `df2.clip()`: This method limits the values in the DataFrame to be within a specified range.
9. `df2.add_suffix()`: This method adds a suffix to the column labels.
10. `df2.columns`: This attribute returns the column labels of the DataFrame.
11. `df2.align()`: This method aligns two DataFrames on their axes.
12. `df2.copy()`: This method creates a deep copy of the DataFrame.
13. `df2.all()`: This method checks, column by column by default, whether all elements are True (truthy). It returns a boolean Series with one entry per column rather than a single True/False.
14. `df2.count()`: This method counts the non-null values in each column of the DataFrame.
15. `df2.any()`: This method checks, column by column by default, whether any element is True (truthy). It returns a boolean Series with one entry per column rather than a single True/False.
16. `df2.combine()`: This method combines the DataFrame with another DataFrame using a given function.
17. `df2.append()`: This method appended rows of another DataFrame to the end of the current DataFrame. It was deprecated in pandas 1.4 and removed in pandas 2.0; use `pd.concat([df2, other])` instead.
18. `df2.D`: This accesses the column named "D" in the DataFrame `df2`.
19. `df2.apply()`: This method applies a function along an axis of the DataFrame.
20. `df2.describe()`: This method generates descriptive statistics of the DataFrame.
21. `df2.applymap()`: This method applies a function to every element of the DataFrame. It is deprecated since pandas 2.1 in favor of the equivalent `df2.map()`.
22. `df2.diff()`: This method calculates the difference between consecutive elements of the DataFrame.
23. `df2.B`: This accesses the column named "B" in the DataFrame `df2`.
24. `df2.duplicated()`: This method returns a boolean Series indicating duplicate rows.
  
These are just brief explanations of these DataFrame attributes, methods, and functions. For more detailed information, you can refer to the pandas documentation or experiment with them in your own Python environment.

In [9]:
df.to_numpy()

array([[-1.69030393,  0.47844783,  1.11237   ,  1.46416822],
       [ 0.47829607, -0.03017008, -0.43517095, -0.1618713 ],
       [ 0.03739816,  0.27704389,  0.90697181,  0.20885241],
       [ 1.44670774, -0.9634265 ,  1.20133381,  1.53968435],
       [ 1.53608696,  0.46970866,  0.34488402,  0.42944234],
       [ 0.2231296 ,  0.89591631,  2.23048743,  0.01204422]])

In [10]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df2.describe()

Unnamed: 0,A,B,C,D
count,4.0,4,4.0,4.0
mean,1.0,2013-01-02 00:00:00,1.0,3.0
min,1.0,2013-01-02 00:00:00,1.0,3.0
25%,1.0,2013-01-02 00:00:00,1.0,3.0
50%,1.0,2013-01-02 00:00:00,1.0,3.0
75%,1.0,2013-01-02 00:00:00,1.0,3.0
max,1.0,2013-01-02 00:00:00,1.0,3.0
std,0.0,,0.0,0.0


In [13]:
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [14]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [15]:
df.sort_index(axis=1, ascending=False)  # sort index for columns in descending order

Unnamed: 0,D,C,B,A
2013-01-01,1.464168,1.11237,0.478448,-1.690304
2013-01-02,-0.161871,-0.435171,-0.03017,0.478296
2013-01-03,0.208852,0.906972,0.277044,0.037398
2013-01-04,1.539684,1.201334,-0.963427,1.446708
2013-01-05,0.429442,0.344884,0.469709,1.536087
2013-01-06,0.012044,2.230487,0.895916,0.22313


In [16]:
df.sort_index(axis=0, ascending=False)  # sort index for rows in descending order

Unnamed: 0,A,B,C,D
2013-01-06,0.22313,0.895916,2.230487,0.012044
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-01,-1.690304,0.478448,1.11237,1.464168


In [17]:
df.sort_values(by="B")  # sort values by column B in ascending order

Unnamed: 0,A,B,C,D
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-06,0.22313,0.895916,2.230487,0.012044


# Getitem ([])

In [18]:
df["A"] # select column A

2013-01-01   -1.690304
2013-01-02    0.478296
2013-01-03    0.037398
2013-01-04    1.446708
2013-01-05    1.536087
2013-01-06    0.223130
Freq: D, Name: A, dtype: float64

In [19]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [20]:
df[0:3]  # select the first three rows (positions 0, 1, 2); the stop position 3 is excluded

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852


In [21]:
df["20130102":"20130104"]   # select rows from 20130102 to 20130104 (inclusive)

Unnamed: 0,A,B,C,D
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684


# Selection by label

In [22]:
df.loc[dates[0]] # select row at index 0 

A   -1.690304
B    0.478448
C    1.112370
D    1.464168
Name: 2013-01-01 00:00:00, dtype: float64

In [23]:
df.loc[:, ["A", "B"]] # selects all rows and only the columns labeled "A" and "B" from the DataFrame df.

Unnamed: 0,A,B
2013-01-01,-1.690304,0.478448
2013-01-02,0.478296,-0.03017
2013-01-03,0.037398,0.277044
2013-01-04,1.446708,-0.963427
2013-01-05,1.536087,0.469709
2013-01-06,0.22313,0.895916


In [24]:
df.loc["20130102":"20130104", ["A", "B"]] # select rows from 20130102 to 20130104 (inclusive/all-in) 
# and only the columns labeled "A" and "B"

Unnamed: 0,A,B
2013-01-02,0.478296,-0.03017
2013-01-03,0.037398,0.277044
2013-01-04,1.446708,-0.963427


In [25]:
df.loc[dates[0], "A"]  # select value at index 0, column A

-1.6903039256735195

In [27]:
df.at[dates[0], "A"] # select value at index 0, column A

-1.6903039256735195

# Selection by position

In [29]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


The df.iloc[] method in pandas is used for integer-location based indexing. It is primarily used to access rows and columns in a DataFrame based on their integer position rather than their label.

In [28]:
df.iloc[3]  # select the row at integer position 3, i.e. df.iloc[row_position]


A    1.446708
B   -0.963427
C    1.201334
D    1.539684
Name: 2013-01-04 00:00:00, dtype: float64

df.iloc[3:5, 0:2] would return a DataFrame containing rows with integer indices 3 and 4, and columns with integer indices 0 and 1.

So, it would return a subset of the original DataFrame df consisting of rows 3 and 4, and columns 0 and 1.

In [30]:
df.iloc[3:5, 0:2] # select rows at positions 3-4 and columns at positions 0-1 (stop positions 5 and 2 are excluded)
# df.iloc[start_row:stop_row, start_col:stop_col] -- the stop positions are exclusive


Unnamed: 0,A,B
2013-01-04,1.446708,-0.963427
2013-01-05,1.536087,0.469709


In [32]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [31]:
df.iloc[[1, 2, 4], [0, 2]] # select rows at index 1, 2, and 4 and columns at index 0 and 2

Unnamed: 0,A,C
2013-01-02,0.478296,-0.435171
2013-01-03,0.037398,0.906972
2013-01-05,1.536087,0.344884


In [33]:
df.iloc[1:3, :] # select rows from index 1 to 2 (inclusive) and all columns

Unnamed: 0,A,B,C,D
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852


In [34]:
df.iloc[:, 1:3] # select all rows and columns from index 1 to 2 (inclusive)


Unnamed: 0,B,C
2013-01-01,0.478448,1.11237
2013-01-02,-0.03017,-0.435171
2013-01-03,0.277044,0.906972
2013-01-04,-0.963427,1.201334
2013-01-05,0.469709,0.344884
2013-01-06,0.895916,2.230487


In [36]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.690304,0.478448,1.11237,1.464168
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [35]:
df.iloc[1, 1] # select row at index 1 and column at index 1

-0.030170079915278022

In [37]:
df.iat[1, 1] # select row at index 1 and column at index 1

-0.030170079915278022

# Boolean indexing

In [38]:
df[df["A"] > 0]     # select all rows where column A is greater than 0

Unnamed: 0,A,B,C,D
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,-0.963427,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [39]:
df[df > 0] # keep values greater than 0 and replace every other value with NaN (the shape is unchanged)

Unnamed: 0,A,B,C,D
2013-01-01,,0.478448,1.11237,1.464168
2013-01-02,0.478296,,,
2013-01-03,0.037398,0.277044,0.906972,0.208852
2013-01-04,1.446708,,1.201334,1.539684
2013-01-05,1.536087,0.469709,0.344884,0.429442
2013-01-06,0.22313,0.895916,2.230487,0.012044


In [40]:
df2 = df.copy()

In [41]:
df2["E"] = ["one", "one", "two", "three", "four", "three"]

df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.690304,0.478448,1.11237,1.464168,one
2013-01-02,0.478296,-0.03017,-0.435171,-0.161871,one
2013-01-03,0.037398,0.277044,0.906972,0.208852,two
2013-01-04,1.446708,-0.963427,1.201334,1.539684,three
2013-01-05,1.536087,0.469709,0.344884,0.429442,four
2013-01-06,0.22313,0.895916,2.230487,0.012044,three


.isin(["two", "four"]): This part checks whether each value in the column "E" is present in the list ["two", "four"]. It returns a boolean Series where each element is True if the corresponding value in "E" is either "two" or "four", and False otherwise.

In [42]:
df2[df2["E"].isin(["two", "four"])] # select all rows where column E is either "two" or "four"

Unnamed: 0,A,B,C,D,E
2013-01-03,0.037398,0.277044,0.906972,0.208852,two
2013-01-05,1.536087,0.469709,0.344884,0.429442,four


# Setting


1. `df["F"] = s1`:
   - This line assigns the values from Series `s1` to a new column in DataFrame `df`, labeled "F".
   - It effectively adds a new column to the DataFrame `df` and populates it with the values from the Series `s1`.
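   - Values are aligned on index labels, not on position or length: each value of `s1` goes to the row of `df` with the same label. Rows of `df` whose label is missing from `s1` get `NaN`, and labels of `s1` that do not exist in `df` are dropped.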



In [45]:
# Series with a datetime index running 2013-01-02 .. 2013-01-07, i.e. shifted
# one day relative to df's index (2013-01-01 .. 2013-01-06).
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))

# Column assignment aligns on index labels, not position: 2013-01-01 has no
# match in s1 and becomes NaN, while s1's 2013-01-07 entry is dropped.
df["F"] = s1

In [46]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

2. `df.at[dates[0], "A"] = 0`:
   - This line accesses and modifies a single element in DataFrame `df` at the intersection of the row labeled `dates[0]` and the column labeled "A".
   - `df.at[]` is a method optimized for accessing and modifying a single scalar value in a DataFrame.
   - It sets the value at the specified location (row `dates[0]`, column "A") to `0`.
   - This is an efficient way to directly modify a specific element in the DataFrame without needing to create a boolean mask or use other indexing methods.

Overall, these two lines of code demonstrate how to add a new column to a DataFrame and how to modify a specific element within it.

In [47]:
df.at[dates[0], "A"] = 0 # set value at row 0, column A to 0

In [48]:
df.iat[0, 1] = 0  # set value at row 0, column B to 0

np.array([5] * len(df)): This creates a NumPy array containing the value 5 repeated len(df) times. This repetition ensures that the length of the array matches the number of rows in DataFrame df.

df.loc[:, "D"]: This accesses all rows (:) and the column labeled "D" in DataFrame df. It selects all rows in the DataFrame and the column labeled "D" specifically.

=: This assignment operator assigns the values from the NumPy array to the selected portion of the DataFrame.

Putting it all together, the line of code assigns the value 5 to every element in the column "D" of DataFrame df. Column "D" already exists here, so its values are overwritten (if the column did not exist, `.loc` would create it). The existing column keeps its float64 dtype, which is why the output below shows 5.0 rather than 5. The loc function is used here for label-based indexing, ensuring that the assignment occurs in the correct column.

In [49]:
df.loc[:, "D"] = np.array([5] * len(df)) # set every row of column D to 5

In [50]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.11237,5.0,
2013-01-02,0.478296,-0.03017,-0.435171,5.0,1.0
2013-01-03,0.037398,0.277044,0.906972,5.0,2.0
2013-01-04,1.446708,-0.963427,1.201334,5.0,3.0
2013-01-05,1.536087,0.469709,0.344884,5.0,4.0
2013-01-06,0.22313,0.895916,2.230487,5.0,5.0


In [52]:
df2 = df.copy()  # create a copy of df
df2[df2 > 0] = -df2    # set all values in df2 that are greater than 0 to -df2 (i.e., flip the sign)

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.11237,-5.0,
2013-01-02,-0.478296,-0.03017,-0.435171,-5.0,-1.0
2013-01-03,-0.037398,-0.277044,-0.906972,-5.0,-2.0
2013-01-04,-1.446708,-0.963427,-1.201334,-5.0,-3.0
2013-01-05,-1.536087,-0.469709,-0.344884,-5.0,-4.0
2013-01-06,-0.22313,-0.895916,-2.230487,-5.0,-5.0
