# Installation

In [None]:
!pip install pandas

# Importing Pandas

In [1]:
import pandas as pd

# Data Structures

## Series

A Series is a one-dimensional array-like object that can hold various data types.

### 1. Creating a Series:

In [5]:
import numpy as np

# From a list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)

# From a dictionary
s = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
a    1
b    2
c    3
dtype: int64


### 2. Accessing Data in Series

In [7]:
print(s.iloc[0])  # First element
print(s['a'])  # Element with label 'a'
print(s[:3])  # First three elements
print(s[s > 2])  # Elements greater than 2

1
1
a    1
b    2
c    3
dtype: int64
c    3
dtype: int64


## DataFrame

A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure.



### 1. Creating a DataFrame:

In [9]:
# From a dictionary of lists
data = {
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)
print(df)

# From a list of dictionaries
data = [
    {'A': 1, 'B': 2},
    {'A': 3, 'B': 4, 'C': 5}
]
df = pd.DataFrame(data)
print(df)

   A  B   C
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
   A  B    C
0  1  2  NaN
1  3  4  5.0


### 2. Viewing Data

In [13]:
data = {
    'A': [1, 2, 3, 4, 11, 23, 4, 5, 5, 99],
    'B': [5, 6, 7, 8, 2, 3, 4, 11, 23, 4],
    'C': [9, 10, 11, 12, 3, 4, 11, 7, 8, 2]
}
df = pd.DataFrame(data)
print(df)

    A   B   C
0   1   5   9
1   2   6  10
2   3   7  11
3   4   8  12
4  11   2   3
5  23   3   4
6   4   4  11
7   5  11   7
8   5  23   8
9  99   4   2


In [14]:
print(df.head())  # First 5 rows

    A  B   C
0   1  5   9
1   2  6  10
2   3  7  11
3   4  8  12
4  11  2   3


In [15]:
print(df.tail(3))  # Last 3 rows

    A   B  C
7   5  11  7
8   5  23  8
9  99   4  2


In [16]:
print(df.info())  # Summary of the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       10 non-null     int64
 1   B       10 non-null     int64
 2   C       10 non-null     int64
dtypes: int64(3)
memory usage: 372.0 bytes
None


In [17]:
print(df.describe())  # Statistical summary

               A         B          C
count  10.000000  10.00000  10.000000
mean   15.700000   7.30000   7.700000
std    29.966834   6.11101   3.591657
min     1.000000   2.00000   2.000000
25%     3.250000   4.00000   4.750000
50%     4.500000   5.50000   8.500000
75%     9.500000   7.75000  10.750000
max    99.000000  23.00000  12.000000


### 3. Selecting Data:

In [19]:
print(df['A'])  # Column 'A'

0     1
1     2
2     3
3     4
4    11
5    23
6     4
7     5
8     5
9    99
Name: A, dtype: int64


In [20]:
print(df[['A', 'B']])  # Multiple columns

    A   B
0   1   5
1   2   6
2   3   7
3   4   8
4  11   2
5  23   3
6   4   4
7   5  11
8   5  23
9  99   4


In [21]:
# Selecting rows by position
print(df.iloc[0])  # First row

A    1
B    5
C    9
Name: 0, dtype: int64


In [22]:
print(df.iloc[1:3])  # Second and third rows

   A  B   C
1  2  6  10
2  3  7  11


In [23]:
# Selecting rows by label
print(df.loc[0])  # First row

A    1
B    5
C    9
Name: 0, dtype: int64


In [24]:
print(df.loc[:, ['A', 'B']])  # All rows, specific columns

    A   B
0   1   5
1   2   6
2   3   7
3   4   8
4  11   2
5  23   3
6   4   4
7   5  11
8   5  23
9  99   4


### 4. Filtering Data

In [27]:
print(df[df['A'] > 2])  # Rows where column 'A' is greater than 2

    A   B   C
2   3   7  11
3   4   8  12
4  11   2   3
5  23   3   4
6   4   4  11
7   5  11   7
8   5  23   8
9  99   4   2


In [26]:
print(df[(df['A'] > 1) & (df['B'] < 8)])  # Combined conditions

    A  B   C
1   2  6  10
2   3  7  11
4  11  2   3
5  23  3   4
6   4  4  11
9  99  4   2


# Data Manipulation

### 1. Adding/Removing Columns:

In [28]:
df['D'] = df['A'] + df['B']  # New column 'D'
print(df)

    A   B   C    D
0   1   5   9    6
1   2   6  10    8
2   3   7  11   10
3   4   8  12   12
4  11   2   3   13
5  23   3   4   26
6   4   4  11    8
7   5  11   7   16
8   5  23   8   28
9  99   4   2  103


In [29]:
del df['D']  # Remove column 'D'
print(df)

    A   B   C
0   1   5   9
1   2   6  10
2   3   7  11
3   4   8  12
4  11   2   3
5  23   3   4
6   4   4  11
7   5  11   7
8   5  23   8
9  99   4   2


### 2. Renaming Columns

In [30]:
df.rename(columns={'A': 'Alpha', 'B': 'Beta'}, inplace=True) 
# # df is modified directly, if inplace = True
print(df)

   Alpha  Beta   C
0      1     5   9
1      2     6  10
2      3     7  11
3      4     8  12
4     11     2   3
5     23     3   4
6      4     4  11
7      5    11   7
8      5    23   8
9     99     4   2


In [33]:
df_renamed = df.rename(columns={'A': 'Alpha', 'B': 'Beta'}, inplace=False)
print(df_renamed)

   Alpha  Beta   C
0      1     5   9
1      2     6  10
2      3     7  11
3      4     8  12
4     11     2   3
5     23     3   4
6      4     4  11
7      5    11   7
8      5    23   8
9     99     4   2


### 3. Handling Missing Data

In [36]:
df = pd.DataFrame({
    'A': [1, 2, np.nan],
    'B': [np.nan, 2, 3],
    'C': [1, 2, 3]
})
print(df)

     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  NaN  3.0  3


In [35]:
# Dropping missing data
print(df.dropna())

     A    B  C
1  2.0  2.0  2


In [41]:
# Filling missing data
print(df.fillna(0))
print(df.ffill())  # Forward fill
print(df.bfill())  # Backward fill

     A    B  C
0  1.0  0.0  1
1  2.0  2.0  2
2  0.0  3.0  3
     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  2.0  3.0  3
     A    B  C
0  1.0  2.0  1
1  2.0  2.0  2
2  NaN  3.0  3


# Grouping and Aggregation

### 1. Grouping

In [50]:
data = {
    'A': ['foo', 'bar', 'foo', 'bar'],
#     'B': ['one', 'one', 'two', 'two'],
    'C': [1, 2, 3, 4],
    'D': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
print(df)

     A  C   D
0  foo  1  10
1  bar  2  20
2  foo  3  30
3  bar  4  40


In [52]:
grouped = df.groupby('A')
print(grouped.mean())  # Mean of groups

       C     D
A             
bar  3.0  30.0
foo  2.0  20.0


### 2. Aggregation

In [53]:
print(df.groupby('A').agg({'C': 'sum', 'D': 'mean'}))

     C     D
A           
bar  6  30.0
foo  4  20.0


# Merging and Joining

### 1. Concatenation

In [54]:
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
df2 = pd.DataFrame({
    'A': ['A4', 'A5', 'A6', 'A7'],
    'B': ['B4', 'B5', 'B6', 'B7']
})

result = pd.concat([df1, df2])
print(result)

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
0  A4  B4
1  A5  B5
2  A6  B6
3  A7  B7


### 2. Merging:

In [55]:
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3']
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K4'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

result = pd.merge(left, right, on='key', how='inner')  # Inner join
print(result)

  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2


### 3. Joining

In [56]:
left = left.set_index('key')
right = right.set_index('key')

result = left.join(right, how='outer')  # Outer join
print(result)

       A    B
key          
K0    A0   B0
K1    A1   B1
K2    A2   B2
K3    A3  NaN
K4   NaN   B3


# Working with Dates

### 1. Creating Date Ranges

In [63]:
rng = pd.date_range('2023-01-01', periods=10, freq='D')
print(rng)

ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10'],
              dtype='datetime64[ns]', freq='D')
2023-01-01   -0.829694
2023-01-02   -0.349804
2023-01-03   -1.046260
2023-01-04   -0.188490
2023-01-05   -0.938989
2023-01-06   -0.309807
2023-01-07   -0.177561
2023-01-08   -0.292008
2023-01-09   -0.813268
2023-01-10   -0.184606
Freq: D, dtype: float64


### 2. Date Manipulation

In [64]:
ts = ts.cumsum()
print(ts)

2023-01-01   -0.829694
2023-01-02   -1.179497
2023-01-03   -2.225757
2023-01-04   -2.414247
2023-01-05   -3.353236
2023-01-06   -3.663043
2023-01-07   -3.840604
2023-01-08   -4.132612
2023-01-09   -4.945880
2023-01-10   -5.130486
Freq: D, dtype: float64


In [65]:
# Resampling
print(ts.resample('ME').mean())  # Monthly mean

2023-01-31   -3.171506
Freq: ME, dtype: float64


# Reading Data and Writing Data

In [None]:
df = pd.read_csv('data.csv')  # CSV file
print(df)

df = pd.read_excel('data.xlsx')  # Excel file
print(df)

df = pd.read_sql('SELECT * FROM table', connection)  # SQL query
print(df)

In [None]:
df.to_csv('output.csv')
df.to_excel('output.xlsx')
df.to_sql('table', connection)