In [4]:
import pandas as pd 
import numpy as np

In [3]:
# Record the pandas release in use so the outputs below can be tied to a version.
print(f'Current pandas version: {pd.__version__}')

Current pandas version: 2.2.3


In [7]:
# Start from a plain list, wrap it in a NumPy array, then hand the array to pandas.
# The Series gets a default 0..n-1 index.
lst = list(range(1, 7))
arr = np.array(lst)
pd.Series(arr)

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [9]:
# Supply explicit index labels; they are attached to the values position by position
# (label 3 -> 'Deepinder', label 2 -> 'Lovepreet', and so on).
pd.Series(data=['Deepinder','Lovepreet','Jashan','Sukhjinder'], index=[3,2,4,1])

3     Deepinder
2     Lovepreet
4        Jashan
1    Sukhjinder
dtype: object

In [10]:
# A dict maps straight onto a Series: the keys become the index labels
# and the values become the data.
steps = dict(day_1=2000, day_2=4000, day_3=12000)
pd.Series(steps)

day_1     2000
day_2     4000
day_3    12000
dtype: int64

#### Repeat function - repeats each value of a Series a specified number of times

In [16]:
# repeat(3) emits the single value three times; every copy keeps the original label 0.
pd.Series([12]).repeat(3)

0    12
0    12
0    12
dtype: int64

#### Using reset_index to replace the duplicated labels with a fresh 0..n-1 index

In [19]:
# Only the last expression of a cell is displayed, so the first call below
# is evaluated but its result is not shown.
pd.Series([12]).repeat(3).reset_index() #returns a DataFrame: the old index is kept as a column next to the new one
pd.Series([12]).repeat(3).reset_index(drop=True) #drop=True discards the old index and returns a Series with a new one

0    12
1    12
2    12
dtype: int64

#### More control over "repeat": a separate count for each element

In [22]:
# A list of counts is matched to the elements one-to-one:
# 15 is repeated 3 times and 20 is repeated 2 times.
s = pd.Series([15, 20]).repeat(repeats=[3, 2])

#### Accessing elements

In [28]:
s[0] # uses s from the previous cell, where label 0 is shared by all copies of 15; result is not displayed
s = pd.Series([15,20]).repeat([3,2]).reset_index(drop=True)
s[4] # displays the value stored under index label 4 (unique after the reset)

np.int64(20)

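#### Aggregate functions - `Series.agg` is not discontinued; pass function names as strings (e.g. `ser.agg(['min', 'max', 'sum'])`). The dedicated methods `min()`, `max()` and `sum()` give the same results.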

In [40]:
ser = pd.Series([1, 2, 3, 4, 5, 6, 7])
# Series.agg is still supported. Name the reducers as strings: passing Python
# builtins (ser.agg([min])) is the spelling recent pandas releases warn about.
# The result is a Series indexed by reducer name.
stats = ser.agg(['min', 'max', 'sum'])
print(f'Min value: {stats["min"]}')
print(f'Max value: {stats["max"]}')
print(f'Sum value: {stats["sum"]}')

Min value: 1
Max value: 7
Sum value: 28


In [43]:
# abs() returns a new Series holding the absolute value of each element;
# the negative entries below come back positive.
ser = pd.Series([1,-2,3,-4,5,-6,7])
ser.abs()

0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64

#### Appending two series to one.

In [47]:
ser = pd.Series([1, -2, 3, -4, 5, -6, 7])
ser2 = pd.Series([1, 2, 3, 4, 5, 6, 7])
# Series.append was removed in pandas 2.0 and _append is a private method,
# so combine the two Series with the public pd.concat instead.
# ignore_index=True renumbers the result 0..n-1; without it both inputs
# would keep their own labels and 0..6 would appear twice.
combined = pd.concat([ser, ser2], ignore_index=True)
combined

0     1
1    -2
2     3
3    -4
4     5
5    -6
6     7
7     1
8     2
9     3
10    4
11    5
12    6
13    7
dtype: int64

#### Astype

In [51]:
# Inspect the Python type of the first element before and after casting.
ser = pd.Series([1, 2, 3, 4, 5])
element_type = type(ser[0])
print(f'Type of the data before: {element_type}')

# astype returns a converted copy, so rebind the name to keep the result.
ser = ser.astype('str')
element_type = type(ser[0])
print(f'Type of the data after: {element_type}')

Type of the data before: <class 'numpy.int64'>
Type of the data after: <class 'str'>


#### Between

In [52]:
# between(5,7) returns a boolean Series marking the elements inside the range;
# as the output shows, both endpoints (5 and 7) count as inside.
ser = pd.Series([1,2,3,4,5,6,7,8,9,10])
ser.between(5,7)

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7    False
8    False
9    False
dtype: bool

#### String functions - 

In [58]:
ser = pd.Series(['Deep','Inder','Singh','Brar'])
# The .str accessor applies a string operation to every element and returns a new Series.
# Only the last expression of a cell is displayed, so only the count result is shown.
ser.str.upper() # upper-case copy of each string
ser.str.lower() # lower-case copy of each string
ser.str.contains('e') # True where the string contains a lower-case 'e'
ser.str.replace('e','E') # each 'e' replaced by 'E'
ser.str.count('e') # number of 'e' occurrences per string

0    2
1    1
2    0
3    0
dtype: int64

#### Creating a Dataframe - 

In [60]:
# A flat list becomes a one-column DataFrame; the column gets the default label 0.
df = pd.DataFrame(list(range(1, 10)))
df

In [64]:
# Dict keys become the column names; each list supplies that column's values.
pd.DataFrame({'name':['Deepinder','Navu','Dilpreet'], 'age':[19,25,31]})

Unnamed: 0,name,age
0,Deepinder,19
1,Navu,25
2,Dilpreet,31


#### Slicing in Dataframes using loc and iloc

In [70]:
# Four columns of four rows each; keyword order fixes the column order.
data = dict(
    one=[1, 2, 3, 4],
    two=[10, 20, 30, 40],
    three=[100, 200, 300, 400],
    four=[1000, 2000, 3000, 4000],
)
df = pd.DataFrame(data)

# loc slices by label and includes both ends of the slice.
df.loc[1:2]  # rows labelled 1 and 2, all columns (not displayed: not the last expression)
df.loc[1:2, 'two':'three']  # same rows, columns 'two' through 'three'

Unnamed: 0,two,three
1,20,200
2,30,300


In [79]:
#iloc selects by integer position instead of by label
df.iloc[:2,:] #the part before the comma selects rows, the part after it selects columns
# df.iloc[:2,1:4]

#lists of positions can be used as well -
df.iloc[[0,1],[1,3]] #rows at positions 0 and 1, columns at positions 1 and 3, so only 'two' and 'four' are shown


Unnamed: 0,two,four
0,10,1000
1,20,2000


In [81]:
#adding a new column to the existing DataFrame
fifth = [1,2,3,4]
df['fifth'] = fifth # assigning to a column label that does not exist yet adds the column to df in place
df

Unnamed: 0,one,two,three,four,fifth
0,1,10,100,1000,1
1,2,20,200,2000,2
2,3,30,300,3000,3
3,4,40,400,4000,4


#### Deleting a column - 

In [82]:
# pop removes the column from df in place and returns the removed column as a Series.
df.pop('fifth')

0    1
1    2
2    3
3    4
Name: fifth, dtype: int64

#### Transpose - converts the rows to columns and vice versa

In [85]:
df.T # the column names become the row labels and the row labels become the column names

Unnamed: 0,0,1,2,3
one,1,2,3,4
two,10,20,30,40
three,100,200,300,400
four,1000,2000,3000,4000


- axes : returns a list holding the row index and the column index (i.e. the row labels and the column names).

In [88]:
# axes holds two entries: the row index first, then the column index.
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['one', 'two', 'three', 'four'], dtype='object')]

- ndim : returns the number of dimensions your data has 

In [89]:
# ndim is 2 for this DataFrame: one dimension for rows and one for columns.
df.ndim

2

- dtypes : tells what kind of data each column holds

In [90]:
# dtypes returns a Series giving the data type of each column.
df.dtypes

one      int64
two      int64
three    int64
four     int64
dtype: object

#### Mean, Median, Mode

In [99]:
mean = df.mean() # per-column mean, returned as a Series indexed by column name
median = df.median() # per-column median
mod = df['one'].mode() # mode of column 'one'; every value occurs once there, so all four values are returned
print(f'Mean: {mean}\nMedian:{median}\nMod:{mod}')

Mean: one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64
Median:one         2.5
two        25.0
three     250.0
four     2500.0
dtype: float64
Mod:0    1
1    2
2    3
3    4
Name: one, dtype: int64


#### Min, Max, Standard Deviation

In [104]:
df.min() # per-column minimum; not displayed because it is not the cell's last expression
# df.max()
df.std() # per-column standard deviation; this is the result shown below

one         1.290994
two        12.909944
three     129.099445
four     1290.994449
dtype: float64

- pipe method : passes the whole DataFrame as the first argument to a function (any extra arguments follow it), so custom functions can be chained like built-in methods

In [107]:
def add_(i, j):
    """Return ``i + j``.

    Works for any pair of operands that support ``+``; with a DataFrame
    as ``i`` and a scalar as ``j`` the addition is applied element-wise.
    """
    total = i + j
    return total
df.pipe(add_,10) # calls add_(df, 10); DataFrame + 10 is element-wise, so every value in every column grows by 10

Unnamed: 0,one,two,three,four
0,11,20,110,1010
1,12,30,210,2010
2,13,40,310,3010
3,14,50,410,4010


- apply : calls a function once per column by default (`axis=0`) and collects the results

In [114]:
df.apply(np.mean) # runs np.mean on each column; not displayed because it is not the last expression
df.apply(lambda x: x.max() - x.min()) # per-column range: the column's max value minus its min value

one         3
two        30
three     300
four     3000
dtype: int64

#### Renaming the columns - 

In [117]:
# `columns` maps old column names to new ones and `index` maps old row labels to new ones.
# Labels missing from a mapping ('three', 'four') are left as they are.
# The renamed DataFrame is only displayed here; df is not reassigned.
df.rename(columns={'one':'ONE','two':'TWO'}, index={0:'a',1:'b',2:'c',3:'d'})

Unnamed: 0,ONE,TWO,three,four
a,1,10,100,1000
b,2,20,200,2000
c,3,30,300,3000
d,4,40,400,4000
