In [34]:
import pandas as pd
import numpy as np

In [5]:
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
s.dtype

dtype('int64')

In [4]:
s.values

array([1, 2, 3, 4, 5], dtype=int64)

In [6]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [13]:
s.name = 'calories'
s

0    1
1    2
2    3
3    4
4    5
Name: calories, dtype: int64

<h2>Indexing</h2>

In [8]:
s[0]

1

In [9]:
s[0:2]

0    1
1    2
Name: numbers, dtype: int64

In [10]:
s[2:4]

2    3
3    4
Name: numbers, dtype: int64

In [11]:
#iloc -> location based indexing
s.iloc[3]

4

In [12]:
s.iloc[[1,3,4]]

1    2
3    4
4    5
Name: numbers, dtype: int64

In [15]:
index = ['apple','bannana','grapes','orange','strawberry']
s.index =index
s

apple         1
bannana       2
grapes        3
orange        4
strawberry    5
Name: calories, dtype: int64

In [16]:
s['grapes']

3

In [17]:
#loc -> label based indexing,start and stop both are included
s.loc['grapes']

3

In [18]:
s['bannana':'orange']

bannana    2
grapes     3
orange     4
Name: calories, dtype: int64

In [20]:
fruit_protein = {
    "Avocado": 2.0,      
    "Guava": 2.6,
    "Blackberries": 2.0,
    "Oranges": 0.9,
    "Banana": 1.1,
    "Apples": 0.3,
    "Strawberries": 0.8,
    "Raspberries": 1.5,
    "Pineapple": 0.5,
    "Mango": 0.8,
    "Blueberries": 0.7,
    "Pomegranate": 1.7,
    "Papaya": 0.5,
    "Watermelon": 0.6,
    "Cantaloupe": 0.8,
    "Cherries": 1.0,
    "Kiwi": 1.1,
    "Grapes": 0.6,
    "Peach": 0.9,
    "Pear": 0.4
}
s2 = pd.Series(fruit_protein)
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Strawberries    0.8
Raspberries     1.5
Pineapple       0.5
Mango           0.8
Blueberries     0.7
Pomegranate     1.7
Papaya          0.5
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Kiwi            1.1
Grapes          0.6
Peach           0.9
Pear            0.4
dtype: float64

In [21]:
s2.name = 'Proteins'

In [22]:
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Strawberries    0.8
Raspberries     1.5
Pineapple       0.5
Mango           0.8
Blueberries     0.7
Pomegranate     1.7
Papaya          0.5
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Kiwi            1.1
Grapes          0.6
Peach           0.9
Pear            0.4
Name: Proteins, dtype: float64

<h2>Conditional Selection</h2>

In [24]:
s2[s2>1]

Avocado         2.0
Guava           2.6
Blackberries    2.0
Banana          1.1
Raspberries     1.5
Pomegranate     1.7
Kiwi            1.1
Name: Proteins, dtype: float64

<h2>Logical Operators</h2>

In [27]:
s2[(s2>0.5) & (s2<2)]

Oranges         0.9
Banana          1.1
Strawberries    0.8
Raspberries     1.5
Mango           0.8
Blueberries     0.7
Pomegranate     1.7
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Kiwi            1.1
Grapes          0.6
Peach           0.9
Name: Proteins, dtype: float64

In [28]:
s2[(s2>0.5) | (s2<2)]

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Strawberries    0.8
Raspberries     1.5
Pineapple       0.5
Mango           0.8
Blueberries     0.7
Pomegranate     1.7
Papaya          0.5
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Kiwi            1.1
Grapes          0.6
Peach           0.9
Pear            0.4
Name: Proteins, dtype: float64

In [30]:
s2[~(s2>1)]

Oranges         0.9
Apples          0.3
Strawberries    0.8
Pineapple       0.5
Mango           0.8
Blueberries     0.7
Papaya          0.5
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Grapes          0.6
Peach           0.9
Pear            0.4
Name: Proteins, dtype: float64

<h2>Modifying a Series</h2>

In [32]:
s2['Mango'] = 2.8
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Oranges         0.9
Banana          1.1
Apples          0.3
Strawberries    0.8
Raspberries     1.5
Pineapple       0.5
Mango           2.8
Blueberries     0.7
Pomegranate     1.7
Papaya          0.5
Watermelon      0.6
Cantaloupe      0.8
Cherries        1.0
Kiwi            1.1
Grapes          0.6
Peach           0.9
Pear            0.4
mango           2.8
Name: Proteins, dtype: float64

<h2>DataFrames</h2>

In [35]:
data = {
    "name": ["Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George", "Hannah"],
    "age": [25, 30, 28, np.nan, 40, 26, 32, 29],
    "salary": [50000, 60000, 55000, 75000, 80000, np.nan, 67000, 58000],
    "dept": ["HR", "Engineering", "Sales", "Marketing", "Engineering", "HR", "Finance", "Sales"]
}
data

{'name': ['Alice',
  'Bob',
  'Charlie',
  'Diana',
  'Ethan',
  'Fiona',
  'George',
  'Hannah'],
 'age': [25, 30, 28, nan, 40, 26, 32, 29],
 'salary': [50000, 60000, 55000, 75000, 80000, nan, 67000, 58000],
 'dept': ['HR',
  'Engineering',
  'Sales',
  'Marketing',
  'Engineering',
  'HR',
  'Finance',
  'Sales']}

In [36]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,salary,dept
0,Alice,25.0,50000.0,HR
1,Bob,30.0,60000.0,Engineering
2,Charlie,28.0,55000.0,Sales
3,Diana,,75000.0,Marketing
4,Ethan,40.0,80000.0,Engineering
5,Fiona,26.0,,HR
6,George,32.0,67000.0,Finance
7,Hannah,29.0,58000.0,Sales


In [37]:
df.head()

Unnamed: 0,name,age,salary,dept
0,Alice,25.0,50000.0,HR
1,Bob,30.0,60000.0,Engineering
2,Charlie,28.0,55000.0,Sales
3,Diana,,75000.0,Marketing
4,Ethan,40.0,80000.0,Engineering


In [38]:
df.tail(3)

Unnamed: 0,name,age,salary,dept
5,Fiona,26.0,,HR
6,George,32.0,67000.0,Finance
7,Hannah,29.0,58000.0,Sales


In [40]:
df.iloc[1:3]

Unnamed: 0,name,age,salary,dept
1,Bob,30.0,60000.0,Engineering
2,Charlie,28.0,55000.0,Sales


In [41]:
df.loc[1:3,['age','dept']]

Unnamed: 0,age,dept
1,30.0,Engineering
2,28.0,Sales
3,,Marketing


In [42]:
df['age']

0    25.0
1    30.0
2    28.0
3     NaN
4    40.0
5    26.0
6    32.0
7    29.0
Name: age, dtype: float64

In [None]:
df.drop('age',axis=1,inplace=True) # default false


In [46]:
df

Unnamed: 0,name,salary,dept
0,Alice,50000.0,HR
1,Bob,60000.0,Engineering
2,Charlie,55000.0,Sales
3,Diana,75000.0,Marketing
4,Ethan,80000.0,Engineering
5,Fiona,,HR
6,George,67000.0,Finance
7,Hannah,58000.0,Sales


In [47]:
df.shape

(8, 3)

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    8 non-null      object 
 1   salary  7 non-null      float64
 2   dept    8 non-null      object 
dtypes: float64(1), object(2)
memory usage: 324.0+ bytes


In [49]:
df.describe()

Unnamed: 0,salary
count,7.0
mean,63571.428571
std,10906.529281
min,50000.0
25%,56500.0
50%,60000.0
75%,71000.0
max,80000.0


<h2>Broadcasting</h2>

In [51]:
df['salary'] = df['salary'] + 5000
df

Unnamed: 0,name,salary,dept
0,Alice,60000.0,HR
1,Bob,70000.0,Engineering
2,Charlie,65000.0,Sales
3,Diana,85000.0,Marketing
4,Ethan,90000.0,Engineering
5,Fiona,,HR
6,George,77000.0,Finance
7,Hannah,68000.0,Sales


In [52]:
df.rename(columns={'dept':'Department'},inplace=True)

In [53]:
df['salary'].unique()

array([60000., 70000., 65000., 85000., 90000.,    nan, 77000., 68000.])

In [54]:
df['Department'].value_counts()

Department
HR             2
Engineering    2
Sales          2
Marketing      1
Finance        1
Name: count, dtype: int64

In [55]:
df['Promoted Salary'] = df['salary'] * 10

In [56]:
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Charlie,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


<h2>Data Cleaning</h2>

In [57]:
df.isnull().sum()

name               0
salary             1
Department         0
Promoted Salary    1
dtype: int64

In [58]:
df.dropna()

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Charlie,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


In [59]:
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Charlie,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


In [61]:
df.dropna(how='all') # and all

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Charlie,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


In [64]:
df['salary'].fillna(df['salary'].mean())

0    60000.000000
1    70000.000000
2    65000.000000
3    85000.000000
4    90000.000000
5    73571.428571
6    77000.000000
7    68000.000000
Name: salary, dtype: float64

In [65]:
df['salary'].fillna(method='ffill')

  df['salary'].fillna(method='ffill')


0    60000.0
1    70000.0
2    65000.0
3    85000.0
4    90000.0
5    90000.0
6    77000.0
7    68000.0
Name: salary, dtype: float64

In [66]:
df['name'] = df['name'].replace('Charlie','Aditya')

In [67]:
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Aditya,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


In [68]:
dup_df = df[df.duplicated(keep='first')] 

In [69]:
dup_df

Unnamed: 0,name,salary,Department,Promoted Salary


In [70]:
df = df.drop_duplicates()

In [71]:
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,60000.0,HR,600000.0
1,Bob,70000.0,Engineering,700000.0
2,Aditya,65000.0,Sales,650000.0
3,Diana,85000.0,Marketing,850000.0
4,Ethan,90000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,77000.0,Finance,770000.0
7,Hannah,68000.0,Sales,680000.0


In [None]:
name = 'alice_fernandes'
df[['first','last']] = df['name'].str.split('_')

In [73]:
# apply and lamda
def mul(x):
    return x*2
df['salary'] = df['salary'].apply(mul)
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,240000.0,HR,600000.0
1,Bob,280000.0,Engineering,700000.0
2,Aditya,260000.0,Sales,650000.0
3,Diana,340000.0,Marketing,850000.0
4,Ethan,360000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,308000.0,Finance,770000.0
7,Hannah,272000.0,Sales,680000.0


In [74]:
df['salary'] = df['salary'].apply(lambda x : x/2)

In [75]:
df

Unnamed: 0,name,salary,Department,Promoted Salary
0,Alice,120000.0,HR,600000.0
1,Bob,140000.0,Engineering,700000.0
2,Aditya,130000.0,Sales,650000.0
3,Diana,170000.0,Marketing,850000.0
4,Ethan,180000.0,Engineering,900000.0
5,Fiona,,HR,
6,George,154000.0,Finance,770000.0
7,Hannah,136000.0,Sales,680000.0


<h2>Joins and Merges</h2>

In [77]:
dept_info = {
    "dept": ["HR", "Engineering", "Sales", "Marketing", "Finance"],
    "location": ["New York", "San Francisco", "Chicago", "Los Angeles", "Boston"],
    "manager": ["Susan Miller", "James Lee", "Nina Patel", "Carlos Gomez", "Rachel Green"]
}
df2=pd.DataFrame(dept_info)
df2

Unnamed: 0,dept,location,manager
0,HR,New York,Susan Miller
1,Engineering,San Francisco,James Lee
2,Sales,Chicago,Nina Patel
3,Marketing,Los Angeles,Carlos Gomez
4,Finance,Boston,Rachel Green


In [79]:
pd.concat([df,df2],axis=1)


Unnamed: 0,name,salary,Department,Promoted Salary,dept,location,manager
0,Alice,120000.0,HR,600000.0,HR,New York,Susan Miller
1,Bob,140000.0,Engineering,700000.0,Engineering,San Francisco,James Lee
2,Aditya,130000.0,Sales,650000.0,Sales,Chicago,Nina Patel
3,Diana,170000.0,Marketing,850000.0,Marketing,Los Angeles,Carlos Gomez
4,Ethan,180000.0,Engineering,900000.0,Finance,Boston,Rachel Green
5,Fiona,,HR,,,,
6,George,154000.0,Finance,770000.0,,,
7,Hannah,136000.0,Sales,680000.0,,,


In [None]:
#pd.read_csv()