In [68]:
import pandas as pd
import numpy as np

#### Pandas Series

In [69]:
# Three ways to build a Series. Each example is printed and then followed by
# a separator line; `s` ends up bound to the last (dictionary-based) Series.
series_examples = (
    # From a plain list: pandas supplies the default integer index starting at 0
    pd.Series([10, 20, 30, 40]),
    # From a list plus explicit index labels
    pd.Series([10, 20, 30, 40], index=['A', 'B', 'C', 'D']),
    # From a dictionary: the keys are used as the index labels
    pd.Series({'x': 100, 'y': 200, 'z': 300}),
)
for s in series_examples:
    print(s)
    print('\n=============')

0    10
1    20
2    30
3    40
dtype: int64

A    10
B    20
C    30
D    40
dtype: int64

x    100
y    200
z    300
dtype: int64



#### Pandas DataFrames

In [70]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one per row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Paris', 'London'],
)
df = pd.DataFrame(data)
print(df)
print('\n=============')

# Row-oriented input: each dictionary describes one row.
data2 = [
    {'Name': person, 'Age': years}
    for person, years in (('Alice', 25), ('Bob', 30))
]
df2 = pd.DataFrame(data2)
print(df2)
print('\n=============')

      Name  Age      City
0    Alice   25  New York
1      Bob   30     Paris
2  Charlie   35    London

    Name  Age
0  Alice   25
1    Bob   30



#### Common Pandas Functions

In [71]:
# Main sample table. Age and City each contain one None so that the
# missing-value cells further down have something to detect and fill.
data = {}
data['Name'] = ['Alice', 'Bob', 'Charlie', 'David', 'Eva']
data['Age'] = [25, 30, None, 40, 35]
data['City'] = ['NY', 'LA', 'NY', 'Chicago', None]
data['Score'] = [85, 90, 78, 92, 88]
df = pd.DataFrame(data)

# Second table that shares the 'Name' column with df (used by the merge cell).
salary_data = {
    'Name': ['Alice', 'Bob', 'Frank'],
    'Salary': [70000, 80000, 65000],
}
df2 = pd.DataFrame(salary_data)

head_message = "head() : Show few top entries"
print(head_message)
df.head()

head() : Show few top entries


Unnamed: 0,Name,Age,City,Score
0,Alice,25.0,NY,85
1,Bob,30.0,LA,90
2,Charlie,,NY,78
3,David,40.0,Chicago,92
4,Eva,35.0,,88


In [72]:
# Display the last three rows of the frame.
tail_message = 'tail() : Show few last entries'
print(tail_message)
df.tail(n=3)

tail() : Show few last entries


Unnamed: 0,Name,Age,City,Score
2,Charlie,,NY,78
3,David,40.0,Chicago,92
4,Eva,35.0,,88


In [73]:
# isnull() gives a frame of booleans with the same rows and columns as df:
# True marks a missing value.
missing_mask = df.isnull()
print('isnull() : Checking Missing values')
missing_mask

isnull() : Checking Missing values


Unnamed: 0,Name,Age,City,Score
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,False,False,False,False
4,False,False,True,False


In [74]:
print('fillna() : Replace missing values --> numerical values with the mean of rest and categorical values with the first mode of the rest')
# Fill value for Age: mean of the known ages, passed through round().
age_fill = round(df['Age'].mean())
# Fill value for City: the first entry of the Series returned by mode().
city_fill = df['City'].mode().iloc[0]
# The dict maps each column to its own replacement value.
df = df.fillna({'Age': age_fill, 'City': city_fill})
df

fillna() : Replace missing values --> numerical values with the mean of rest and categorical values with the first mode of the rest


Unnamed: 0,Name,Age,City,Score
0,Alice,25.0,NY,85
1,Bob,30.0,LA,90
2,Charlie,32.0,NY,78
3,David,40.0,Chicago,92
4,Eva,35.0,NY,88


In [75]:
# Mapping of old column name -> new column name.
column_renames = {'Score' : 'ExamScore'}
print('rename() : renaming a column')
df = df.rename(columns=column_renames)
df

rename() : renaming a column


Unnamed: 0,Name,Age,City,ExamScore
0,Alice,25.0,NY,85
1,Bob,30.0,LA,90
2,Charlie,32.0,NY,78
3,David,40.0,Chicago,92
4,Eva,35.0,NY,88


In [76]:
print('groupby() : groups data by a particular column')
# Group the rows by City and select the ExamScore column of each group...
scores_by_city = df.groupby('City')['ExamScore']
# ...then average per city and round to two decimal places.
grouped = scores_by_city.mean().round(2)
grouped

groupby() : groups data by a particular column


City
Chicago    92.00
LA         90.00
NY         83.67
Name: ExamScore, dtype: float64

In [77]:
print('merge() : merges to dataframes')
# Join df and df2 on their shared 'Name' column.
#   how='inner' -> keep only names present in both frames (intersection)
#   how='outer' -> keep names present in either frame (union)
merged_df = df.merge(df2, on='Name', how='inner')
merged_df

merge() : merges to dataframes


Unnamed: 0,Name,Age,City,ExamScore,Salary
0,Alice,25.0,NY,85,70000
1,Bob,30.0,LA,90,80000


In [78]:
print('columns : to print the header')
# The column labels of the frame, as an Index object.
header = df.columns
print(header)

columns : to print the header
Index(['Name', 'Age', 'City', 'ExamScore'], dtype='object')


In [79]:
# Select one column by name.
name_column = df["Name"]
print(name_column)

# Select a row by integer position with iloc: position 1 is the second row.
second_row = df.iloc[1]
print(second_row)

# Select a row by index label with loc: label 0 is the first row here.
first_row = df.loc[0]
print(first_row)

# Select a single cell by (row position, column position):
# first row, second column.
first_row_second_col = df.iloc[0, 1]
print(first_row_second_col)

0      Alice
1        Bob
2    Charlie
3      David
4        Eva
Name: Name, dtype: object
Name          Bob
Age          30.0
City           LA
ExamScore      90
Name: 1, dtype: object
Name         Alice
Age           25.0
City            NY
ExamScore       85
Name: 0, dtype: object
25.0


In [80]:
# Filter rows where Age > 33: the boolean mask passed to .loc keeps only the matching rows
filtered_df = df.loc[df["Age"] > 33]
print(filtered_df)

    Name   Age     City  ExamScore
3  David  40.0  Chicago         92
4    Eva  35.0       NY         88


In [81]:
# Add a new column: the list supplies one value per row, in row order.
salaries = [50000, 60000, 70000, 100000, 80000]
df["Salary"] = salaries
print(df)

      Name   Age     City  ExamScore  Salary
0    Alice  25.0       NY         85   50000
1      Bob  30.0       LA         90   60000
2  Charlie  32.0       NY         78   70000
3    David  40.0  Chicago         92  100000
4      Eva  35.0       NY         88   80000


In [82]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
print(df.describe())
# DataFrame information: index range, per-column non-null counts and dtypes.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

            Age  ExamScore         Salary
count   5.00000   5.000000       5.000000
mean   32.40000  86.600000   72000.000000
std     5.59464   5.458938   19235.384062
min    25.00000  78.000000   50000.000000
25%    30.00000  85.000000   60000.000000
50%    32.00000  88.000000   70000.000000
75%    35.00000  90.000000   80000.000000
max    40.00000  92.000000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Name       5 non-null      object 
 1   Age        5 non-null      float64
 2   City       5 non-null      object 
 3   ExamScore  5 non-null      int64  
 4   Salary     5 non-null      int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 332.0+ bytes
None


In [83]:
# Order the rows by Age, largest first; the result is a new frame and the
# original row labels are kept.
df_sorted = df.sort_values(
    "Age",
    ascending=False,
)
print(df_sorted)

      Name   Age     City  ExamScore  Salary
3    David  40.0  Chicago         92  100000
4      Eva  35.0       NY         88   80000
2  Charlie  32.0       NY         78   70000
1      Bob  30.0       LA         90   60000
0    Alice  25.0       NY         85   50000


In [84]:
# Number of rows for each distinct value in the City column.
city_column = df['City']
City_counts = city_column.value_counts()
print(City_counts)

City
NY         3
LA         1
Chicago    1
Name: count, dtype: int64


In [85]:
# Conditional update: build a boolean mask for rows with Age above 33, then
# overwrite Salary on exactly those rows through .loc[mask, column].
older_than_33 = df["Age"] > 33
df.loc[older_than_33, "Salary"] = 120000
print(df)

      Name   Age     City  ExamScore  Salary
0    Alice  25.0       NY         85   50000
1      Bob  30.0       LA         90   60000
2  Charlie  32.0       NY         78   70000
3    David  40.0  Chicago         92  120000
4      Eva  35.0       NY         88  120000


In [86]:
print('Concatenation')
# Two small frames with the same columns, stacked one below the other.
df1 = pd.DataFrame(dict(A=[1, 2], B=[3, 4]))
df2 = pd.DataFrame(dict(A=[5, 6], B=[7, 8]))
frames = [df1, df2]
# Each frame keeps its own row labels, so labels 0 and 1 both appear twice.
result = pd.concat(frames)
print(result)

Concatenation
   A  B
0  1  3
1  2  4
0  5  7
1  6  8


In [87]:
# Append two more people, one with a very high and one with a very low salary,
# so the outlier check below has candidates to examine.
new_rows = [
    dict(Name='Aman', Age=20, City='California', ExamScore=100, Salary=500000),
    dict(Name='Ayush', Age=23, City='Hawai', ExamScore=50, Salary=1000),
]
extra_rows_df = pd.DataFrame(new_rows)
# ignore_index=True renumbers the combined rows 0..n-1.
df = pd.concat([df, extra_rows_df], ignore_index=True)
print(df)
print('Outlier Detection in Salary')

# 1.5 * IQR rule: a salary is an outlier when it lies below Q1 - 1.5*IQR
# or above Q3 + 1.5*IQR.
salary = df['Salary']
Q1 = salary.quantile(q=0.25)
Q3 = salary.quantile(q=0.75)
IQR = Q3 - Q1
high = Q3 + 1.5*IQR
low = Q1 - 1.5*IQR
below_low = salary < low
above_high = salary > high
outliers = df[below_low | above_high]
outliers

      Name   Age        City  ExamScore  Salary
0    Alice  25.0          NY         85   50000
1      Bob  30.0          LA         90   60000
2  Charlie  32.0          NY         78   70000
3    David  40.0     Chicago         92  120000
4      Eva  35.0          NY         88  120000
5     Aman  20.0  California        100  500000
6    Ayush  23.0       Hawai         50    1000
Outlier Detection in Salary


Unnamed: 0,Name,Age,City,ExamScore,Salary
5,Aman,20.0,California,100,500000
