In [17]:
'''
INTRODUCTION TO PANDAS 🐼
Python library for data analysis and manipulation

Two main data structures:
1D - Series (list)
2D - DataFrame (matrix)

Why?
- Support for CSV, Excel, SQL, JSON etc.
- Powerful data aggregation and grouping
- Easy handling of missing data
- Convenient for filtering, joining, and reshaping
'''

import pandas as pd
import numpy as np

# Column-oriented source data: every key becomes a column label and every
# list holds that column's values.
friends = {
    'name': ['Fardin', 'Adnan', 'Shahnawaz'],
    'city': ['Pune', 'Mumbai', 'Solapur'],
    'marks': [98, 99, 100],
}

# Build the frame directly from the dict of equal-length lists.
df = pd.DataFrame(data=friends)
df  # bare expression: only rendered when it is the last line of a cell

# Write the frame twice to the same path: first with the index column
# (the default), then without it. The second call overwrites the first,
# so the file left on disk has no index column.
df.to_csv('friends.csv', index=True)
df.to_csv('friends.csv', index=False)

# First row, last row, and summary statistics of the numeric column.
df.head(n=1)
df.tail(n=1)
df.describe()

# Load the CSV from the working directory.
# NOTE(review): new.csv is not created anywhere in this notebook, so it must
# already exist on disk for a fresh "Restart & Run All" to succeed. The code
# below assumes it has a 'city' column and exactly 10 rows — TODO confirm.
new = pd.read_csv('new.csv')
new  # bare expression: only rendered when it is the last line of a cell

new.describe()

# Indexing order differs from a plain matrix:
#   matrix    -> [row][column]
#   DataFrame -> [column][row]
new['city']

# Read one cell with a single label-based lookup (row label 3, column 'city')
# rather than chaining new['city'][3]; this mirrors the update just below.
new.loc[3, 'city']

# Update that same cell; a single .loc assignment writes to the frame itself.
new.loc[3, 'city'] = 'Shrinagar'
new

# Persist the change back to the same file, without the index column.
new.to_csv('new.csv', index=False)
new

# Custom index: replace the row labels with letters. The list must contain
# exactly one label per row, otherwise pandas raises ValueError.
new.index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
new

# Series example: a one-dimensional labelled array holding the integers 1..10.
sr = pd.Series(range(1, 11))
sr

# Compare the two container types: Series (1-D) versus DataFrame (2-D).
type(sr)
type(df)

# One seeded generator feeds every random draw below, so a fresh
# "Restart & Run All" reproduces exactly the same numbers.
SEED = 42
rng = np.random.default_rng(SEED)

# Random Series: five integers drawn from 1..10 (the upper bound 11 is exclusive).
sr1 = pd.Series(rng.integers(1, 11, size=5))
sr1

# Random DataFrame: 5x5 integers in 1..100 with default row and column labels.
randomDf = pd.DataFrame(rng.integers(1, 101, size=(5, 5)))
randomDf

# Same shape, this time passing the row index explicitly.
randomDf = pd.DataFrame(rng.integers(1, 101, size=(5, 5)), index=np.arange(5))
randomDf
randomDf.describe()

# Larger random DataFrame: 300 rows x 5 columns.
randomDf = pd.DataFrame(rng.integers(1, 101, size=(300, 5)), index=np.arange(300))
randomDf
randomDf.head(5)       # first five rows
randomDf.tail(5)       # last five rows
randomDf.index         # row labels
randomDf.columns       # column labels
randomDf.to_numpy()    # underlying values as a NumPy array

# Series with string labels instead of the default integer positions;
# a dict supplies label -> value pairs in insertion order.
s1 = pd.Series({'a': 10, 'b': 20, 'c': 30})
s1
s1['a']  # label-based lookup

# Add a 'Number' entry to the source dictionary. Only the dict changes here;
# a DataFrame has to be (re)built from it to pick up the new column.
friends.update({'Number': ['1', '2', '3']})
friends

# Row-oriented construction: each inner list is one row, and `columns`
# supplies the column labels.
df1 = pd.DataFrame(
    [
        ['Tom', 28],
        ['Jerry', 31],
    ],
    columns=['Name', 'Age'],
)

# Assigning to a new label appends a column (one value per existing row).
df1['City'] = ['Pune', 'Mumbai']
df1

# drop() returns a new frame and leaves df1 untouched, so this line only
# previews the result.
df1.drop(columns='Age')

# Make the drop permanent by rebinding the name rather than using
# inplace=True; the step stays explicit and chainable.
df1 = df1.drop(columns='Age')
df1

# Add another column (string values).
df1['Number'] = ['1', '2']
df1

# Selecting a single column by label returns a Series.
df1['Name']

# Rows in descending order of their index labels (a new frame is returned;
# randomDf itself is not reordered).
randomDf.sort_index(ascending=False)

# Columns in descending order of their labels.
randomDf.sort_index(axis='columns', ascending=False)


# Rename every column at once by assigning a list with one label per column.
df1
df1.columns = ['a','b','c']
df1
# Restore the descriptive labels.
df1.columns = ['Name','City','Number']
df1


# Read one cell via chained indexing: pick the column, then row label 0.
df1['Name'][0]
# Not a recommended way of updating: a chained assignment may act on a
# temporary copy instead of df1 itself.
# df1['Name'][0] = 'Mr Tom'
# df1

# Recommended way to update: one .loc[row_label, column_label] assignment.
# df1.loc[0,'Name'] = 'Tom'
# df1

# Boolean-mask row filter.
# NOTE(review): both conditions are identical, so the `&` is redundant —
# the second one was possibly meant to test another column; confirm.
# randomDf.loc[(randomDf[0]<2) & (randomDf[0]<2)]

# Purely positional lookup: third row, third column.
# randomDf.iloc[2,2]

# Label-based lookup: row label 0, column 'City'.
# df1.loc[0,'City']

# NOTE(review): .loc is label-based and df1 has no column labelled 1, so
# this line would raise KeyError if uncommented.
# df1.loc[0,1]

# Positional equivalent: first row, second column.
# df1.iloc[0, 1]


# Iterate over a DataFrame: small two-column frame for the row-wise loop below.
df = pd.DataFrame(
    data={
        'Name': ['Raj', 'Alice', 'Bob'],
        'Age': [20, 45, 34],
    }
)

# iterrows() yields (index, row) pairs, where row is a Series keyed by
# column label. Left commented out; uncomment to print each row.
# for index, row in df.iterrows():
#     print(f"{index} {row['Name']} {row['Age']}")


# Reshaping: start from a long-format table with one row per (Date, City) reading.
data = pd.DataFrame(
    data={
        'Date': ['2025-02', '2025-02', '2025-02'],
        'City': ['Delhi', 'Mumbai', 'Pune'],
        'Temp': [40, 35, 43],
    }
)
data

# Pivot to wide format: one row per Date, one column per City,
# with Temp as the cell values.
data.pivot(
    index='Date',
    columns='City',
    values='Temp',
)



# Concat: two frames that share the same column labels.
df1 = pd.DataFrame(data={'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
df2 = pd.DataFrame(data={'A': ['A2', 'A3'], 'B': ['B2', 'B3']})

df1
df2

# Stack vertically: rows of df2 go below df1. The original row labels are
# kept, so 0 and 1 each appear twice in the result.
pd.concat([df1, df2], axis='index')

# Place side by side, aligned on row labels; column labels A and B repeat.
pd.concat([df1, df2], axis='columns')

# Merge: two tables that describe the same IDs.
left = pd.DataFrame(data={'ID': [1, 2], 'Name': ['Fardin', 'Sudarshan']})
right = pd.DataFrame(data={'ID': [1, 2], 'Score': [85, 92]})

# Side-by-side concat only lines rows up by their labels and repeats the
# ID column, so it is not a real join.
pd.concat([left, right], axis='columns')  #Not Good

# Key-based join on ID: rows are matched by value and ID appears once.
pd.merge(left, right, how='inner', on='ID')

Unnamed: 0,ID,Name,Score
0,1,Fardin,85
1,2,Sudarshan,92


In [21]:
# GroupBy: employee table whose Department column provides the grouping key.
df = pd.DataFrame(
    data={
        'Department': ['IT', 'HR', 'IT', 'HR'],
        'Employees': ['A', 'B', 'C', 'D'],
        'Salary': [60000, 50000, 65000, 52000],
    }
)

In [23]:
# Render the frame as this cell's output.
df

Unnamed: 0,Department,Employees,Salary
0,IT,A,60000
1,HR,B,50000
2,IT,C,65000
3,HR,D,52000


In [29]:
# Average salary within each department.
(
    df
    .groupby('Department')['Salary']
    .mean()
)

Department
HR    51000.0
IT    62500.0
Name: Salary, dtype: float64

In [31]:
# Highest salary within each department.
(
    df
    .groupby('Department')['Salary']
    .max()
)

Department
HR    52000
IT    65000
Name: Salary, dtype: int64

In [33]:
# Total salary paid within each department.
(
    df
    .groupby('Department')['Salary']
    .sum()
)

Department
HR    102000
IT    125000
Name: Salary, dtype: int64

In [37]:
# Handle missing data: the None entries are stored as NaN, which makes both
# columns floating-point.
df = pd.DataFrame(
    data={
        'A': [1, 2, None],
        'B': [None, 5, 6],
    }
)

In [39]:
# Render the frame as this cell's output.
df

Unnamed: 0,A,B
0,1.0,
1,2.0,5.0
2,,6.0


In [41]:
# Replace every NaN with 0. A new frame is returned; df keeps its NaNs.
df.fillna(value=0)

Unnamed: 0,A,B
0,1.0,0.0
1,2.0,5.0
2,0.0,6.0


In [43]:
# Same operation with a different fill value: every NaN becomes 2.
df.fillna(value=2)

Unnamed: 0,A,B
0,1.0,2.0
1,2.0,5.0
2,2.0,6.0


In [45]:
# Any scalar works as the fill value: every NaN becomes 500.
df.fillna(value=500)

Unnamed: 0,A,B
0,1.0,500.0
1,2.0,5.0
2,500.0,6.0


In [47]:
# Sorting a DataFrame: deliberately unordered rows to sort by Age below.
df = pd.DataFrame(
    data={
        'Name': ['Charlie', 'Alice', 'Bob'],
        'Age': [35, 25, 30],
    }
)

In [49]:
# Render the frame as this cell's output.
df

Unnamed: 0,Name,Age
0,Charlie,35
1,Alice,25
2,Bob,30


In [51]:
# Youngest first: rows ordered by Age, smallest to largest. The original
# row labels travel with their rows.
df.sort_values('Age', ascending=True)

Unnamed: 0,Name,Age
1,Alice,25
2,Bob,30
0,Charlie,35


In [53]:
# Oldest first: rows ordered by Age, largest to smallest.
df.sort_values('Age', ascending=False)

Unnamed: 0,Name,Age
0,Charlie,35
2,Bob,30
1,Alice,25
