In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Creating Data Structures
Pandas provides two primary data structures:

1. Series: 1D labeled array
2. DataFrame: 2D labeled table

In [3]:

data = [10, 20, 30]
series = pd.Series(data, index=['a', 'b', 'c'])
print("Series:\n", series)


# Create DataFrame
data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
df = pd.DataFrame(data)
print("DataFrame:\n", df)



Series:
 a    10
b    20
c    30
dtype: int64
DataFrame:
     Name  Age
0  Alice   25
1    Bob   30


In [4]:
# Access Column
print("Name Column:\n", df['Name'])

# Access Row
print("First Row:\n", df.iloc[0])

# Summary Statistics
print("Statistics:\n", df.describe())


Name Column:
 0    Alice
1      Bob
Name: Name, dtype: object
First Row:
 Name    Alice
Age        25
Name: 0, dtype: object
Statistics:
              Age
count   2.000000
mean   27.500000
std     3.535534
min    25.000000
25%    26.250000
50%    27.500000
75%    28.750000
max    30.000000


#Data Selection and Filtering

In [5]:
# DataFrame Selection
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}
df = pd.DataFrame(data)

# Filter Rows
filtered = df[df['Age'] > 25]
print("Filtered Rows:\n", filtered)

# Select Using loc
selected = df.loc[0:1, ['Name', 'Age']]  # Rows 0-1, specific columns
print("Selection with loc:\n", selected)


Filtered Rows:
       Name  Age
1      Bob   30
2  Charlie   35
Selection with loc:
     Name  Age
0  Alice   25
1    Bob   30


# Missing Data Handling

In [7]:
data = {'Name': ['Alice', 'Bob', None], 'Age': [25, None, 35]}
df = pd.DataFrame(data)

# Fill Missing Values
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Drop Missing Rows
df.dropna(inplace=True)

print("Handled Missing Data:\n", df)


Handled Missing Data:
     Name   Age
0  Alice  25.0
1    Bob  30.0


#Grouping and Aggregation

In [8]:
data = {'Name': ['Alice', 'Bob', 'Alice'], 'Sales': [100, 200, 150]}
df = pd.DataFrame(data)

# Group and Aggregate
grouped = df.groupby('Name')['Sales'].sum()
print("Grouped Data:\n", grouped)


Grouped Data:
 Name
Alice    250
Bob      200
Name: Sales, dtype: int64


#DataFrame Merging, Joining, and Concatenation

In [10]:
df1 = pd.DataFrame({'ID': [1, 2], 'Name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'ID': [1, 2], 'Age': [25, 30]})

merged = pd.merge(df1, df2, on='ID')
print("Merged DataFrame:\n", merged)


# Concatenate:
df1 = pd.DataFrame({'Name': ['Alice'], 'Age': [25]})
df2 = pd.DataFrame({'Name': ['Bob'], 'Age': [30]})

concat = pd.concat([df1, df2])
print("Concatenated DataFrame:\n", concat)


Merged DataFrame:
    ID   Name  Age
0   1  Alice   25
1   2    Bob   30
Concatenated DataFrame:
     Name  Age
0  Alice   25
0    Bob   30


#MultiIndex (Hierarchical Indexing)

In [11]:
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=('Group', 'Number'))
data = [10, 20, 30, 40]
df = pd.DataFrame(data, index=index, columns=['Value'])

print("MultiIndex DataFrame:\n", df)


MultiIndex DataFrame:
               Value
Group Number       
A     1          10
      2          20
B     1          30
      2          40


#Pivot Tables

In [12]:
data = {'Name': ['Alice', 'Bob', 'Alice'], 'Month': ['Jan', 'Jan', 'Feb'], 'Sales': [100, 200, 150]}
df = pd.DataFrame(data)

pivot = df.pivot_table(values='Sales', index='Month', columns='Name', aggfunc='sum', fill_value=0)
print("Pivot Table:\n", pivot)


Pivot Table:
 Name   Alice  Bob
Month            
Feb      150    0
Jan      100  200


#Window Functions

In [13]:
data = {'Value': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Rolling Mean
df['Rolling_Mean'] = df['Value'].rolling(window=3).mean()
print("Rolling Mean:\n", df)


Rolling Mean:
    Value  Rolling_Mean
0     10           NaN
1     20           NaN
2     30          20.0
3     40          30.0
4     50          40.0


#Time-Series Data

In [14]:
# Generate Date Range
date_rng = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')

# Create Time Series DataFrame
df = pd.DataFrame({'Date': date_rng, 'Value': range(len(date_rng))})

# Set Date as Index
df.set_index('Date', inplace=True)
print("Time Series DataFrame:\n", df)


Time Series DataFrame:
             Value
Date             
2023-01-01      0
2023-01-02      1
2023-01-03      2
2023-01-04      3
2023-01-05      4
2023-01-06      5
2023-01-07      6
2023-01-08      7
2023-01-09      8
2023-01-10      9


#Custom Aggregations

In [15]:
data = {'Name': ['Alice', 'Bob'], 'Sales': [100, 200]}
df = pd.DataFrame(data)

# Apply Custom Function
df['Bonus'] = df['Sales'].apply(lambda x: x * 0.1)
print("Custom Aggregation:\n", df)


Custom Aggregation:
     Name  Sales  Bonus
0  Alice    100   10.0
1    Bob    200   20.0


#Exploding Columns

In [16]:
data = {'Name': ['Alice', 'Bob'], 'Hobbies': [['Reading', 'Swimming'], ['Cooking']]}
df = pd.DataFrame(data)

# Explode the List Column
exploded = df.explode('Hobbies')
print("Exploded DataFrame:\n", exploded)


Exploded DataFrame:
     Name   Hobbies
0  Alice   Reading
0  Alice  Swimming
1    Bob   Cooking


#Categorical Data

In [17]:
data = {'City': ['New York', 'London', 'Paris', 'New York']}
df = pd.DataFrame(data)

# Convert to Categorical
df['City'] = df['City'].astype('category')
print("Categorical Column Info:\n", df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   City    4 non-null      category
dtypes: category(1)
memory usage: 264.0 bytes
Categorical Column Info:
 None
