In [None]:
# Pandas for Data Science

In [None]:
> Pandas is the most popular Python library for data manipulation and analysis.
> It provides powerful tools for working with structured data, making it essential for
> data science, machine learning, and analytics.

In [None]:
## 1. Why Use Pandas?

- Handles tabular data (like Excel, but more powerful)
- Handles missing data gracefully
- Provides powerful grouping and aggregation functions
- Makes merging and joining datasets easy
- Includes time series functionality
- Integrates well with NumPy, Matplotlib, and scikit-learn

In [None]:
## 2. Core Pandas Data Structures

In [None]:
### Series (1D Data)

In [1]:
import pandas as pd

# Build a one-dimensional, integer-indexed Series out of a plain Python list
s = pd.Series(data=[1, 3, 5, 7, 9])
print(s)

0    1
1    3
2    5
3    7
4    9
dtype: int64


In [None]:
### DataFrame (2D Data)

In [2]:
# Column-oriented input: every key becomes a column, every list holds that
# column's values (one entry per row).
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['NY', 'Paris', 'London'],
)
df = pd.DataFrame(data)
print(df)

      Name  Age    City
0    Alice   25      NY
1      Bob   30   Paris
2  Charlie   35  London


In [None]:
## 3. Essential DataFrame Operations

In [None]:
### Viewing Data

In [3]:
# A notebook cell only renders its final expression, so every preview before
# the last one is printed explicitly; otherwise its result is silently discarded.
print(df.head())                      # First 5 rows (the whole frame when it has fewer)
print(df.tail(3))                     # Last 3 rows
print(df.sample(2, random_state=42))  # 2 random rows; fixed seed keeps re-runs reproducible
print(df.shape)                       # (rows, columns)
df.info()                             # Data types & memory; info() prints its own report
df.describe()                         # Statistical summary; left bare so it renders as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


Unnamed: 0,Age
count,3.0
mean,30.0
std,5.0
min,25.0
25%,27.5
50%,30.0
75%,32.5
max,35.0


In [None]:
### Selection & Indexing

In [5]:
# Only the final expression of a cell is rendered, so the earlier selections
# are printed explicitly instead of being computed and thrown away.

# Select columns
print(df['Name'])            # Single column -> Series
print(df[['Name', 'Age']])   # Multiple columns -> DataFrame

# Select rows
print(df.iloc[0])            # First row by position
print(df.loc[0])             # Row whose index label is 0
print(df.iloc[1:3])          # Positions 1 and 2 (the slice end is exclusive)
df[df['Age'] > 28]           # Filter rows; left bare so it renders as the cell output

Unnamed: 0,Name,Age,City
1,Bob,30.0,Paris
2,Charlie,35.0,London


In [None]:
### Data Cleaning

In [6]:
# dropna / fillna / drop_duplicates / rename do NOT modify `df`: each returns a
# new DataFrame. The results are printed so the effect is visible; assign the
# result to a name if you want to keep it.

# Handle missing data
print(df.dropna())      # Remove rows with missing values
print(df.fillna(0))     # Replace missing with 0

# Remove duplicates
print(df.drop_duplicates())

# Rename columns (the returned copy has 'Years'; `df` still has 'Age')
print(df.rename(columns={'Age': 'Years'}))

# Change data types -- assigned back, so this line does change `df`
df['Age'] = df['Age'].astype(float)

In [None]:
## 4. Data Analysis with Pandas

In [None]:
### Basic Statistics

In [7]:
# Scalars computed mid-cell are not rendered by the notebook, so each one is
# printed with a label; only the final expression is shown automatically.
print('mean:', df['Age'].mean())    # Average age
print('max: ', df['Age'].max())     # Maximum age
print('min: ', df['Age'].min())     # Minimum age
print('std: ', df['Age'].std())     # Standard deviation
df['Age'].value_counts()            # Frequency count; rendered as the cell output

Age
25.0    1
30.0    1
35.0    1
Name: count, dtype: int64

In [None]:
### Grouping & Aggregation

In [8]:
# Group by city and calculate mean age.
# Printed explicitly: as a non-final expression it would otherwise be discarded.
print(df.groupby('City')['Age'].mean())

# Multiple aggregations: a dict maps each column to the function(s) applied to it.
# Left as the final bare expression so it renders as the cell output.
df.groupby('City').agg({
    'Age': ['mean', 'min', 'max'],
    'Name': 'count'
})

Unnamed: 0_level_0,Age,Age,Age,Name
Unnamed: 0_level_1,mean,min,max,count
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
London,35.0,35.0,35.0,1
NY,25.0,25.0,25.0,1
Paris,30.0,30.0,30.0,1


In [None]:
### Sorting & Ranking

In [9]:
# sort_values returns a new, sorted DataFrame and leaves `df` untouched; the
# two sorted views are printed because only the last expression is rendered.
print(df.sort_values('Age'))                   # Ascending
print(df.sort_values('Age', ascending=False))  # Descending
df['Age'].rank()                               # Rank values; rendered as the cell output

0    1.0
1    2.0
2    3.0
Name: Age, dtype: float64

In [None]:
## 5. Combining DataFrames

In [None]:
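### Concatenation

Use `pd.concat([df_a, df_b], ignore_index=True)` to stack DataFrames on top of each other (row-wise); pass `axis=1` to place them side by side (column-wise).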

In [None]:
### Merging (SQL-like joins)

In [None]:
# Two small example frames that share a 'key' column. 'a' exists only on the
# left and 'd' only on the right, so each join type gives a different result.
df1 = pd.DataFrame({'key': ['a', 'b', 'c'], 'left_val': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['b', 'c', 'd'], 'right_val': [20, 30, 40]})

inner_joined = pd.merge(df1, df2, on='key')               # Inner join: keys present in both
left_joined = pd.merge(df1, df2, on='key', how='left')    # Left join: every key from df1
outer_joined = pd.merge(df1, df2, on='key', how='outer')  # Full outer join: keys from either side

# Only the last expression of a cell is rendered, so print the first two.
print(inner_joined)
print(left_joined)
outer_joined

In [None]:
## 6. Working with Real Data

In [None]:
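### Reading/Writing Files

Use `pd.read_csv('file.csv')` to load a CSV file into a DataFrame and `df.to_csv('file.csv', index=False)` to write one back out; `read_excel`/`to_excel`, `read_json`/`to_json`, and `read_parquet`/`to_parquet` follow the same pattern.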

In [None]:
### Handling Dates

In [None]:
# Self-contained example frame: the `df` built earlier in this notebook has no
# 'Date' column, so indexing it with 'Date' would raise a KeyError.
orders = pd.DataFrame({
    'Order': [101, 102, 103],
    'Date': ['2024-01-15', '2024-02-20', '2025-03-05'],
})

orders['Date'] = pd.to_datetime(orders['Date'])   # Parse strings into datetime64 values
orders['Year'] = orders['Date'].dt.year           # Calendar year as an integer
orders['Month'] = orders['Date'].dt.month_name()  # Full month name, e.g. 'January'
orders

In [None]:
## 7. Practical Example: Sales Data Analysis