# PANDAS

In [1]:
import pandas as pd
import numpy as np  # Often used together with pandas

In [2]:
# Create a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print("Series:")
print(s)

# Create DataFrame from dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'London', 'Paris']
}
df = pd.DataFrame(data)
print("\nDataFrame:")
print(df)

Series:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

DataFrame:
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris


In [3]:
# Create sample data
df.to_csv('sample_data.csv', index=False)

# Read from CSV
csv_df = pd.read_csv('train.csv')
print("\nCSV DataFrame:")
print(csv_df)

# Read from Excel (requires openpyxl)
# df.to_excel('sample_data.xlsx', index=False)
# excel_df = pd.read_excel('sample_data.xlsx')


CSV DataFrame:
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ..

In [4]:
# View first rows
print("\nFirst 2 rows:")
print(df.head(2))

# Basic information
print("\nDataFrame info:")
print(df.info())

# Descriptive statistics
print("\nDescriptive stats:")
print(df.describe())

# Columns and shape
print("\nColumns:", df.columns)
print("Shape:", df.shape)


First 2 rows:
    Name  Age      City
0  Alice   25  New York
1    Bob   30    London

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None

Descriptive stats:
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0

Columns: Index(['Name', 'Age', 'City'], dtype='object')
Shape: (3, 3)


In [5]:
# Column selection
print("\nAge column:")
print(df['Age'])

# Multiple columns
print("\nName and City:")
print(df[['Name', 'City']])

# Row selection by index
print("\nRow 1:")
print(df.iloc[1])

# Boolean indexing
print("\nPeople over 28:")
print(df[df['Age'] > 28])


Age column:
0    25
1    30
2    35
Name: Age, dtype: int64

Name and City:
      Name      City
0    Alice  New York
1      Bob    London
2  Charlie     Paris

Row 1:
Name       Bob
Age         30
City    London
Name: 1, dtype: object

People over 28:
      Name  Age    City
1      Bob   30  London
2  Charlie   35   Paris


In [6]:
# Handle missing data
df_nan = df.copy()
df_nan.loc[1, 'Age'] = np.nan
print("\nData with missing values:")
print(df_nan)

print("\nFill missing values:")
print(df_nan.fillna({'Age': df_nan['Age'].mean()}))

# Drop duplicates
df_dup = pd.concat([df, df]).reset_index(drop=True)
print("\nDropped duplicates:")
print(df_dup.drop_duplicates())


Data with missing values:
      Name   Age      City
0    Alice  25.0  New York
1      Bob   NaN    London
2  Charlie  35.0     Paris

Fill missing values:
      Name   Age      City
0    Alice  25.0  New York
1      Bob  30.0    London
2  Charlie  35.0     Paris

Dropped duplicates:
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris


In [7]:
# Add new column
df['Senior'] = df['Age'] > 30
print("\nDataFrame with Senior column:")
print(df)

# Rename columns
df_renamed = df.rename(columns={'City': 'Location'})
print("\nRenamed columns:")
print(df_renamed)

# Sort values
print("\nSorted by Age:")
print(df.sort_values('Age', ascending=False))


DataFrame with Senior column:
      Name  Age      City  Senior
0    Alice   25  New York   False
1      Bob   30    London   False
2  Charlie   35     Paris    True

Renamed columns:
      Name  Age  Location  Senior
0    Alice   25  New York   False
1      Bob   30    London   False
2  Charlie   35     Paris    True

Sorted by Age:
      Name  Age      City  Senior
2  Charlie   35     Paris    True
1      Bob   30    London   False
0    Alice   25  New York   False


In [8]:
# Group by city
print("\nAverage age by city:")
print(df.groupby('City')['Age'].mean())

# Pivot tables
print("\nPivot table:")
print(pd.pivot_table(df, values='Age', index='City', aggfunc=np.mean))


Average age by city:
City
London      30.0
New York    25.0
Paris       35.0
Name: Age, dtype: float64

Pivot table:
           Age
City          
London    30.0
New York  25.0
Paris     35.0


  print(pd.pivot_table(df, values='Age', index='City', aggfunc=np.mean))


In [9]:
# Apply function to column
df['Age_squared'] = df['Age'].apply(lambda x: x**2)
print("\nApply function:")
print(df)

# Vectorized operations
df['Age_plus_5'] = df['Age'] + 5
print("\nVectorized operation:")
print(df)


Apply function:
      Name  Age      City  Senior  Age_squared
0    Alice   25  New York   False          625
1      Bob   30    London   False          900
2  Charlie   35     Paris    True         1225

Vectorized operation:
      Name  Age      City  Senior  Age_squared  Age_plus_5
0    Alice   25  New York   False          625          30
1      Bob   30    London   False          900          35
2  Charlie   35     Paris    True         1225          40


In [10]:
# Multi-index
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
multi_index = pd.MultiIndex.from_arrays(arrays, names=('Letters', 'Numbers'))
multi_df = pd.DataFrame({'Values': [10, 20, 30, 40]}, index=multi_index)
print("\nMulti-index DataFrame:")
print(multi_df)

# Stack/Unstack
print("\nUnstacked DataFrame:")
print(multi_df.unstack())


Multi-index DataFrame:
                 Values
Letters Numbers        
A       1            10
        2            20
B       1            30
        2            40

Unstacked DataFrame:
        Values    
Numbers      1   2
Letters           
A           10  20
B           30  40


In [12]:
# Chunk processing
chunk_size = 1000
chunk_reader = pd.read_csv('train.csv', chunksize=chunk_size)

for chunk in chunk_reader:
    # Process each chunk here
    print(f"Processing chunk of shape {chunk.shape}")
    break  # Remove break to process all chunks

Processing chunk of shape (891, 12)


In [13]:
# Convert to categorical
df['City'] = df['City'].astype('category')
print("\nCategorical data:")
print(df.dtypes)


Categorical data:
Name             object
Age               int64
City           category
Senior             bool
Age_squared       int64
Age_plus_5        int64
dtype: object


In [14]:
# Final Tips
print("""
Practice these operations:
1. Filtering and querying data
2. Handling missing values
3. Merging/joining datasets
4. GroupBy operations
5. Time series manipulation
6. Data visualization
""")


Practice these operations:
1. Filtering and querying data
2. Handling missing values
3. Merging/joining datasets
4. GroupBy operations
5. Time series manipulation
6. Data visualization

