# DataFrame

In [2]:
# A Pandas DataFrame is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns.

# A DataFrame in Pandas is a two-dimensional, tabular data structure (like an Excel spreadsheet or SQL table). 
# It consists of rows and columns and can hold multiple data types (integers, floats, strings, etc.).

# To create the dataframe we have to use DataFrame method of pandas.

In [3]:
# what is Dataframe method of pandas ?

# DataFrame() Method in Pandas
# The DataFrame() method in Pandas is used to create a DataFrame (a 2D table-like data structure) from various data sources like lists, dictionaries, NumPy arrays, and more.

# Return Type
# The return type of pd.DataFrame() is a Pandas DataFrame object.

In [4]:
#  What Does DataFrame() Do?

# It creates a DataFrame from input data.
# It organizes data into rows and columns.
# It assigns default or custom indexes.
# It labels columns automatically if provided.

# Syntax

In [5]:

# import pandas as pd

# pd.DataFrame(data, index, columns, dtype, copy)

# 🔸 Parameters:
# Parameter	Description
# data	Data to be converted (list, dict, array, etc.).
# index	(Optional) Custom row labels.
# columns	(Optional) Custom column names.
# dtype	(Optional) Data type of values.
# copy	(Optional) Copy data or not. Default is False.


In [9]:
# Creating the dataframe object from dictionary.

import pandas as pd
d1 = {
    'Name':['GOGO','KOKO','LOLO'],
    'age':[10,20,30]
}
df = pd.DataFrame(d1)
print(df)
print(type(df))

# In this key becomes the column names and values becomes the row.

   Name  age
0  GOGO   10
1  KOKO   20
2  LOLO   30
<class 'pandas.core.frame.DataFrame'>


# I. index Parameter

In [11]:
# index Parameter (Custom Row Indexing)
# By default, Pandas assigns numeric row indices (0,1,2,...). You can specify custom row labels using the index parameter.

d1 = {
    'Name':['GOGO','KOKO','LOLO'],
    'age':[10,20,30]
     }
df = pd.DataFrame(d1,index=['row1','row2','row3'])
print(df)

# Here by providing the index parameter we have customize the row name.

      Name  age
row1  GOGO   10
row2  KOKO   20
row3  LOLO   30


# II. Column Parameter

In [20]:
# columns Parameter (Custom Column Names)
# By default, column names are inferred from dictionary keys or input structure. You can change them using columns.
d1 = {
    'Name':['GOGO','KOKO','LOLO'],
    'age':[10,20,30]
}
df = pd.DataFrame(d1,index=['row1','row2','row3'],columns=('A','B'))
print(df)

# The issue here is that the columns parameter does not match the keys in the dictionary (d1). 
# As a result, Pandas creates a DataFrame with column names "A" and "B", which do not exist in the dictionary, so it fills all values with NaN.

# Why Does This Happen?
# Your dictionary d1 has keys "Name" and "age".
# You specified columns=['A', 'B'], which are not present in the dictionary.
# Since Pandas doesn't find "A" and "B" in d1, it creates empty columns filled with NaN.


        A    B
row1  NaN  NaN
row2  NaN  NaN
row3  NaN  NaN


In [22]:
# Alternate way to change the column name of the dictionary with new names provided by the column parameter.
d1 = {
    'Name':['GOGO','KOKO','LOLO'],
    'age':[10,20,30]
}
df = pd.DataFrame(d1)
print(df)

df.columns = ['X','Y']
print(df)

   Name  age
0  GOGO   10
1  KOKO   20
2  LOLO   30
      X   Y
0  GOGO  10
1  KOKO  20
2  LOLO  30


In [23]:
# Why Does columns in pd.DataFrame() Lead to NaN?
# When you specify the columns parameter inside pd.DataFrame(), Pandas tries to match the given column names with the dictionary keys. 
# If they don’t match, Pandas creates a DataFrame with the specified columns but fills it with NaN because it cannot find corresponding values.

f = pd.DataFrame(d1, index=['row1', 'row2', 'row3'], columns=['A', 'B'])
print(df)

# Why NaN?

# d1 = {'Name': ['GOGO', 'KOKO', 'LOLO'], 'age': [10, 20, 30]}
# Pandas expects keys like "A" and "B", but only finds "Name" and "age".
# Since "A" and "B" are missing in d1, Pandas fills them with NaN.



      X   Y
0  GOGO  10
1  KOKO  20
2  LOLO  30


In [25]:
# Why Does Renaming df.columns After Creation Work?
# When you first create the DataFrame without the columns parameter, 
# Pandas correctly assigns column names based on the dictionary keys. 
# Then, when you manually rename the columns after creation, you're just changing the labels, not affecting the actual data.

# Example (Works Correctly)

df = pd.DataFrame(d1, index=['row1', 'row2', 'row3'])
df.columns = ['A', 'B']  # Renaming columns after creation
print(df)

# Why Does This Work?

# First, Pandas correctly maps "Name" and "age" to columns.
# Then, df.columns = ['A', 'B'] simply renames them without affecting the data.
# Column renaming does not change the structure of the DataFrame, but using columns= inside pd.DataFrame() expects exact column matches.

         A   B
row1  GOGO  10
row2  KOKO  20
row3  LOLO  30


In [None]:
# What Does "Changing the Labels, Not Affecting the Actual Data" Mean?
# When you rename columns after creating a DataFrame, only the column names (labels) change, but the actual data inside the DataFrame remains the same.

In [26]:
# Why Is This Different from Using columns= in pd.DataFrame()?
# If you specify column names in pd.DataFrame() that don’t match the dictionary keys, Pandas creates a DataFrame with those names but cannot map data to them, resulting in NaN values.

df.columns = ['A', 'B']  # Changing column names
print(df)

# What happened here?
# The column labels ("Name" → "A", "Age" → "B") were changed.
# The data remains the same: "GOGO", "KOKO", "LOLO" are still in the first column, and 10, 20, 30 are still in the second column.

         A   B
row1  GOGO  10
row2  KOKO  20
row3  LOLO  30


# IV. dtype

In [28]:
# dtype Parameter (Set Data Type for All Columns).
# By default, Pandas infers the data type of each column. You can enforce a specific type using dtype.

d1 = {
    'a':[10,20,30],
    'b':[10,20,30]
}
df = pd.DataFrame(d1,index=['i','ii','iii'],dtype='float')
print(df)
# All values are converted to float.

        a     b
i    10.0  10.0
ii   20.0  20.0
iii  30.0  30.0


# V. copy Parameter (Control Copying of Data)


In [55]:
# Default (False) → Uses reference if possible (saves memory).
# True → Forces a copy of the input data.

# Copy true means if somes changes is done in the dataframe object the changes will be reflected into the original data structure also.
d1 = {
    'a':[10,20,30],
    'b':[100,200,300]
}
df1 = pd.DataFrame(d1,copy=True)
df1.loc[0,'a'] = 100
print("Original Data",d1)
print("Modified Data",df1)


# When to Use copy=True?
# When you want to modify a DataFrame without affecting the original data.
# When working with NumPy arrays or slices of other DataFrames, to avoid unexpected changes.

Original Data {'a': [10, 20, 30], 'b': [100, 200, 300]}
Modified Data      a    b
0  100  100
1   20  200
2   30  300


# A.Creating the dataframe from the list

# 1.Creating Dataframe from List

In [62]:
# A single list will create a single-column DataFrame.
l1 = [10,20,30,40,50]
df = pd.DataFrame(l1,index=['a','b','c','d','e'],columns=["Values"])
print(df)

# The list elements become rows.
# The columns parameter gives the column a name.


   Values
a      10
b      20
c      30
d      40
e      50


# 2. Creating a DataFrame from a List of Lists

In [64]:
# A list of lists creates a multi-column DataFrame.

data = [
    [1,'Alice',25],
    [2,'BOB',24],
    [3,'Charlie',28]
    ]
df = pd.DataFrame(data,columns=['id','Name',"Age"])
print(df)

# Here the inner list becomes the individual row and columns parameter provide the column name.
# Each inner list represents a row.
# The columns parameter assigns column names.

   id     Name  Age
0   1    Alice   25
1   2      BOB   24
2   3  Charlie   28


# 3.Creating a DataFrame from a List of Dictionaries

In [66]:
data = [
    {'id':1,'Name':'Alice','age':25},
    {'id':2,'Name':'Bob','age':24},
    {'id':3,'Name':"Charlie",'age':28}
]
df = pd.DataFrame(data)
print(df)

# Each dictionary represents a row.

   id     Name  age
0   1    Alice   25
1   2      Bob   24
2   3  Charlie   28


# B.Creating the DataFrame from the Tuple.

# 1.DataFame from the tuple

In [73]:
t1 = (10,20,30,40)
df = pd.DataFrame(t1,columns=['Values'],index=['a','d','d','e'])
print(df)

# Here the tuple individual elements become row  of the DataFrame and column parameter provides the name of the column in the dataframe.

   Values
a      10
d      20
d      30
e      40


# 2.DataFrame from the list of tuple

In [74]:
data = [
    (1,'Alice',25),
    (2,'BOB',24),
    (3,'Charlie',28)
]

df = pd.DataFrame(data,columns=['Id','Name','Age'])
print(df)

# Each tuple represents a row.
# The columns parameter is used to assign column names.

   Id     Name  Age
0   1    Alice   25
1   2      BOB   24
2   3  Charlie   28


# 3.Creating a DataFrame from a Tuple of Lists

In [78]:
data = ([1,2,3],
        ['Alice',"BOB",'Charlie'],
       [19,29,19])
df = pd.DataFrame(data,columns=['id','Name','Age'])
print(df)

      id Name      Age
0      1    2        3
1  Alice  BOB  Charlie
2     19   29       19


# 4. Creating a DataFrame from a Dictionary of Tuples

In [80]:
d1 = {
    'id':(1,2,3),
    'Name':('Alice','Bob','Charlie'),
    'Age':(10,20,30)
}
df = pd.DataFrame(d1)
print(df)

# Each key in the dictionary is a column name.
# The tuple values become column values.

   id     Name  Age
0   1    Alice   10
1   2      Bob   20
2   3  Charlie   30


# 5.Creating a DataFrame from tuples of tuples

In [82]:
data = ((1,'Alice','10'),(2,'Bob',30),(3,'Charlie',30))
df = pd.DataFrame(data,columns=['id','Name','Age'])
print(df)

   id     Name Age
0   1    Alice  10
1   2      Bob  30
2   3  Charlie  30


# ------------------------------Indexing and Slicing--------------------------------------

In [2]:
# Indexing and slicing in Pandas allow you to access, modify, and extract specific portions of data from a DataFrame. 
# Pandas provides multiple ways to index and slice a DataFrame.

# I. Accessing Data in a DataFrame

In [3]:
# A DataFrame consists of rows (index) and columns (column labels). You can access data using:

# 1.Column labels
# 2.Row labels or index numbers
# 3.Boolean conditions

In [4]:
# Accessing the data using Columns

In [7]:
import pandas as pd

data = {
    'Name':['ALice','Bob','Charlie'],
    'Age':[10,20,30],
    'Salary':[12,15,16]
}
df = pd.DataFrame(data,index=['row1','row2','row3'])
print(df)

         Name  Age  Salary
row1    ALice   10      12
row2      Bob   20      15
row3  Charlie   30      16


In [9]:
print(df.Name)
print(type(df.Name))
# Here we have selected a single column it will return a single column which is a Series.

row1      ALice
row2        Bob
row3    Charlie
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


In [28]:
# col1 = df.Name
print(col1)

row1      ALice
row2        Bob
row3    Charlie
Name: Name, dtype: object


In [20]:
# Accessing the Multiple Columns

In [27]:
# Here we are Selecting multiple columns
df[['Name','Age']]

# Here, ['Name', 'Salary'] is a list of column names.
# df[...] expects a list when selecting multiple columns.
# The double brackets [[...]] indicate that we are passing a list to select multiple columns.
# The result is a DataFrame, not a Series.

Unnamed: 0,Name,Age
row1,ALice,10
row2,Bob,20
row3,Charlie,30


# II.Indexing Using .loc[] (Label-based Indexing)

In [30]:
# .loc[] is label-based indexing, meaning you use row labels and column labels to access data.
# In method start and stop are included.

# Selecting a Single Row by Label

In [31]:
data = {
    'Name':['ALice','Bob','Charlie'],
    'Age':[10,20,30],
    'Salary':[12,15,16]
}
df = pd.DataFrame(data,index=['row1','row2','row3'])
print(df)

         Name  Age  Salary
row1    ALice   10      12
row2      Bob   20      15
row3  Charlie   30      16


In [39]:
# The return type is a Series.
# Here the columns name becomes indexes When extracting a single column from the dataframe.
print(df.loc['row1'])
print(type(df.loc['row1']))

Name      ALice
Age          10
Salary       12
Name: row1, dtype: object
<class 'pandas.core.series.Series'>


# Selecting Multiple Rows

In [41]:
# Accessing the multiple rows using row labels
df.loc[['row1','row2']]

Unnamed: 0,Name,Age,Salary
row1,ALice,10,12
row2,Bob,20,15


# Selecting Specific Rows and Columns

In [47]:
# Here the first parameter must be row label and second one is column label.
print(df.loc['row1','Name'])

ALice


In [51]:
# Here we have slicing the rows and extracting the data of specify columns.
print(df.loc['row1':'row3',['Name','Age']])
print(type(df.loc['row1':'row3',['Name','Age']]))

         Name  Age
row1    ALice   10
row2      Bob   20
row3  Charlie   30
<class 'pandas.core.frame.DataFrame'>


# III.Indexing Using .iloc[] (Integer-based Indexing)

In [116]:
# .iloc[] is position-based indexing, meaning you use integer indices to access data.
# It works on row slicing and extraction.

# Selecting a Single Row

In [53]:
data = {
    'Name':['ALice','Bob','Charlie'],
    'Age':[10,20,30],
    'Salary':[12,15,16]
}
df = pd.DataFrame(data)
print(df)

      Name  Age  Salary
0    ALice   10      12
1      Bob   20      15
2  Charlie   30      16


In [55]:
# Here it return the Series.
print(df.iloc[0])

Name      ALice
Age          10
Salary       12
Name: 0, dtype: object


# Selecting Multiple Rows



In [68]:
# Here the start is included and stop is excluded
print(df.iloc[0:2])

    Name  Age  Salary
0  ALice   10      12
1    Bob   20      15


# Selecting Specific Rows and Columns

In [69]:
# If we pass the index greather than the index of the rows then bydefault the stop is replace with last index row of the dataframe.
print(df.iloc[0:10])

      Name  Age  Salary
0    ALice   10      12
1      Bob   20      15
2  Charlie   30      16


In [74]:
# Here the columns are index from 0 to upto to end....
print(df.iloc[0:3,0:1])

      Name
0    ALice
1      Bob
2  Charlie


# IV.Conditional Indexing (Boolean Masking)

In [82]:
# You can filter rows using Boolean conditions.
# It works on row selection.

# Selecting Rows Based on a Condition

In [75]:
data = {
    'Name':['ALice','Bob','Charlie'],
    'Age':[10,20,30],
    'Salary':[12,15,16]
}
df = pd.DataFrame(data)
print(df)

      Name  Age  Salary
0    ALice   10      12
1      Bob   20      15
2  Charlie   30      16


In [80]:
# Here we will get the Series with boolean values.
# It compare each value of Age column with 5
# It return true if the condition is met.
df.Age>5

0    True
1    True
2    True
Name: Age, dtype: bool

In [81]:
# Now, we pass this Boolean Series inside df[]:
df[df.Age>5]
# Pandas keeps only the rows where the Boolean Series is True.
# It returns the entire row, not just the "Age" column.

# Why Does This Work?
# df[boolean_series] performs row-wise selection.
# The Boolean Series acts as a mask, selecting rows where True.

Unnamed: 0,Name,Age,Salary
0,ALice,10,12
1,Bob,20,15
2,Charlie,30,16


# Selecting Rows with Multiple Conditions

In [90]:
# df['Age']>5 & df['Salary']>5
# The error occurs because of the incorrect use of the bitwise AND (&) operator in the expression:
# This is incorrect because 5 & df['Salary'] is not a valid operation.

In [96]:
# The expression (df['Age'] > 5) & (df['Salary'] > 5) returns a Boolean Series,
# where each value is True if both conditions are met for that row, and False otherwise.
(df['Age']>5) & (df['Salary']>5)
print(type((df['Age']>5) & (df['Salary']>5)))

<class 'pandas.core.series.Series'>


In [98]:
# Here we will get dataframe which satisfy the below two conditions.
df[(df['Age']>5) & (df['Salary']>5)]

Unnamed: 0,Name,Age,Salary
0,ALice,10,12
1,Bob,20,15
2,Charlie,30,16


# V. Slicing Rows Using : Operator

In [99]:
# Just like lists, Pandas allows row slicing using :

# Selecting Rows

In [101]:
data = {
    'Name':['ALice','Bob','Charlie'],
    'Age':[10,20,30],
    'Salary':[12,15,16]
}
df = pd.DataFrame(data)
print(df)

      Name  Age  Salary
0    ALice   10      12
1      Bob   20      15
2  Charlie   30      16


In [104]:
# Here it will return all the data. 
df[0:]

Unnamed: 0,Name,Age,Salary
0,ALice,10,12
1,Bob,20,15
2,Charlie,30,16


In [106]:
df[:10]

Unnamed: 0,Name,Age,Salary
0,ALice,10,12
1,Bob,20,15
2,Charlie,30,16


In [108]:
# Stop is an exclusive one.
df[0:1]

Unnamed: 0,Name,Age,Salary
0,ALice,10,12


In [109]:
df.index = ['a','b','c']
print(df)

      Name  Age  Salary
a    ALice   10      12
b      Bob   20      15
c  Charlie   30      16


In [110]:
df['a':]

Unnamed: 0,Name,Age,Salary
a,ALice,10,12
b,Bob,20,15
c,Charlie,30,16


In [111]:
df['b':]

Unnamed: 0,Name,Age,Salary
b,Bob,20,15
c,Charlie,30,16


In [114]:
df['Name':'Salary']
# The expression df['Name':'Salary'] returns an empty DataFrame because label-based slicing on columns does not work this way in pandas.

# Understanding the Issue
# In pandas, when slicing using labels (like 'Name':'Salary'), it works only for row labels (index), not column labels.

# Incorrect Usage:
# df['Name':'Salary']  # Does NOT work for selecting columns
# This will return an empty DataFrame because pandas treats 'Name':'Salary' as an attempt to slice row index labels, but "Name" and "Salary" are column names, not row labels.

Unnamed: 0,Name,Age,Salary


In [115]:
df['Name':'Salary']
# When using a range (:), pandas interprets it as row slicing, NOT column slicing.
# Pandas tries to find rows with labels 'Name' to 'Salary'.
# Since the index (row labels) don't contain 'Name' or 'Salary', it returns an empty DataFrame.

Unnamed: 0,Name,Age,Salary
