# DataFrame in Pandas

In [2]:
# A DataFrame in Pandas is a two-dimensional, tabular data structure similar to an Excel spreadsheet or SQL table. 
# It consists of rows and columns, where:
# Rows represent different records (entries).
# Columns represent different attributes (fields).
# A DataFrame is built using the pandas.DataFrame() function and can hold different data types (integers, floats, strings, etc.) in different columns.

# pd.DataFrame()

In [3]:
# The pd.DataFrame() constructor is used to create a DataFrame in Pandas. 
# It allows converting lists, dictionaries, NumPy arrays, Series, etc., into tabular structures.

In [None]:
# Syntax:
# pd.DataFrame(data, index=None, columns=None, dtype=None, copy=False)

# Explanation of Parameters:\t
# Parameter	Description	Default Value
# data	The main data source, which can be a list, dictionary, NumPy array, Series, or another DataFrame.	Required
# index	Labels for the rows. If None, Pandas automatically assigns numeric indices (0, 1, 2, …).	None
# columns	Labels for the columns. If None, Pandas auto-generates column names when using dictionaries or structured data.	None
# dtype	Specifies the data type for all columns. If None, Pandas infers the type automatically.	None
# copy	If True, creates a copy of the input data instead of referencing it.	False

In [11]:
!pip install pandas 



# 1.Creating a DataFrame from a List

In [3]:
import pandas as pd
l1 = [[10,20,30],[40,50,60],[70,80,90]]
df = pd.DataFrame(l1)
print(df)

# The list contains 3 rows and 3 columns.
# Since no column names are provided, Pandas assigns default column labels (0, 1, 2).
# Row indices are automatically assigned (0, 1, 2).

    0   1   2
0  10  20  30
1  40  50  60
2  70  80  90


# 2.Creating a DataFrame with Custom Index and Columns

In [20]:
l1 = [[10,20,30],[40,50,60],[70,80,90]]
df = pd.DataFrame(l1,index=['Row1','Row2','Row3'],columns=['S','D','F'])
print(df)

# index=['Row1', 'Row2', 'Row3'] → Sets row labels.
# columns=['S', 'D', 'F'] → Sets column names.

       S   D   F
Row1  10  20  30
Row2  40  50  60
Row3  70  80  90


# 3.Using the dtype Parameter

In [28]:
l1 = [[10,20,30],[40,50,60],[70,80,90]]
df = pd.DataFrame(l1,dtype=float)
print(df)

# If the dataframe has string then it fails.

      0     1     2
0  10.0  20.0  30.0
1  40.0  50.0  60.0
2  70.0  80.0  90.0


# 4.Using the copy Parameter

In [43]:
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame(df1, copy=False)  

df1.loc[0, 'A'] = 100

print(df2)

# If the input is already a Pandas object (like another DataFrame), 
# then copy=False might prevent a copy and retain the reference.

     A  B
0  100  4
1    2  5
2    3  6


# Data Structures That Can Be Used to Create a Pandas DataFrame

In [44]:
# A DataFrame in pandas can be created from multiple types of data structures like lists, dictionaries, NumPy arrays, and more. 
# Below is a detailed explanation with examples.

# 1.Creating a DataFrame from a Dictionary (dict)

In [46]:
d1 = {'Name':['Alice','GOGO','Jam'],'Age':[10,20,30]}
df = pd.DataFrame(d1)
print(df)
print(type(df))

# Each key represents a column, and each list contains the column values.

    Name  Age
0  Alice   10
1   GOGO   20
2    Jam   30
<class 'pandas.core.frame.DataFrame'>


# 2.Creating a DataFrame from a List of Lists (list)

In [48]:
l1 = [[10,20,30],[40,50,60],[70,80,90]]
df = pd.DataFrame(l1)
print(df)

# Without column names, Pandas assigns default numeric column labels.

    0   1   2
0  10  20  30
1  40  50  60
2  70  80  90


# 3.Creating a DataFrame from a List of Dictionaries (list of dicts)

In [49]:
l1=[
    {'Name':'Alice','Age':10},
    {'Name':'Jam','Age':20}
]
df = pd.DataFrame(l1)
print(df)

    Name  Age
0  Alice   10
1    Jam   20


# 4.Creating a DataFrame from a NumPy Array (numpy.ndarray)

In [51]:
import numpy as np
arr2 = np.array([[10,20,30],[40,50,60]])
df = pd.DataFrame(arr2,index=['r1','r2'],columns=['c1','c2','c3'])
print(df)

    c1  c2  c3
r1  10  20  30
r2  40  50  60


# 5.Creating a DataFrame from a Pandas Series (pd.Series)

In [64]:
# A Series can be converted to a DataFrame, either as a single-column DataFrame or multiple Series combined.
s1 = pd.Series([10,20,30],name='Numbers')
df = pd.DataFrame(s1,s1)
print(df)

# data = s1 → The values [10, 20, 30] from s1 are used as the data.
# index = s1 → The values [10, 20, 30] are used as the index (row labels).
# Column names are taken from s1 automatically, so "Numbers" becomes the column header.

         Numbers
Numbers         
10           NaN
20           NaN
30           NaN


In [65]:
# Step-by-step Explanation

# 1.Setting Row Index
# Since index=s1, the row labels become 10, 20, 30.

# 2.Setting Column Name
# Since s1 has a name "Numbers", Pandas keeps that as the column name.

# 3.Pandas is Expecting a Table Structure
# Normally, when you create a DataFrame, you have rows and columns with actual data.
# Here, because we used s1 for both data and index, Pandas is expecting to find values at row index 10, 20, 30.
# But there are no such values mapped to these row indices, so it fills everything with NaN.

# Properties of a Pandas DataFrame

In [1]:
# A DataFrame in Pandas has several properties (also called attributes) that help in understanding and manipulating it. 
# Below are the key properties of a DataFrame:

In [24]:
# 1 .df.ndim → Number of Dimensions
# Returns the number of dimensions (always 2 for DataFrames).
d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The dimension is",df.ndim)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The dimension is 2


In [23]:
# 2 df.shape → Dimensions of the DataFrame
# Returns the (number of rows, number of columns) as a tuple.

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The Shape is",df.ndim)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The Shape is 2


In [22]:
# 3.df.dtypes → Data Types of Each Column
# Returns the data type of each column.

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The dtype of each column is")
print(df.dtypes)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The dtype of each column is
Name    object
Age      int64
dtype: object


In [21]:
# 4. df.size → Total Number of Elements
# Returns total elements (rows × columns).

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The size of the dataframe is")
print(df.size)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The size of the dataframe is
6


In [20]:
# 5.df.index → Index Labels (Row Labels)
# Returns the index object of the DataFrame.

l1 = [[10,20,30],[40,50,60]]
df =  pd.DataFrame(l1,index=['r1','r2'])
print()
print("The indexes of dataframe is")
print(df.index)
print(type(df.index))


The indexes of dataframe is
Index(['r1', 'r2'], dtype='object')
<class 'pandas.core.indexes.base.Index'>


In [19]:
# 6.df.columns → Column Labels
# Returns the column names as an Index object.

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The names of the dataframe is")
print(df.columns)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The names of the dataframe is
Index(['Name', 'Age'], dtype='object')


In [18]:
# 7.df.values → Converts DataFrame to NumPy Array
# Returns the DataFrame as a NumPy array (without labels).

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print("The values of the dataframe is")
print(df.values)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

The values of the dataframe is
[['Alice' 10]
 ['Jam' 20]
 ['Jack' 30]]


In [26]:
# 8.df.empty → Checks if DataFrame is Empty
# Returns True if the DataFrame has no elements.

d1 = {'Name':["Alice",'Jam','Jack'],
      'Age':[10,20,30]   
     }
df = pd.DataFrame(d1)
print(df)
print()
print(df.empty)

    Name  Age
0  Alice   10
1    Jam   20
2   Jack   30

False


# DataFrame Methods in Pandas

In [28]:
# A DataFrame in Pandas has a variety of built-in methods that help in data manipulation, transformation, and analysis.

# Data Inspection Methods

In [30]:
# These methods help in understanding the structure of a DataFrame.

In [31]:
# 1.df.info() → Summary of DataFrame
# Gives a concise summary including:
# Number of non-null values
# Data types
# Memory usage

In [33]:
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      int64  
 1   B       3 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 180.0 bytes


In [34]:
# 2.df.describe(percentiles=None, include=None, exclude=None) → Summary Statistics
# Generates summary statistics for numerical data.

# 🔹 Parameters:

# Parameter	Description
# percentiles	List of percentiles (e.g., [0.25, 0.5, 0.75]). Default is [0.25, 0.5, 0.75].
# include	Include specific data types (e.g., include='all').
# exclude	Exclude specific data types (e.g., exclude=['int']).

In [36]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.describe()

Unnamed: 0,A,B
count,3.0,3.0
mean,2.0,5.5
std,1.0,1.0
min,1.0,4.5
25%,1.5,5.0
50%,2.0,5.5
75%,2.5,6.0
max,3.0,6.5


In [37]:
# Quartiles split your data into four equal parts so you can understand how the data is distributed.

# Think of quartiles like dividing a pizza into four equal parts:

# Q1 (First Quartile - 25%) → The value below which 25% of data falls.
# Q2 (Second Quartile - 50% or Median) → The value below which 50% of data falls.
# Q3 (Third Quartile - 75%) → The value below which 75% of data falls.
# Q4 (Maximum - 100%) → The highest value in the dataset.



# Selection & Filtering Methods

In [38]:
# These methods help in selecting specific rows/columns.

In [39]:
# 3.df.head(n=5) → First n Rows
# Returns the first n rows (default is 5).

# Parameter:
# n → Number of rows to return.

In [41]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.head(2)

Unnamed: 0,A,B
0,1,4.5
1,2,5.5


In [46]:
# 4.df.tail(n=5) → Last n Rows
# Returns last n rows (default is 5).

In [45]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.tail(2)

Unnamed: 0,A,B
1,2,5.5
2,3,6.5


In [43]:
# 5.df.loc[row_labels, column_labels] → Label-based Selection
# Selects rows & columns using labels.

In [47]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.loc[0,'A']

1

In [48]:
# 6.df.iloc[row_index, column_index] → Index-based Selection
# Selects rows & columns using integer indexes.

In [49]:
df = pd.DataFrame([[10,20,30],[40,50,60]])
df.loc[0,1]

20

# Sorting Methods

In [50]:
# These methods help in sorting a DataFrame.

In [51]:
# 7.df.sort_values(by, ascending=True, inplace=False) → Sort by Column
# Sorts values based on a column.

# 🔹 Parameters:
# by	Column name(s) to sort by.
# ascending	Sort order (True for ascending, False for descending).
# inplace	Modify original DataFrame (False by default).

In [53]:
df = pd.DataFrame([[10,20,30],[40,50,60]])
df.sort_values(by=[0,1],ascending=False)

Unnamed: 0,0,1,2
1,40,50,60
0,10,20,30


In [54]:
# 8.df.sort_index(ascending=True, inplace=False) → Sort by Index
# Sorts rows based on index.
# df.sort_index()

In [59]:
df = pd.DataFrame([[10,20,30],[40,50,60]])
df.sort_index()

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60


# Data Manipulation Methods

In [60]:
# These methods help in modifying DataFrames.

In [61]:
# 9.df.rename(columns={'old_name': 'new_name'}) → Rename Columns
# Renames column labels.
# return the new dataframe with the changed column names.

In [66]:
df = pd.DataFrame([[10,20,30],[40,50,60]])
df.rename(columns={0:'Col1',1:'col2',2:'col3'})

Unnamed: 0,Col1,col2,col3
0,10,20,30
1,40,50,60


In [72]:
# 10 . df.drop(labels, axis=0, inplace=False) → Drop Rows/Columns
# Removes specific rows/columns.
# Return the new DataFrame with droped rows or columns.

# 🔹 Parameters:
# labels	Name(s) of rows/columns to drop.
# axis	0 → Drop rows, 1 → Drop columns.
# inplace	Modify original DataFrame (False by default).

In [73]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
df.drop(labels=[0,1],axis=0)

Unnamed: 0,A,B
2,3,6.5


# Aggregation Methods

In [76]:
# Methods to perform calculations.

In [77]:
# 11 df.mean(axis=0) → Column-wise Mean
# Returns the mean of all values.

In [86]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
print(df)
print()
# Mean by rows 
print(df.mean(axis=0))
print()

# Mean by Column
print(df.mean(axis=1))

   A    B
0  1  4.5
1  2  5.5
2  3  6.5

A    2.0
B    5.5
dtype: float64

0    2.75
1    3.75
2    4.75
dtype: float64


In [87]:
# 12.df.sum(axis=0) → Column-wise Sum
# Returns the sum of all values.

In [89]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.5, 5.5, 6.5]})
print(df)
print()

# Mean by rows (Summation of all the rows)
print(df.sum(axis=0))
print()

# Mean by Column (Summation of all the columns)
print(df.sum(axis=1))

   A    B
0  1  4.5
1  2  5.5
2  3  6.5

A     6.0
B    16.5
dtype: float64

0    5.5
1    7.5
2    9.5
dtype: float64


In [90]:
# 13.df.count() → Count Non-Null Values
# Counts non-null values.

In [98]:
l1=[
    {'Name':'Alice','Age':None},
    {'Name':'Jam','Age':20},
    {'Name':None,'Age':None}
]
df = pd.DataFrame(l1)
print(df)
print(df.count())

    Name   Age
0  Alice   NaN
1    Jam  20.0
2   None   NaN
Name    2
Age     1
dtype: int64


In [99]:
df.count(axis=0)

Name    2
Age     1
dtype: int64

In [100]:
df.count(axis=1)

0    1
1    2
2    0
dtype: int64

In [105]:
# 14. df.isnull() → Check for NaN Values
# Returns True for missing values.
# Return the new DataFrame containing True for the missing values and False for the non-null values.

In [106]:
l1=[
    {'Name':'Alice','Age':None},
    {'Name':'Jam','Age':20},
    {'Name':None,'Age':None}
]
df = pd.DataFrame(l1)
print(df)
print()
print(df.isnull())

    Name   Age
0  Alice   NaN
1    Jam  20.0
2   None   NaN

    Name    Age
0  False   True
1  False  False
2   True   True


In [107]:
# 15. df.fillna(value) → Fill Missing Values
# Replaces NaN with a specific value.

In [108]:
l1=[
    {'Name':'Alice','Age':None},
    {'Name':'Jam','Age':20},
    {'Name':None,'Age':None}
]
df = pd.DataFrame(l1)
print(df)
print()
print(df.fillna(100))

    Name   Age
0  Alice   NaN
1    Jam  20.0
2   None   NaN

    Name    Age
0  Alice  100.0
1    Jam   20.0
2    100  100.0


In [111]:
# 16 df.dropna() → Drop Missing Values
# Removes rows/columns with missing values.

In [112]:
l1=[
    {'Name':'Alice','Age':None},
    {'Name':'Jam','Age':20},
    {'Name':None,'Age':None}
]
df = pd.DataFrame(l1)
print(df)

    Name   Age
0  Alice   NaN
1    Jam  20.0
2   None   NaN



In [113]:
print(df.dropna())

  Name   Age
1  Jam  20.0


In [116]:
print(df.dropna(axis=0))

  Name   Age
1  Jam  20.0


In [118]:
print(df.dropna(axis=1))

Empty DataFrame
Columns: []
Index: [0, 1, 2]
