In [3]:
import pandas as pd


In [4]:
data_dict ={'Name':['Ashish','Raj','Ayush','Shivam','Pradeep'],
            'Age':[20,21,10,12,19],
            'City':['Bihar','Patna','Bihar','shivam','Gaya']}

df= pd.DataFrame(data_dict)
print("DataFrame from dictionary")
print(df)

DataFrame from dictionary
      Name  Age    City
0   Ashish   20   Bihar
1      Raj   21   Patna
2    Ayush   10   Bihar
3   Shivam   12  shivam
4  Pradeep   19    Gaya


In [5]:
# Accessing Row
print(df["Name"])
#Accessing Rows
print(df.iloc[1])
print(df.loc[0])

0     Ashish
1        Raj
2      Ayush
3     Shivam
4    Pradeep
Name: Name, dtype: object
Name      Raj
Age        21
City    Patna
Name: 1, dtype: object
Name    Ashish
Age         20
City     Bihar
Name: 0, dtype: object


In [6]:
# Creating a DataFrame from a List of Lists
data_list = [
    ["Alice", 25, 50000],
    ["Bob", 30, 60000],
    ["Charlie", 35, 70000]
]

# Creating DataFrame
df = pd.DataFrame(data_list, columns=["Name", "Age", "Salary"])

print(df)


      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000


In [7]:
#Creating a DataFrame from a NumPy Array
import numpy as np

# Creating a NumPy array
data = np.array([
    [101, "Alice", 25, 50000],
    [102, "Bob", 30, 60000],
    [103, "Charlie", 35, 70000]
])

# Creating DataFrame
df = pd.DataFrame(data, columns=["ID", "Name", "Age", "Salary"])

print(df)



    ID     Name Age Salary
0  101    Alice  25  50000
1  102      Bob  30  60000
2  103  Charlie  35  70000


In [8]:
# 📌 Exploring Data in Pandas
  head(n) & tail(n) – View the first & last n rows
  info() – Summary of dataset (data types, memory usage)
  describe() – Statistical summary (mean, min, max, etc.)
  shape & columns – Get dimensions & column names
  value_counts() – Count unique values in a column


SyntaxError: source code string cannot contain null bytes (<string>)

In [None]:
import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank"],
    "Age": [25, 30, 35, 40, 45, 50],
    "Salary": [50000, 60000, 70000, 80000, 90000, 100000]
}

df = pd.DataFrame(data)

# View first 3 rows
print(df.head(3))

# View last 2 rows
print(df.tail(2))
#2️⃣ info() – Summary of Dataset
print(df.info())
#3️⃣ describe() – Statistical Summary
print(df.describe())
#4️⃣ shape & columns – Get Dimensions & Column Names
print(df.shape)      # Output: (6, 3) -> 6 rows, 3 columns
print(df.columns)    # Output: Index(['Name', 'Age', 'Salary'], dtype='object')
#5️⃣ value_counts() – Count Unique Values in a Column
df["Age"].value_counts()


**DATA CLEANING & MANIPULATION**

In [None]:
#Handling Missing Values
import pandas as pd

# Creating a sample DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, None, 22, 29],
    'City': ['New York', None, 'Los Angeles', 'Chicago', 'Houston'],
    'Salary': [50000, 60000, 55000, None, 65000]
}
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Checking for missing values
print("\nMissing Values Count:")
print(df.isnull().sum())

# Handling missing values
# 1. Dropping rows with missing values
df_dropna = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_dropna)

# 2. Filling missing values with a specific value
df_fillna = df.fillna({'Age': df['Age'].mean(), 'City': 'Unknown', 'Salary': df['Salary'].median()})
print("\nDataFrame after filling missing values:")
print(df_fillna)

# 3. Interpolating missing values
df_interpolated = df.interpolate()
print("\nDataFrame after interpolation:")
print(df_interpolated)

# 4. Replacing specific values
df_replaced = df_fillna.replace({'Unknown': 'Not Specified'})
print("\nDataFrame after replacing 'Unknown' with 'Not Specified':")
print(df_replaced)


In [None]:
#Filtering Data
# Filtering Data using loc[]
filtered_df_loc = df_fillna.loc[df_fillna['Age'] > 25]
print("\nFiltered DataFrame (Age > 25) using loc[]:")
print(filtered_df_loc)

# Filtering Data using query()
filtered_df_query = df_fillna.query("Salary > 55000")
print("\nFiltered DataFrame (Salary > 55000) using query():")
print(filtered_df_query)


In [None]:
#Grouping & Aggregation-for summary statistics
# Grouping & Aggregation using groupby()
grouped = df_fillna.groupby('City').agg({'Age': 'mean', 'Salary': 'sum'})
print("\nGrouped & Aggregated Data (Mean Age, Sum Salary by City):")
print(grouped)

In [None]:
#Merging & Joining-for combining datasets
# Merging & Joining using merge() and concat()
additional_data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Department': ['HR', 'IT', 'Finance', 'Marketing', 'Sales']
})
merged_df = pd.merge(df_fillna, additional_data, on='Name')
print("\nMerged DataFrame (Using merge() on Name):")
print(merged_df)

# Concatenating another dataset
extra_data = pd.DataFrame({
    'Name': ['Frank', 'Grace'],
    'Age': [27, 31],
    'City': ['Seattle', 'Boston'],
    'Salary': [58000, 62000]
})
concatenated_df = pd.concat([df_fillna, extra_data], ignore_index=True)
print("\nConcatenated DataFrame (Using concat() to add new rows):")
print(concatenated_df)



**Merging and Combining Data**

In [10]:
left = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Score': [85, 90, 78, 88]
})

right = pd.DataFrame({
    'ID': [3, 4, 5, 6],
    'Name': ['Charlie', 'David', 'Eve', 'Frank'],
    'Grade': ['B', 'A', 'C', 'B']
})

display(left, right)

Unnamed: 0,ID,Name,Score
0,1,Alice,85
1,2,Bob,90
2,3,Charlie,78
3,4,David,88


Unnamed: 0,ID,Name,Grade
0,3,Charlie,B
1,4,David,A
2,5,Eve,C
3,6,Frank,B


1. merge(): Joining two datasets based on a common column

Inner Join (Default) - Keeps only matching IDs

In [11]:
merged_inner = left.merge(right, on='ID', how='inner')
display(merged_inner)

Unnamed: 0,ID,Name_x,Score,Name_y,Grade
0,3,Charlie,78,Charlie,B
1,4,David,88,David,A


Left Join - Keeps all from left, fills NaN for missing data from right

In [12]:
grouped_left = left.merge(right, on='ID', how='left')
display(grouped_left)

Unnamed: 0,ID,Name_x,Score,Name_y,Grade
0,1,Alice,85,,
1,2,Bob,90,,
2,3,Charlie,78,Charlie,B
3,4,David,88,David,A


Outer Join - Keeps all records from both datasets

In [13]:
grouped_outer = left.merge(right, on='ID', how='outer')
display(grouped_outer)

Unnamed: 0,ID,Name_x,Score,Name_y,Grade
0,1,Alice,85.0,,
1,2,Bob,90.0,,
2,3,Charlie,78.0,Charlie,B
3,4,David,88.0,David,A
4,5,,,Eve,C
5,6,,,Frank,B


2. concat(): Stacking datasets vertically or horizontally

In [14]:
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
})
df2 = pd.DataFrame({
    'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5']
})

In [15]:
# Vertical Concatenation (axis=0)
df_concat_v = pd.concat([df1, df2], axis=0)
display(df_concat_v)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
0,A3,B3
1,A4,B4
2,A5,B5


In [16]:
# Horizontal Concatenation (axis=1)
df3 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2']
})

In [17]:
df_concat_h = pd.concat([df1, df3], axis=1)
display(df_concat_h)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


3. join(): Similar to merge() but works on index


In [18]:
left_indexed = left.set_index('ID')
right_indexed = right.set_index('ID')

In [19]:
# Joining on index
joined_df = left_indexed.join(right_indexed, how='inner', lsuffix='_left', rsuffix='_right')
display(joined_df)

Unnamed: 0_level_0,Name_left,Score,Name_right,Grade
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,Charlie,78,Charlie,B
4,David,88,David,A


4. combine_first(): Filling missing values from another dataset

In [21]:
df_a = pd.DataFrame({
    'A': [np.nan, 2, np.nan, 4],
    'B': [5, np.nan, 7, np.nan]
})
df_b = pd.DataFrame({
    'A': [1, np.nan, 3, np.nan],
    'B': [np.nan, 6, np.nan, 8]
})

In [22]:
# Filling missing values using another DataFrame
df_combined = df_a.combine_first(df_b)
display(df_combined)

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0
