DATAFRAME CREATION

In [1]:
import pandas as pd
# Column-oriented input: each key becomes a column name and each list holds
# that column's values in row order.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[24, 27, 22, 32],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston'],
)

# Build the frame from the dict (default orientation: keys are columns).
df = pd.DataFrame.from_dict(data)
print("DataFrame created from a dictionary:", df, sep="\n")


DataFrame created from a dictionary:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston


In [None]:
# Load a DataFrame from a CSV file.
# The path below is a placeholder: point CSV_PATH at a real file before running.
# The read is guarded so that running the whole notebook does not abort with
# FileNotFoundError while the placeholder is still in place.
from pathlib import Path

CSV_PATH = Path('path/to/your/file.csv')

if CSV_PATH.is_file():
    df_csv = pd.read_csv(CSV_PATH)
    print("DataFrame created from a CSV file:")
    print(df_csv.head())
else:
    # df_csv is intentionally left undefined when there is nothing to load.
    print(f"CSV file not found: {CSV_PATH} (edit CSV_PATH to point at your data)")


DATA INSPECTION

In [2]:
# Structural overview of `df`: leading/trailing rows, dimensions, column
# labels, per-column dtypes, and the info() summary.
# head()/tail() return at most 5 rows by default; on a frame with fewer rows
# they both show every row.
print("First 5 rows of the DataFrame:")
print(df.head())
print("\nLast 5 rows of the DataFrame:")
print(df.tail())
print("\nShape of the DataFrame:")
print(df.shape)
print("\nColumn names of the DataFrame:")
print(df.columns)
print("\nData types of the DataFrame:")
print(df.dtypes)
print("\nSummary of the DataFrame:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()


First 5 rows of the DataFrame:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston

Last 5 rows of the DataFrame:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston

Shape of the DataFrame:
(4, 3)

Column names of the DataFrame:
Index(['Name', 'Age', 'City'], dtype='object')

Data types of the DataFrame:
Name    object
Age      int64
City    object
dtype: object

Summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes
None


DATA EXPLORATION

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) via describe().
print("\nDescriptive statistics of the DataFrame:", df.describe(), sep="\n")

# Number of missing entries in each column.
print("\nMissing values in the DataFrame:", df.isna().sum(), sep="\n")

# How many times each distinct city occurs.
print("\nValue counts for the 'City' column:", df.City.value_counts(), sep="\n")

# Average age within each city group.
print("\nMean age by city:", df['Age'].groupby(df['City']).mean(), sep="\n")

# Boolean-mask filter: keep only the rows whose Age exceeds 25.
print("\nRows where age is greater than 25:", df.loc[df['Age'].gt(25)], sep="\n")



Descriptive statistics of the DataFrame:
             Age
count   4.000000
mean   26.250000
std     4.349329
min    22.000000
25%    23.500000
50%    25.500000
75%    28.250000
max    32.000000

Missing values in the DataFrame:
Name    0
Age     0
City    0
dtype: int64

Value counts for the 'City' column:
City
New York       1
Los Angeles    1
Chicago        1
Houston        1
Name: count, dtype: int64

Mean age by city:
City
Chicago        22.0
Houston        32.0
Los Angeles    27.0
New York       24.0
Name: Age, dtype: float64

Rows where age is greater than 25:
    Name  Age         City
1    Bob   27  Los Angeles
3  David   32      Houston


In [4]:
# Column selection: a single label yields a Series, a list of labels yields
# a DataFrame.
print("Selecting the 'Name' column:")
print(df['Name'])
print("Selecting the 'Name' and 'Age' columns:")
print(df[['Name', 'Age']])


# Positional row selection with .iloc (0-based positions).
print("Selecting the first row:")
print(df.iloc[0])
print("Selecting the first and third rows:")
print(df.iloc[[0, 2]])


# Label-based row selection with .loc.
# The lookup runs on a temporary frame indexed by 'Name' rather than swapping
# df's index in place and restoring it afterwards. `df` itself is never
# modified, so a failed lookup or an interrupted cell cannot leave it indexed
# by 'Name' and break the column selections above on re-run.
print("Selecting the row with label 'Alice':")
print(df.set_index('Name').loc['Alice'])


Selecting the 'Name' column:
0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object
Selecting the 'Name' and 'Age' columns:
      Name  Age
0    Alice   24
1      Bob   27
2  Charlie   22
3    David   32
Selecting the first row:
Name       Alice
Age           24
City    New York
Name: 0, dtype: object
Selecting the first and third rows:
      Name  Age      City
0    Alice   24  New York
2  Charlie   22   Chicago
Selecting the row with label 'Alice':
Age           24
City    New York
Name: Alice, dtype: object


DATA MANIPULATION

In [5]:
# NOTE: this cell changes `df` step by step (adds/drops a column, renames
# columns, rescales ages), so it is meant to be run once on a freshly
# created frame.

# Add a column: one salary value per existing row, in row order.
df['Salary'] = [70_000, 80_000, 50_000, 120_000]
print("DataFrame after adding 'Salary' column:", df, sep="\n")

# Modify a column: bump every age by one year.
df['Age'] = df['Age'].add(1)
print("DataFrame after modifying 'Age' column:", df, sep="\n")

# Rename columns: 'Name' -> 'Employee Name', 'Age' -> 'Employee Age'.
df = df.rename(columns={'Name': 'Employee Name', 'Age': 'Employee Age'})
print("DataFrame after renaming columns:", df, sep="\n")

# Drop the 'Salary' column again.
df = df.drop(columns='Salary')
print("DataFrame after dropping 'Salary' column:", df, sep="\n")

# Apply a function element-wise: double every employee age.
df['Employee Age'] = df['Employee Age'].apply(lambda age: age * 2)
print("DataFrame after applying a function to 'Employee Age' column:", df, sep="\n")

# Second frame: department lookup keyed by employee name.
data2 = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Department=['HR', 'Engineering', 'Marketing', 'Finance'],
)
df2 = pd.DataFrame(data2)

# Merge on the name columns (default inner join). The key columns have
# different labels on each side, so both 'Employee Name' and 'Name' are kept
# in the result.
merged_df = df.merge(df2, left_on='Employee Name', right_on='Name')
print("Merged DataFrame:", merged_df, sep="\n")


DataFrame after adding 'Salary' column:
      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   50000
3    David   32      Houston  120000
DataFrame after modifying 'Age' column:
      Name  Age         City  Salary
0    Alice   25     New York   70000
1      Bob   28  Los Angeles   80000
2  Charlie   23      Chicago   50000
3    David   33      Houston  120000
DataFrame after renaming columns:
  Employee Name  Employee Age         City  Salary
0         Alice            25     New York   70000
1           Bob            28  Los Angeles   80000
2       Charlie            23      Chicago   50000
3         David            33      Houston  120000
DataFrame after dropping 'Salary' column:
  Employee Name  Employee Age         City
0         Alice            25     New York
1           Bob            28  Los Angeles
2       Charlie            23      Chicago
3         David            33      Houston
Data