In [1]:
# Phase 4: Pandas Fundamentals
# Introduction to Pandas - Python Data Analysis Library

import pandas as pd
import numpy as np

# Report library versions first so the environment can be reproduced.
for _label, _library in (('Pandas version:', pd), ('NumPy version:', np)):
    print(_label, _library.__version__)

# Confirmation banner plus a short overview of the two core data structures,
# emitted as one multi-line message.
print(
    "\n✅ Pandas is successfully installed!",
    "📊 Pandas provides two main data structures:",
    "   1. Series - 1D labeled array",
    "   2. DataFrame - 2D labeled data structure (like a table)",
    sep="\n",
)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Creating Pandas Series
# Four construction routes are shown; each result is printed for inspection.

# 1) Plain list -> pandas supplies a default integer index
series1 = pd.Series(data=[10, 20, 30, 40, 50])
print("Series from list:", series1, sep="\n")
print("\nType:", type(series1), end="\n\n")

# 2) Same values, but with explicit string labels as the index
series2 = pd.Series(data=[10, 20, 30, 40, 50], index=list('abcde'))
print("Series with custom index:", series2, sep="\n", end="\n\n")

# 3) Dictionary -> keys become the index, values become the data
data_dict = {
    'Apple': 150,
    'Banana': 80,
    'Cherry': 120,
    'Date': 95,
}
series3 = pd.Series(data=data_dict)
print("Series from dictionary:", series3, sep="\n", end="\n\n")

# 4) NumPy array paired with weekday labels
np_array = np.arange(1, 6)
series4 = pd.Series(data=np_array, index=['Mon', 'Tue', 'Wed', 'Thu', 'Fri'])
print("Series from NumPy array:", series4, sep="\n")


In [None]:
# Creating Pandas DataFrames

# Method 1: From a dictionary
# Each key is a column label and each list holds that column's values.
data = {
    'Name': ['John', 'Emma', 'Michael', 'Sophia', 'William'],
    'Age': [28, 24, 35, 29, 31],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'Salary': [75000, 82000, 95000, 68000, 88000],
}
df = pd.DataFrame(data=data)

print("DataFrame from dictionary:", df, sep="\n")
# Structural metadata: (rows, columns), column labels and row labels.
print("\nDataFrame shape:", df.shape)
print("DataFrame columns:", list(df.columns))
print("DataFrame index:", list(df.index), end="\n\n")

# Method 2: From a list of lists
# Every inner list is one row; the column labels are passed separately.
data_list = [
    ['Alice', 26, 'Boston', 72000],
    ['Bob', 32, 'Seattle', 91000],
    ['Carol', 28, 'Miami', 79000],
]
df2 = pd.DataFrame(data=data_list, columns=['Name', 'Age', 'City', 'Salary'])

print("DataFrame from list of lists:", df2, sep="\n", end="\n\n")

# Method 3: From NumPy array
# A NumPy array has exactly one dtype. Mixing integers and strings without an
# explicit dtype makes NumPy convert every element to a string ('1', '100', ...),
# which would leave the ID and Value columns holding text instead of numbers.
# dtype=object keeps the original Python values untouched, and infer_objects()
# then lets pandas pick a proper dtype per column (integers for ID and Value).
np_data = np.array([
    [1, 100, 'A'],
    [2, 200, 'B'],
    [3, 300, 'C']
], dtype=object)

df3 = pd.DataFrame(np_data, columns=['ID', 'Value', 'Grade']).infer_objects()
print("DataFrame from NumPy array:")
print(df3)


In [None]:
# Basic DataFrame Operations and Information
# Builds a small employee table and walks through the usual inspection helpers.

# Create sample DataFrame
employees = pd.DataFrame({
    'Employee_ID': [101, 102, 103, 104, 105, 106, 107, 108],
    'Name': ['John', 'Emma', 'Michael', 'Sophia', 'William', 'Olivia', 'James', 'Ava'],
    'Department': ['Sales', 'IT', 'HR', 'Sales', 'IT', 'HR', 'Sales', 'IT'],
    'Salary': [75000, 82000, 68000, 79000, 88000, 71000, 85000, 90000],
    'Years_Experience': [5, 3, 8, 4, 6, 2, 7, 5]
})

print("Sample DataFrame:")
print(employees)
print("\n" + "="*60)

# Display first few rows
print("\nFirst 3 rows (head):")
print(employees.head(3))

# Display last few rows
print("\nLast 3 rows (tail):")
print(employees.tail(3))

# DataFrame info
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("\nDataFrame Info:")
employees.info()

# Basic statistics (count, mean, std, min, quartiles, max of numeric columns)
print("\nBasic Statistics:")
print(employees.describe())

# Column data types
print("\nData types:")
print(employees.dtypes)


In [None]:
# Data Selection and Filtering
# Works on the `employees` DataFrame; that name must already exist in the kernel.

# Selecting single column (returns Series)
print("Select single column (Name):", employees['Name'], sep="\n")
print("\nType:", type(employees['Name']), end="\n\n")

# Selecting multiple columns (returns DataFrame)
print("Select multiple columns:")
print(employees.loc[:, ['Name', 'Salary', 'Department']], end="\n\n")

# Selecting rows by index position (iloc)
print("Select row at index 2 using iloc:")
print(employees.iloc[2], end="\n\n")

print("Select first 3 rows and first 2 columns:")
print(employees.iloc[:3, :2], end="\n\n")

# Selecting rows by label (loc)
print("Select row with index label 1 using loc:")
print(employees.loc[1], end="\n\n")

# Filtering data with conditions: a boolean Series keeps the rows where it is True
print("Employees with Salary > 80000:")
high_salary = employees.loc[employees['Salary'].gt(80000)]
print(high_salary, end="\n\n")

print("Employees in IT Department:")
it_employees = employees.loc[employees['Department'].eq('IT')]
print(it_employees, end="\n\n")

# Multiple conditions are combined element-wise with &
print("Employees in Sales with Salary > 75000:")
filtered = employees.loc[
    employees['Department'].eq('Sales') & employees['Salary'].gt(75000)
]
print(filtered)


In [None]:
# Handling Missing Data
# Each column of the sample below contains exactly one missing entry (None).

# Create DataFrame with missing values
data_with_nan = {
    'Name': ['John', 'Emma', 'Michael', None, 'William'],
    'Age': [28, None, 35, 29, 31],
    'Salary': [75000, 82000, None, 68000, 88000],
    'City': ['New York', 'LA', 'Chicago', 'Houston', None],
}
df_nan = pd.DataFrame(data=data_with_nan)
print("DataFrame with missing values:", df_nan, sep="\n", end="\n\n")

# Check for missing values: boolean mask first, then per-column totals
print("Check for missing values (isnull):", df_nan.isnull(), sep="\n", end="\n\n")
print("Count of missing values per column:", df_nan.isnull().sum(), sep="\n", end="\n\n")

# Fill missing values with one replacement value per column
print("Fill missing values with specific value:")
df_filled = df_nan.fillna(
    value={'Name': 'Unknown', 'Age': 0, 'Salary': 0, 'City': 'Unknown'}
)
print(df_filled, end="\n\n")

# Fill the Salary gap with the column mean (mean() skips missing values)
print("Original Salary column:", df_nan['Salary'], sep="\n")
print("\nFill missing Salary with mean:")
df_mean = df_nan.assign(Salary=df_nan['Salary'].fillna(df_nan['Salary'].mean()))
print(df_mean['Salary'], end="\n\n")

# Drop rows with missing values in any column
print("Drop rows with ANY missing values:")
df_dropped = df_nan.dropna(axis=0, how='any')
print(df_dropped, end="\n\n")

# Drop rows only when the 'Age' column is missing
print("Drop rows where 'Age' is missing:")
df_dropped_age = df_nan.dropna(axis=0, subset=['Age'])
print(df_dropped_age)


In [None]:
# Adding and Modifying Data
# The frame is rebuilt at the top of the cell, so every step starts from the
# same three-row table each time the cell runs.

# Create sample DataFrame
df = pd.DataFrame(data={
    'Name': ['John', 'Emma', 'Michael'],
    'Age': [28, 24, 35],
    'Salary': [75000, 82000, 95000],
})
print("Original DataFrame:", df, sep="\n", end="\n\n")

# Add new column from a list with one value per existing row
df['Department'] = ['Sales', 'IT', 'HR']
print("Added 'Department' column:", df, sep="\n", end="\n\n")

# Add calculated column derived from Salary
df['Salary_in_K'] = df['Salary'].div(1000)
print("Added calculated column 'Salary_in_K':", df, sep="\n", end="\n\n")

# Modify existing column
df['Age'] = df['Age'].add(1)
print("Increased Age by 1:", df, sep="\n", end="\n\n")

# Add new row using loc: assigning to a label that does not exist yet appends it
df.loc[3] = ['Sophia', 30, 88000, 'Sales', 88]
print("Added new row:", df, sep="\n", end="\n\n")

# Add new row using concat: wrap the record in a one-row frame and stack it
new_row = pd.DataFrame([{
    'Name': 'William',
    'Age': 31,
    'Salary': 78000,
    'Department': 'IT',
    'Salary_in_K': 78,
}])
df = pd.concat([df, new_row], axis=0, ignore_index=True)
print("Added row using concat:", df, sep="\n", end="\n\n")

# Delete column (returns a new frame; df itself keeps the column)
df_dropped = df.drop(columns='Salary_in_K')
print("Dropped 'Salary_in_K' column:", df_dropped, sep="\n", end="\n\n")

# Delete row by index label (again a new frame)
df_dropped_row = df.drop(index=2)
print("Dropped row at index 2:", df_dropped_row, sep="\n")


In [None]:
# Sorting and Ranking
# Sorting returns reordered copies; only the ranking steps at the end add
# columns to `products` itself.

# Create sample DataFrame
products = pd.DataFrame(data={
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones'],
    'Price': [1200, 25, 75, 350, 150],
    'Stock': [15, 100, 50, 25, 40],
    'Rating': [4.5, 4.0, 4.2, 4.7, 4.3],
})
print("Original DataFrame:", products, sep="\n", end="\n\n")

# Sort by single column
print("Sort by Price (ascending):")
print(products.sort_values(by='Price', ascending=True), end="\n\n")

print("Sort by Price (descending):")
print(products.sort_values(by='Price', ascending=False), end="\n\n")

# Sort by multiple columns: one ascending flag per sort key
print("Sort by Rating (desc) then Price (asc):")
print(products.sort_values(by=['Rating', 'Price'], ascending=[False, True]), end="\n\n")

# Sort by index: use the product name as the index, then order by it
products_reindexed = products.set_index(keys='Product')
print("DataFrame with Product as index:", products_reindexed, sep="\n")
print("\nSorted by index:")
print(products_reindexed.sort_index(axis=0), end="\n\n")

# Ranking: rank 1 goes to the smallest value unless ascending=False
print("Rank products by Price:")
products['Price_Rank'] = products['Price'].rank(ascending=True)
print(products, end="\n\n")

print("Rank by Rating (descending - best first):")
products['Rating_Rank'] = products['Rating'].rank(ascending=False)
print(products.loc[:, ['Product', 'Rating', 'Rating_Rank']])


In [None]:
# Basic Statistical Operations
# Prints column-wise summaries of a small per-store sales table, then derives
# two per-unit revenue metrics.

# Create sample sales data
sales_data = pd.DataFrame(data={
    'Store': ['Store A', 'Store B', 'Store C', 'Store D', 'Store E'],
    'Revenue': [125000, 98000, 156000, 87000, 142000],
    'Customers': [450, 380, 520, 310, 490],
    'Transactions': [680, 550, 720, 480, 650],
})
print("Sales Data:", sales_data, "\n" + "="*60, sep="\n")

# Basic statistics: the same reduction call is repeated for each statistic, so
# the heading and the DataFrame method name are paired up and looped over.
# numeric_only=True leaves the text 'Store' column out of every reduction.
for _heading, _reduction in (
    ("\nSum of all numeric columns:", 'sum'),
    ("Mean (average):", 'mean'),
    ("Median:", 'median'),
    ("Standard deviation:", 'std'),
    ("Minimum values:", 'min'),
    ("Maximum values:", 'max'),
):
    print(_heading)
    print(getattr(sales_data, _reduction)(numeric_only=True), end="\n\n")

# Describe - comprehensive statistics
print("Comprehensive statistics (describe):", sales_data.describe(), sep="\n", end="\n\n")

# Correlation between the three numeric columns
print("Correlation between columns:")
print(sales_data.loc[:, ['Revenue', 'Customers', 'Transactions']].corr(), end="\n\n")

# Custom calculations: element-wise ratios stored as new columns
sales_data['Revenue_per_Customer'] = sales_data['Revenue'].div(sales_data['Customers'])
sales_data['Avg_Transaction_Value'] = sales_data['Revenue'].div(sales_data['Transactions'])

print("DataFrame with calculated metrics:", sales_data, sep="\n")
