# Pandas Tutorial
## Introduction to Pandas DataFrames and Data Analysis
Following the MOOC.fi "Data Analysis with Python" course

## 1. Importing Pandas

In [None]:
import pandas as pd
import numpy as np
# Show which pandas release this notebook is running against
print("Pandas version:", pd.__version__)

## 2. Creating Series

In [None]:
# Build a Series from a plain list (no index is given, so pandas supplies one)
values = [10, 20, 30, 40, 50]
s = pd.Series(values)
print("Series:", s, sep="\n")

# Build the same data again, this time labelling each entry explicitly
labels = ['a', 'b', 'c', 'd', 'e']
s_custom = pd.Series(values, index=labels)
print("\nSeries with custom index:", s_custom, sep="\n")

## 3. Creating DataFrames

In [None]:
# One list per column; all four lists have the same length (one entry per person)
names = ['Alice', 'Bob', 'Charlie', 'David']
ages = [25, 30, 35, 40]
cities = ['New York', 'Paris', 'London', 'Tokyo']
salaries = [50000, 60000, 70000, 80000]

# The dict keys become the column labels, in insertion order
data = {'Name': names, 'Age': ages, 'City': cities, 'Salary': salaries}
df = pd.DataFrame(data)
print("DataFrame:", df, sep="\n")

## 4. DataFrame Information

In [None]:
# Dimensions and column labels each fit on a single line
print("Shape:", df.shape)
print("\nColumns:", df.columns.to_list())

# The remaining summaries are multi-line objects, each shown under its own heading
for heading, view in (
    ("\nData types:", df.dtypes),
    ("\nFirst 3 rows:", df.head(3)),
    ("\nLast 2 rows:", df.tail(2)),
):
    print(heading)
    print(view)

# info() writes its report itself, so it is called directly rather than printed
print("\nInfo:")
df.info()

## 5. Selecting Data

In [None]:
# A single column label yields a Series
print("Age column:", df['Age'], sep="\n")

# A list of column labels yields a DataFrame with just those columns
wanted_columns = ['Name', 'City']
print("\nName and City columns:")
print(df[wanted_columns])

# iloc selects by integer position, so 0 is the first row
print("\nFirst row:")
print(df.iloc[0])

# A boolean mask keeps only the rows where the condition holds
older_than_30 = df['Age'] > 30
print("\nRows where Age > 30:")
print(df[older_than_30])

## 6. Reading CSV Files

In [None]:
# Load the sales dataset; the path is relative to the notebook's working directory
sales_csv_path = '../datasets/sales_data.csv'
sales_df = pd.read_csv(sales_csv_path)

# Preview the first rows, then show the column/dtype overview
print("Sales Data:")
print(sales_df.head())
print("\nDataFrame Info:")
sales_df.info()

## 7. Data Analysis Operations

In [None]:
# Descriptive statistics for the sales data
print("Statistical Summary:")
summary_stats = sales_df.describe()
print(summary_stats)

# Sum of the Sales column within each Category
print("\nTotal Sales by Category:")
total_by_category = sales_df.groupby('Category')['Sales'].sum()
print(total_by_category)

# Mean of the Sales column within each Region
print("\nAverage Sales by Region:")
mean_by_region = sales_df.groupby('Region')['Sales'].mean()
print(mean_by_region)

## 8. Sorting and Ranking

In [None]:
# Order rows from highest to lowest Sales and keep the first five
print("Top 5 Sales:")
top_sales = sales_df.sort_values(by='Sales', ascending=False).head(5)
print(top_sales)

# Rank 1 goes to the largest Sales value; assign() returns a new frame,
# so sales_df itself is left without a Rank column
sales_rank = sales_df['Sales'].rank(ascending=False)
sales_df_ranked = sales_df.assign(Rank=sales_rank)
print("\nData with Rankings:")
print(sales_df_ranked.sort_values(by='Rank').head(5))

## 9. Handling Missing Data

In [None]:
# Small frame with gaps: column A has one NaN, B has two, C is complete
df_missing = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 4],
        'B': [5, np.nan, np.nan, 8],
        'C': [9, 10, 11, 12],
    }
)
print("DataFrame with missing values:", df_missing, sep="\n")

# Each view below is a new object; df_missing itself is never modified
for heading, view in (
    ("\nMissing values:", df_missing.isnull()),      # True wherever a value is NaN
    ("\nFill with 0:", df_missing.fillna(0)),        # NaN replaced by 0
    ("\nDrop rows with NaN:", df_missing.dropna()),  # rows containing any NaN removed
):
    print(heading)
    print(view)

## Practice Exercise
Try to solve the following tasks using the `student_scores.csv` dataset:
1. Load the student scores dataset
2. Calculate the average score for each student across all subjects
3. Find the student with the highest average score
4. Calculate the class average for each subject
5. Filter students who scored above 85 in Math

In [None]:
# Your solution here
