In [None]:
# This is my deep dive into Python's Pandas library.
'''
Overview:
- Used for analyzing, filtering, and manipulating data
- Works best with tabular (table) datasets
'''

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Source CSV for the diabetes dataset and the column labels to apply to it
url = "https://raw.githubusercontent.com/MedlyticsUniversal/Data/main/Week1/diabetes.csv"
col_names = ['preg','plas','pres','skin','test','mass','pedi','age','class']

# Read the file into a DataFrame. Passing names= supplies the column labels,
# so pandas treats the first line of the file as data rather than as a header.
data = pd.read_csv(url, names=col_names)

# Note: pandas has two core data structures
# 1. DataFrame: a 2D labeled data structure (e.g. a spreadsheet)
# 2. Series: a 1D labeled array (e.g. a single column)

In [None]:
type(data)  # confirm that read_csv returned a DataFrame object

In [None]:
data.head(10)  # preview the first 10 rows

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
data.shape

(768, 9)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): several columns (e.g. plas, pres, mass) report a minimum of 0 in the
# output below -- possibly a placeholder for missing values; confirm before analysis.
data.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Example DataFrame

# Column-oriented dictionary: each key becomes a column label and each list
# holds that column's values. It is named `data_dict` rather than `data` so it
# does not overwrite the diabetes DataFrame that an earlier cell bound to `data`.
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Country': ['USA', 'Canada', 'UK']
}

# Build the DataFrame; rows get the default integer index 0..2
df = pd.DataFrame(data_dict)
print(df)

'''
Output:

     Name  Age Country
0   Alice   25     USA
1     Bob   30  Canada
2 Charlie   35      UK
'''

In [None]:
# Different ways to load in a DataFrame

# File-based loaders. 'data.csv' and 'data.xlsx' are placeholder paths, so each
# result gets its own name (instead of overwriting `df`) and a missing file is
# reported rather than stopping a Restart-and-Run-All.
# read_excel additionally needs an Excel engine (e.g. openpyxl) to be installed.
try:
    df_from_csv = pd.read_csv('data.csv')          # Load from CSV
    df_from_excel = pd.read_excel('data.xlsx')     # Load from Excel
except FileNotFoundError as missing_file:
    print(f"Skipping file-based loading examples: {missing_file}")

# Load from dictionary. The dictionary is defined in this cell so the example
# runs on a fresh kernel; `df` ends up as this frame, which the cells below use.
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Country': ['USA', 'Canada', 'UK']
}
df = pd.DataFrame(data_dict)

In [None]:
# Exploring a DataFrame
# Only the last bare expression in a cell is rendered as its output (here
# df.index); df.info() still shows up because it prints directly.

df.head()        # First 5 rows
df.tail()        # Last 5 rows
df.info()        # Prints column dtypes and non-null counts
df.describe()    # Summary statistics for the numeric columns
df.columns       # Column labels
df.index         # Row index labels

In [None]:
# Accessing and Filtering data

df['Age']                  # Single column (returns a Series)
df[['Name', 'Age']]        # Multiple columns (returns a DataFrame)
df.iloc[0]                 # First row by integer position
df.loc[0]                  # Row whose index label is 0 (the first row with the default index)

In [None]:
# Conditional Filtering (masking)
# Each comparison yields a boolean Series; indexing with it keeps the True rows.

df[df['Age'] > 30]                          # Rows with Age over 30
df[df['Country'] == 'USA']                 # Rows whose Country equals 'USA'
# Parentheses are required: & binds tighter than the comparison operators
df[(df['Age'] > 25) & (df['Country'] == 'UK')]  # Both conditions must hold

In [None]:
# CRUD Operations

# Create (Insert Rows)
# Wrap the new record in a one-row DataFrame and append it; ignore_index=True
# renumbers the result 0..n-1 so the new row gets the next integer label.
new_row = pd.DataFrame([{'Name': 'Diana', 'Age': 28, 'Country': 'Germany'}])
df = pd.concat([df, new_row], ignore_index=True)

# Read (Access Data)
df.loc[df['Name'] == 'Alice']                       # Rows where Name is Alice
df.iloc[2]                                          # Third row by position

# Update (Modify Values)
df.loc[df['Name'] == 'Bob', 'Age'] = 31             # Update Bob's age
df['Country'] = df['Country'].str.upper()           # Convert all country names to uppercase

# Delete (Drop Data)
# Each result is bound to its own name. Reassigning `df` here would permanently
# remove rows and the 'Age' column that the cleaning and analysis cells further
# down still rely on.
df_without_row_2 = df.drop(index=2)                 # Drop row with index label 2
df_without_alice = df[df['Name'] != 'Alice']        # Drop rows where name == Alice
df_without_age = df.drop(columns='Age')             # Drop 'Age' column

In [None]:
# Cleaning data

df.isnull().sum()                          # Count missing values per column

# One chained, non-inplace pipeline: every step returns a new frame and `df`
# is rebound once at the end, so a failure part-way leaves `df` untouched.
df = (
    df
    .dropna()                              # Drop rows with missing values
    # NOTE: after dropna() no missing values remain, so this fill is a no-op
    # here; in real use pick one strategy (or fill before dropping).
    .fillna({'Age': 0})                    # Fill missing Age values with 0
    .rename(columns={'Age': 'Years'})      # Rename column
    .astype({'Years': float})              # Convert column to float
)

In [None]:
# Analyzing Data
# The cleaning cell above renames 'Age' to 'Years', so the column is referenced
# by its new name here. Only the last expression is rendered as the cell output.

df['Country'].value_counts()                       # Count occurrences of each country
df.groupby('Country')['Years'].mean()              # Average years per country
df.sort_values(by='Years', ascending=False)        # Sort by Years descending

In [None]:
# Saving data
# index=False leaves the row index out of the written file.

df.to_csv('cleaned_data.csv', index=False)      # Write to CSV
df.to_excel('output.xlsx', index=False)         # Write to Excel (requires an Excel engine such as openpyxl)