# Introduction to Pandas for Beginners

---

Pandas is a powerful library for data manipulation and analysis in Python.

Pandas provides data structures like Series and DataFrame to handle structured data efficiently.

### Importing Pandas

In [9]:
import pandas as pd

In [10]:
import numpy as np

## Chapter 6: Data Transformation

In [11]:
# Small demo frame, written one person per row: (Name, Age, City code).
df = pd.DataFrame(
    [
        ('Alice', 25, 'NY'),
        ('Bob', 30, 'LA'),
        ('Charlie', 35, 'NY'),
        ('David', 30, 'LA'),
        ('Eve', 45, 'SF'),
    ],
    columns=['Name', 'Age', 'City'],
)
df

Unnamed: 0,Name,Age,City
0,Alice,25,NY
1,Bob,30,LA
2,Charlie,35,NY
3,David,30,LA
4,Eve,45,SF


In [12]:
# Expand the two-letter city codes into full city names.
# `replace` leaves any value that is not a key of the mapping untouched.
city_full_names = {'NY': 'New York', 'LA': 'Los Angeles', 'SF': 'San Francisco'}
df['City'] = df['City'].replace(to_replace=city_full_names)

df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,New York
3,David,30,Los Angeles
4,Eve,45,San Francisco


In [13]:
# Sample DataFrame with incorrect data
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, -35, 40, -45],  # Age -35 is incorrect
    'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles', 'San Francisco']
})
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,-35,New York
3,David,40,Los Angeles
4,Eve,-45,San Francisco


In [14]:
# Repair the negative ages by taking the absolute value of the whole column.
# The vectorised `Series.abs()` gives the same result as applying
# `abs(x) if x < 0 else x` to each element, without a Python-level call per row.
df['Age'] = df['Age'].abs()
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,New York
3,David,40,Los Angeles
4,Eve,45,San Francisco


In [15]:
# Load the practice breast-cancer CSV (path is relative to the notebook's
# working directory) and list the column labels it contains.
csv_path = './data/breast_cancer_practice.csv'
df = pd.read_csv(csv_path)
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [16]:
# Keep only the diagnosis, the id and the first seven "*_mean" measurements,
# with 'diagnosis' moved to the front.
# `.copy()` makes the subset an independent frame: later cells assign to its
# columns, and writing to a bare selection of another frame is what triggers
# pandas' SettingWithCopyWarning.
selected_columns = [
    'diagnosis', 'id', 'radius_mean', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
]
df = df[selected_columns].copy()

df

Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean
0,M,842302,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010
1,M,842517,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690
2,M,84300903,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740
3,M,84348301,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140
4,M,84358402,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800
...,...,...,...,...,...,...,...,...,...
566,M,926424,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390
567,M,926682,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400
568,M,926954,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251
569,M,927241,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140


In [17]:
# Give the 'compactness_mean' column the shorter label 'Compactness'.
# `rename` returns a new frame, so the result is bound back to `df`.
df = df.rename({'compactness_mean': 'Compactness'}, axis='columns')
df

Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,Compactness,concavity_mean
0,M,842302,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010
1,M,842517,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690
2,M,84300903,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740
3,M,84348301,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140
4,M,84358402,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800
...,...,...,...,...,...,...,...,...,...
566,M,926424,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390
567,M,926682,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400
568,M,926954,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251
569,M,927241,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140


In [18]:
# Round every mean radius up to the next whole number, then store the
# column as integers instead of floats.
df['radius_mean'] = df['radius_mean'].pipe(np.ceil).astype(int)
df

Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,Compactness,concavity_mean
0,M,842302,18,10.38,122.80,1001.0,0.11840,0.27760,0.30010
1,M,842517,21,17.77,132.90,1326.0,0.08474,0.07864,0.08690
2,M,84300903,20,21.25,130.00,1203.0,0.10960,0.15990,0.19740
3,M,84348301,12,20.38,77.58,386.1,0.14250,0.28390,0.24140
4,M,84358402,21,14.34,135.10,1297.0,0.10030,0.13280,0.19800
...,...,...,...,...,...,...,...,...,...
566,M,926424,22,22.39,142.00,1479.0,0.11100,0.11590,0.24390
567,M,926682,21,28.25,131.20,1261.0,0.09780,0.10340,0.14400
568,M,926954,17,28.08,108.30,858.1,0.08455,0.10230,0.09251
569,M,927241,21,29.33,140.10,1265.0,0.11780,0.27700,0.35140


In [19]:
# Element-wise arithmetic on a column: add 1 to every 'radius_mean' value.
# The column is read and overwritten, so each run of this cell adds another 1.
df['radius_mean'] = df['radius_mean'].add(1)
df

Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,Compactness,concavity_mean
0,M,842302,19,10.38,122.80,1001.0,0.11840,0.27760,0.30010
1,M,842517,22,17.77,132.90,1326.0,0.08474,0.07864,0.08690
2,M,84300903,21,21.25,130.00,1203.0,0.10960,0.15990,0.19740
3,M,84348301,13,20.38,77.58,386.1,0.14250,0.28390,0.24140
4,M,84358402,22,14.34,135.10,1297.0,0.10030,0.13280,0.19800
...,...,...,...,...,...,...,...,...,...
566,M,926424,23,22.39,142.00,1479.0,0.11100,0.11590,0.24390
567,M,926682,22,28.25,131.20,1261.0,0.09780,0.10340,0.14400
568,M,926954,18,28.08,108.30,858.1,0.08455,0.10230,0.09251
569,M,927241,22,29.33,140.10,1265.0,0.11780,0.27700,0.35140


In [20]:
# Running total: each row of 'Cumulative_sum' holds the sum of 'Age' for
# that row and all rows above it.
data = pd.DataFrame(
    {'Name': ['John', 'Anna', 'Peter'], 'Age': [28, 24, 35]}
).assign(Cumulative_sum=lambda frame: frame['Age'].cumsum())
data

Unnamed: 0,Name,Age,Cumulative_sum
0,John,28,28
1,Anna,24,52
2,Peter,35,87


In [21]:
# Add a readable label next to the single-letter diagnosis code.
# An explicit mapping is used instead of an "M, else Benign" rule so that a
# missing or unexpected code becomes NaN in 'diagnosis_full' rather than being
# silently labelled "Benign".
diagnosis_labels = {
    'M': 'Malignant',
    'B': 'Benign',
}
df['diagnosis_full'] = df['diagnosis'].map(diagnosis_labels)
df

Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,Compactness,concavity_mean,diagnosis_full
0,M,842302,19,10.38,122.80,1001.0,0.11840,0.27760,0.30010,Malignant
1,M,842517,22,17.77,132.90,1326.0,0.08474,0.07864,0.08690,Malignant
2,M,84300903,21,21.25,130.00,1203.0,0.10960,0.15990,0.19740,Malignant
3,M,84348301,13,20.38,77.58,386.1,0.14250,0.28390,0.24140,Malignant
4,M,84358402,22,14.34,135.10,1297.0,0.10030,0.13280,0.19800,Malignant
...,...,...,...,...,...,...,...,...,...,...
566,M,926424,23,22.39,142.00,1479.0,0.11100,0.11590,0.24390,Malignant
567,M,926682,22,28.25,131.20,1261.0,0.09780,0.10340,0.14400,Malignant
568,M,926954,18,28.08,108.30,858.1,0.08455,0.10230,0.09251,Malignant
569,M,927241,22,29.33,140.10,1265.0,0.11780,0.27700,0.35140,Malignant


## Chapter 7: Grouping and Aggregation

## Chapter 8: Merging and Joining

In [22]:
# Merging data
# Two frames describe the same IDs: one holds names, the other holds scores.
data1 = {'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']}
data2 = {'ID': [1, 2, 3], 'Score': [85, 90, 95]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Join the frames on the shared 'ID' key so each row carries both Name and Score.
merged_df = pd.merge(df1, df2, on='ID', how='inner')
merged_df

In [23]:
# Concatenating data
# Two frames with the same columns but different people.
data1 = {'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']}
data2 = {'ID': [4, 5, 6], 'Name': ['John', 'Ellie', 'Will']}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Stack the rows of df2 underneath df1; ignore_index=True renumbers the rows
# 0..5 instead of repeating each frame's own 0..2 labels.
combined_df = pd.concat([df1, df2], ignore_index=True)
combined_df

## Chapter 9: Save data