In [1]:
import pandas as pd

# Column-oriented demo data: every key becomes a column of the frame.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
)
df = pd.DataFrame(data=data)

In [2]:
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [3]:
# Add a new column by assigning one value per existing row.
cities = ['New York', 'Miami', 'Chicago']
df['City'] = cities
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Miami
2,Charlie,35,Chicago


In [4]:
# Derive an age bracket per row with a vectorized comparison rather than a
# Python-level loop: ages below 30 are 'Young', everything else is 'Mature'.
df['Age Group'] = df['Age'].lt(30).map({True: 'Young', False: 'Mature'})

In [5]:
df

Unnamed: 0,Name,Age,City,Age Group
0,Alice,25,New York,Young
1,Bob,30,Miami,Mature
2,Charlie,35,Chicago,Mature


In [6]:
# Remove the derived column again; drop() returns a new frame, so rebind df.
df = df.drop(columns='Age Group')

In [7]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Miami
2,Charlie,35,Chicago


In [8]:
# Remove the row whose index label is 1 (a label, not a position).
df = df.drop(index=1)

In [9]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [10]:
# Take an independent (deep) copy so later changes to df do not affect it.
df_copy = df.copy(deep=True)

In [12]:
df_copy

Unnamed: 0,Name,Age,City
0,Alice,25,New York
2,Charlie,35,Chicago


In [13]:
# Add several columns at once; assign() returns a new frame with them appended.
new_columns = {
    'Discount': [5, 10],
    'Total_Spend': [100, 200],
}
df = df.assign(**new_columns)

In [14]:
df

Unnamed: 0,Name,Age,City,Discount,Total_Spend
0,Alice,25,New York,5,100
2,Charlie,35,Chicago,10,200


## Accessing Data: Using df.iloc[] and df.loc[]

In [15]:
import pandas as pd

# Five-row demo frame used by the .loc / .iloc examples in this section.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Seattle'],
)
df = pd.DataFrame(data=data)

In [16]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [18]:
# Select the single row labelled 2 (all columns); the result is a Series.
row = df.loc[2, :]
row

Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

In [19]:
# Look up one scalar by row label and column name.
row_label, column_name = 3, 'City'
cell = df.loc[row_label, column_name]
cell

'Houston'

In [20]:
# Label-based slice: with .loc both endpoints (1 and 3) are included.
subset_rows = df.loc[1:3, :]
subset_rows

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [21]:
# Slice rows by label and keep only the listed columns.
wanted_columns = ['Name', 'City']
subset = df.loc[1:4, wanted_columns]
subset

Unnamed: 0,Name,City
1,Bob,Los Angeles
2,Charlie,Chicago
3,David,Houston
4,Eva,Seattle


In [23]:
# Boolean mask of rows with Age above 30, then take the first column
# (position 0) of the matching rows.
ages = df['Age']
mask = ages > 30
matching_rows = df.loc[mask]
filtered = matching_rows.iloc[:, 0]
filtered

2    Charlie
3      David
4        Eva
Name: Name, dtype: object

In [24]:
# Position-based slice: unlike .loc, the end position (4) is excluded.
df.iloc[1:4, :]

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


## Sampling and Previewing Data: Using df.sample() and df.head()

In [26]:
import pandas as pd
# Load the model-log CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
# NOTE(review): assumes the file sits next to the notebook — confirm.
df = pd.read_csv('model_logs_100.csv')

In [27]:
# Preview only the first three rows instead of the whole frame.
df.head(n=3)

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
0,2023-01-01,Explain quantum computing in simple terms,70,285
1,2023-01-02,Write a sci-fi short story set in 2050,93,401
2,2023-01-03,Generate marketing copy for a new tech product,78,377


In [29]:
# Draw three rows at random. No random_state is given, so each run
# returns a different selection.
random_rows = df.sample(3)
random_rows

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
34,2023-02-04,Generate marketing copy for a new tech product,113,251
76,2023-03-18,Generate a creative story about space travel,55,486
62,2023-03-04,Create a recipe using plant-based ingredients,70,263


In [33]:
# Draw five rows with a fixed seed so the same rows come back on every run.
random_sample = df.sample(5, random_state=67)
random_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
17,2023-01-18,Outline the ethical considerations of AI,61,227
24,2023-01-25,Outline the ethical considerations of AI,68,339
64,2023-03-06,Write a poem about the future of technology,68,447
93,2023-04-04,Outline the ethical considerations of AI,43,405
97,2023-04-08,Explain quantum computing in simple terms,52,462


In [35]:
# Sample a fraction of the rows (20%) rather than a fixed count.
# NOTE(review): the saved output lists 10 rows, but 20% of a ~100-row file
# would be about 20 — the output may be stale; re-run to confirm.
sample_fraction = 0.2
fraction_sample = df.sample(frac=sample_fraction)
fraction_sample

Unnamed: 0,Date,Prompt,Response Time (ms),Tokens Generated
33,2023-02-03,Create a recipe using plant-based ingredients,67,256
49,2023-02-19,Generate marketing copy for a new tech product,114,205
4,2023-01-05,Create a recipe using plant-based ingredients,84,316
22,2023-01-23,Create a recipe using plant-based ingredients,35,325
55,2023-02-25,Describe the impact of AI on healthcare,63,474
20,2023-01-21,Write a sci-fi short story set in 2050,36,439
48,2023-02-18,Create a recipe using plant-based ingredients,98,263
69,2023-03-11,Create a recipe using plant-based ingredients,111,257
59,2023-03-01,Create a recipe using plant-based ingredients,86,494
44,2023-02-14,Write a sci-fi short story set in 2050,87,244


## Filtering Data: Masks and pandas.Series.between()

In [36]:
import pandas as pd

# Five-row demo frame used by the boolean-mask filtering examples.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Seattle'],
)
df = pd.DataFrame(data=data)

In [37]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [38]:
# Element-wise comparison yields a boolean Series aligned with df's index.
ages = df['Age']
mask = ages > 30

In [39]:
mask

0    False
1    False
2     True
3     True
4     True
Name: Age, dtype: bool

In [40]:
# Keep only the rows where the mask is True.
filtered_df = df.loc[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [41]:
# Combine two conditions with & (element-wise AND); each condition is built
# separately so the operator precedence is unambiguous.
older_than_30 = df['Age'] > 30
outside_houston = df['City'] != 'Houston'
mask = older_than_30 & outside_houston
filtered_df = df.loc[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
4,Eva,45,Seattle


In [42]:
mask

0    False
1    False
2     True
3    False
4     True
dtype: bool

### pandas.Series.between()

In [43]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [44]:
# between() includes both bounds, so ages 30 and 40 are kept.
age_in_range = df['Age'].between(30, 40)
filtered_df = df[age_in_range]
filtered_df

Unnamed: 0,Name,Age,City
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [45]:
# Membership test: True where City is one of the listed values.
cities_of_interest = ['Chicago', 'Houston']
mask = df['City'].isin(cities_of_interest)
filtered_df = df.loc[mask]
filtered_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston


## Sorting Data

In [46]:
import pandas as pd

# Five-row demo frame used by the sorting examples.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Seattle'],
)
df = pd.DataFrame(data=data)

In [47]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [49]:
# Oldest first; sort_values returns a new frame and leaves df untouched.
sorted_df = df.sort_values('Age', ascending=False)
sorted_df

Unnamed: 0,Name,Age,City
4,Eva,45,Seattle
3,David,40,Houston
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [50]:
# Sort by City first, then by Age within each city.
sort_keys = ['City', 'Age']
sorted_df = df.sort_values(by=sort_keys)
sorted_df

Unnamed: 0,Name,Age,City
2,Charlie,35,Chicago
3,David,40,Houston
1,Bob,30,Los Angeles
0,Alice,25,New York
4,Eva,45,Seattle


In [51]:
# Order rows by their index labels (ascending).
sorted_df = df.sort_index(ascending=True)
sorted_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [52]:
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Seattle


In [53]:
# inplace=True reorders df itself instead of returning a sorted copy, so the
# change is visible in every later cell that displays df.
df.sort_values('Age', ascending=False, inplace=True)
df

Unnamed: 0,Name,Age,City
4,Eva,45,Seattle
3,David,40,Houston
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [54]:
df

Unnamed: 0,Name,Age,City
4,Eva,45,Seattle
3,David,40,Houston
2,Charlie,35,Chicago
1,Bob,30,Los Angeles
0,Alice,25,New York


In [55]:
# Player scores used to demonstrate ranking after a sort.
data = dict(
    Player=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Score=[150, 200, 125, 300, 175],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Player,Score
0,Alice,150
1,Bob,200
2,Charlie,125
3,Diana,300
4,Eve,175


In [56]:
# Sort by score (highest first) with a fresh 0..n-1 index, then turn each
# row's position into a 1-based rank.
sorted_df = (
    df.sort_values('Score', ascending=False, ignore_index=True)
      .assign(Rank=lambda ranked: ranked.index + 1)
)
sorted_df

Unnamed: 0,Player,Score,Rank
0,Diana,300,1
1,Bob,200,2
2,Eve,175,3
3,Alice,150,4
4,Charlie,125,5


## Handling Missing Data

In [73]:
import pandas as pd

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, None, 35, 40],
    'City': ['New York', 'Los Angeles', None, 'Houston']
}
df = pd.DataFrame(data)

In [58]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [59]:
# Boolean frame of the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [60]:
# isna() performs the same missing-value check as isnull(); the two outputs
# match cell for cell.
df.isna()

Unnamed: 0,Name,Age,City
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [61]:
# Drop every row that contains at least one missing value; df itself is
# left unchanged because the result is bound to a new name.
df_cleaned = df.dropna(axis=0)
df_cleaned

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
3,Diana,40.0,Houston


In [62]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [63]:
# Drop every column that contains at least one missing value.
df_cleaned = df.dropna(axis='columns')
df_cleaned

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,Diana


In [72]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [74]:
# Replace missing ages with the mean of the ages that are present.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

In [75]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,33.333333,Los Angeles
2,Charlie,35.0,
3,Diana,40.0,Houston


In [76]:
# Replace missing cities with a fixed placeholder label.
df['City'] = df['City'].fillna(value='Unknown')

In [77]:
df

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,33.333333,Los Angeles
2,Charlie,35.0,Unknown
3,Diana,40.0,Houston


In [78]:
# Print per-column dtypes and non-null counts to confirm that no missing
# values remain after the fills.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     4 non-null      float64
 2   City    4 non-null      object 
dtypes: float64(1), object(2)
memory usage: 228.0+ bytes


### Aggregations and Grouping Data

In [79]:
import pandas as pd
# Sales records used by the groupby / aggregation examples.
data = dict(
    Category=['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Apparel'],
    Product=['Laptop', 'Mouse', 'Chair', 'Table', 'Shoes'],
    Sales=[1200, 100, 400, 300, 50],
)
df = pd.DataFrame(data=data)

In [80]:
df

Unnamed: 0,Category,Product,Sales
0,Electronics,Laptop,1200
1,Electronics,Mouse,100
2,Furniture,Chair,400
3,Furniture,Table,300
4,Apparel,Shoes,50


In [86]:
# Total sales per category; as_index=False keeps Category as a regular
# column instead of moving it into the index.
grouped = df.groupby('Category', as_index=False)['Sales'].sum()
grouped

Unnamed: 0,Category,Sales
0,Apparel,50
1,Electronics,1300
2,Furniture,700


In [85]:
# Total sales per (Category, Product) pair; the result is a Series with a
# two-level index.
group_keys = ['Category', 'Product']
grouped = df.groupby(group_keys)['Sales'].sum()
grouped

Category     Product
Apparel      Shoes        50
Electronics  Laptop     1200
             Mouse       100
Furniture    Chair       400
             Table       300
Name: Sales, dtype: int64

In [83]:
# Several aggregates per category in one pass: one output column per function.
sales_by_category = df.groupby('Category')['Sales']
summary = sales_by_category.agg(['sum', 'mean', 'count'])
summary

Unnamed: 0_level_0,sum,mean,count
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apparel,50,50.0,1
Electronics,1300,650.0,2
Furniture,700,350.0,2
