In [4]:
# 🌟 Exercise 1: Getting & Knowing Your Data (chipo)

# 1. Import necessary libraries
import pandas as pd
import numpy as np

# 2. Retrieve the dataset and assign it to a variable 'chipo'
# Adding error handling and specifying potential fixes like on_bad_lines and encoding
try:
    chipo = pd.read_csv('data.csv', on_bad_lines='skip', encoding='utf-8')
except Exception as e:
    print(f"Error loading the CSV file: {e}")

# 3. Display the first 10 rows
print("First 10 rows of the 'chipo' dataset:")
print(chipo.head(10))

# 4. Determine the total number of entries (rows)
print("\nTotal number of entries (rows) in 'chipo':")
print(chipo.shape[0])

# 5. Find the total number of columns in 'chipo'
print("\nTotal number of columns in 'chipo':")
print(chipo.shape[1])

# 6. Print all column names
print("\nColumn names in 'chipo':")
print(chipo.columns)

# 7. Understand how the DataFrame is indexed
print("\nIndex of 'chipo':")
print(chipo.index)

# Check the column names
print("\nColumn names in 'chipo':")
print(chipo.columns)

# 8. Check if 'item_name' column exists
if 'item_name' in chipo.columns:
    # Find the most ordered item in 'item_name'
    print("\nMost ordered item:")
    print(chipo['item_name'].value_counts().head(1))
else:
    print("\n'item_name' column not found in the dataset.")

# 9. Find the total number of items ordered
# Check if 'quantity' column exists
if 'quantity' in chipo.columns:
    print("\nTotal number of items ordered:")
    print(chipo['quantity'].sum())
else:
    print("\n'quantity' column not found in the dataset.")

# 10. Find the most ordered item from 'choice_description'
# Check if 'choice_description' column exists
if 'choice_description' in chipo.columns:
    print("\nMost ordered item from 'choice_description':")
    print(chipo['choice_description'].value_counts().head(1))
else:
    print("\n'choice_description' column not found in the dataset.")

# 11. Convert 'item_price' to float datatype using apply and lambda
# Removing the '$' and converting to float
if 'item_price' in chipo.columns:
    chipo['item_price'] = chipo['item_price'].apply(lambda x: float(str(x).replace('$', '')))
else:
    print("\n'item_price' column not found in the dataset.")

# 12. Calculate the total revenue for the dataset
if 'item_price' in chipo.columns:
    print("\nTotal revenue:")
    print(chipo['item_price'].sum())

# 13. Find the total number of orders
if 'order_id' in chipo.columns:
    print("\nTotal number of unique orders:")
    print(chipo['order_id'].nunique())
else:
    print("\n'order_id' column not found in the dataset.")

# 14. Compute the average order value
if 'item_price' in chipo.columns and 'order_id' in chipo.columns:
    total_revenue = chipo['item_price'].sum()
    total_orders = chipo['order_id'].nunique()
    average_order_value = total_revenue / total_orders
    print("\nAverage order value:")
    print(average_order_value)

# 15. Determine the total number of unique items sold
if 'item_name' in chipo.columns:
    print("\nTotal number of unique items sold:")
    print(chipo['item_name'].nunique())


First 10 rows of the 'chipo' dataset:
  order_id\tquantity\titem_name\tchoice_description\titem_price
0   1\t1\tChips and Fresh Tomato Salsa\tNULL\t$2.39            
1                   1\t1\tIzze\t[Clementine]\t$3.39            
2            1\t1\tNantucket Nectar\t[Apple]\t$3.39            
3  1\t1\tChips and Tomatillo-Green Chili Salsa\tN...           
4                  3\t1\tSide of Chips\tNULL\t$1.69            
5            5\t1\tChips and Guacamole\tNULL\t$4.45            
6            7\t1\tChips and Guacamole\tNULL\t$4.45            
7  8\t1\tChips and Tomatillo-Green Chili Salsa\tN...           
8                9\t2\tCanned Soda\t[Sprite]\t$2.18            
9           10\t1\tChips and Guacamole\tNULL\t$4.45            

Total number of entries (rows) in 'chipo':
1795

Total number of columns in 'chipo':
1

Column names in 'chipo':
Index(['order_id\tquantity\titem_name\tchoice_description\titem_price'], dtype='object')

Index of 'chipo':
RangeIndex(start=0, stop=1795, step=

In [6]:
# 🌟 Exercise 2: Filtering & Sorting (chipo)

# Use the already created dataframe ‘chipo’ from the previous section

# 1. Check the column names to identify any discrepancies
print("Column names in 'chipo':")
print(chipo.columns)

# 2. Filter data for a specific condition (e.g., items with quantity greater than 10)
# Check if 'quantity' column exists
if 'quantity' in chipo.columns:
    filtered_data = chipo[chipo['quantity'] > 10]
    print("\nFiltered data (quantity > 10):")
    print(filtered_data.head())
else:
    print("\n'quantity' column not found in the dataset.")

# 3. Sort the data by a specific column (e.g., 'item_price')
# Check if 'item_price' column exists
if 'item_price' in chipo.columns:
    sorted_data = chipo.sort_values(by='item_price', ascending=False)
    print("\nData sorted by 'item_price':")
    print(sorted_data.head())
else:
    print("\n'item_price' column not found in the dataset.")

# 4. Filter data for a specific item (e.g., 'Chicken Bowl')
# Check if 'item_name' column exists
if 'item_name' in chipo.columns:
    chicken_bowl_data = chipo[chipo['item_name'] == 'Chicken Bowl']
    print("\nData for 'Chicken Bowl':")
    print(chicken_bowl_data.head())
else:
    print("\n'item_name' column not found in the dataset.")


Column names in 'chipo':
Index(['order_id\tquantity\titem_name\tchoice_description\titem_price'], dtype='object')

'quantity' column not found in the dataset.

'item_price' column not found in the dataset.

'item_name' column not found in the dataset.


In [11]:
import pandas as pd

# 1. Import necessary libraries
import pandas as pd

# 2. Retrieve the dataset and assign it to a variable 'users'
file_path = 'users.csv'  # Update this path to the correct location of your CSV file
try:
    users = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error loading the CSV file: [Errno 2] No such file or directory: '{file_path}'")
except Exception as e:
    print(f"An error occurred: {e}")

# 3. Check if the 'users' DataFrame was loaded successfully
if 'users' in locals():
    # 4. Display the first few rows to understand the structure
    print("\nFirst few rows of the 'users' dataset:")
    print(users.head())

    # 5. Print the column names to ensure they are as expected
    print("\nColumn names in 'users':")
    print(users.columns)

    # 6. Calculate the mean age per occupation
    if 'occupation' in users.columns and 'age' in users.columns:
        mean_age_per_occupation = users.groupby('occupation')['age'].mean()
        print("\nMean age per occupation:")
        print(mean_age_per_occupation)
    else:
        print("\n'occupation' or 'age' column not found in the dataset.")
else:
    print("\nThe 'users' DataFrame was not defined due to an error in loading the CSV file.")



First few rows of the 'users' dataset:
  user_id|age|gender|occupation|zip_code
0                1|24|M|technician|85711
1                     2|53|F|other|94043
2                    3|23|M|writer|32067
3                4|24|M|technician|43537
4                     5|33|F|other|15213

Column names in 'users':
Index(['user_id|age|gender|occupation|zip_code'], dtype='object')

'occupation' or 'age' column not found in the dataset.


In [1]:
import pandas as pd

# Create DataFrames
data1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David']
})

data2 = pd.DataFrame({
    'ID': [5, 6, 7, 8],
    'Name': ['Eve', 'Frank', 'Grace', 'Heidi']
})

data3 = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6, 7, 8],
    'Score': [85, 92, 78, 88, 90, 79, 94, 84]
})

# Merge data1 and data2 along rows
all_data = pd.concat([data1, data2], axis=0, ignore_index=True)

# Merge data1 and data2 along columns
all_data_col = pd.concat([data1, data2], axis=1)

# Display the results
print("Merged along rows (all_data):")
print(all_data)

print("\nMerged along columns (all_data_col):")
print(all_data_col)


Merged along rows (all_data):
   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
3   4    David
4   5      Eve
5   6    Frank
6   7    Grace
7   8    Heidi

Merged along columns (all_data_col):
   ID     Name  ID   Name
0   1    Alice   5    Eve
1   2      Bob   6  Frank
2   3  Charlie   7  Grace
3   4    David   8  Heidi


In [2]:
# 🌟 Exercise 5: Deleting (iris)

# Import necessary libraries
import pandas as pd

# Retrieve the dataset and assign it to ‘iris’
try:
    iris = pd.read_csv('iris.csv', on_bad_lines='skip', encoding='utf-8')
except Exception as e:
    print(f"Error loading the CSV file: {e}")

# Assign appropriate column names
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

# Check if there are any missing values
print("\nMissing values in 'iris':")
print(iris.isnull().sum())



Missing values in 'iris':
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


In [3]:
# 🌟 Exercise 6: Creating Series and DataFrames (pokemon)

# Import necessary libraries
import pandas as pd

# Create a data dictionary and convert it into a DataFrame
pokemon_data = {
    'evolution': ['Ivysaur', 'Charmeleon', 'Wartortle', 'Metapod'],
    'hp': [45, 39, 44, 45],
    'name': ['Bulbasaur', 'Charmander', 'Squirtle', 'Caterpie'],
    'pokedex': ['yes', 'no', 'yes', 'no'],
    'type': ['grass', 'fire', 'water', 'bug']
}
pokemon = pd.DataFrame(pokemon_data)

# Rearrange the columns in the order: ‘name’, ‘type’, ‘hp’, ‘evolution’, ‘pokedex’
pokemon = pokemon[['name', 'type', 'hp', 'evolution', 'pokedex']]
print("\nRearranged DataFrame 'pokemon':")
print(pokemon)



Rearranged DataFrame 'pokemon':
         name   type  hp   evolution pokedex
0   Bulbasaur  grass  45     Ivysaur     yes
1  Charmander   fire  39  Charmeleon      no
2    Squirtle  water  44   Wartortle     yes
3    Caterpie    bug  45     Metapod      no


In [4]:
# 🌟 Exercise 7: Descriptive Statistics (baby_names)

# Import necessary libraries
import pandas as pd

# Download and load the US Baby Names data into a DataFrame named baby_names
try:
    baby_names = pd.read_csv('baby_names.csv', on_bad_lines='skip', encoding='utf-8')
except Exception as e:
    print(f"Error loading the CSV file: {e}")

# Display the first 10 entries
print("\nFirst 10 entries of 'baby_names':")
print(baby_names.head(10))

# Delete the columns ‘Unnamed: 0’ and ‘Id’
baby_names = baby_names.drop(columns=['Unnamed: 0', 'Id'], errors='ignore')
print("\nDataFrame 'baby_names' after dropping columns:")
print(baby_names.head(10))



First 10 entries of 'baby_names':
   Unnamed: 0     Id      Name    Year Gender State  Count
0       11349  11350      Emma  2004.0      F    AK   62.0
1       11350  11351   Madison  2004.0      F    AK   48.0
2       11351  11352    Hannah  2004.0      F    AK   46.0
3       11352  11353     Grace  2004.0      F    AK   44.0
4       11353  11354     Emily  2004.0      F    AK   41.0
5       11354  11355   Abigail  2004.0      F    AK   37.0
6       11355  11356    Olivia  2004.0      F    AK   33.0
7       11356  11357  Isabella  2004.0      F    AK   30.0
8       11357  11358    Alyssa  2004.0      F    AK   29.0
9       11358  11359    Sophia  2004.0      F    AK   28.0

DataFrame 'baby_names' after dropping columns:
       Name    Year Gender State  Count
0      Emma  2004.0      F    AK   62.0
1   Madison  2004.0      F    AK   48.0
2    Hannah  2004.0      F    AK   46.0
3     Grace  2004.0      F    AK   44.0
4     Emily  2004.0      F    AK   41.0
5   Abigail  2004.0      F  

In [5]:
# 🌟 Exercise 8: Handling Time Series Data (investor_data)

# Import necessary libraries
import pandas as pd

# Create dataset using pd.date_range and specified columns
date_rng = pd.date_range(start='2021-01-01', end='2021-01-10', freq='D')
temperature = [30, 31, 29, 32, 33, 34, 28, 27, 25, 35]
humidity = [80, 75, 70, 85, 90, 80, 75, 60, 65, 70]
wind_speed = [5, 6, 7, 8, 9, 4, 10, 5, 6, 7]

weather_data = pd.DataFrame({
    'Date': date_rng,
    'Temperature': temperature,
    'Humidity': humidity,
    'Wind_Speed': wind_speed
})

# Determine the frequency of the dataset
freq = pd.infer_freq(weather_data['Date'])
print(f"\nThe frequency of the dataset is: {freq}")

# Set ‘Date’ as the index of the DataFrame
weather_data.set_index('Date', inplace=True)
print("\nDataFrame 'weather_data' with 'Date' as index:")
print(weather_data)



The frequency of the dataset is: D

DataFrame 'weather_data' with 'Date' as index:
            Temperature  Humidity  Wind_Speed
Date                                         
2021-01-01           30        80           5
2021-01-02           31        75           6
2021-01-03           29        70           7
2021-01-04           32        85           8
2021-01-05           33        90           9
2021-01-06           34        80           4
2021-01-07           28        75          10
2021-01-08           27        60           5
2021-01-09           25        65           6
2021-01-10           35        70           7
