In [1]:
import pandas as pd

# Sample customers, written one record per row for readability.
# Postal codes are kept as strings so leading zeros ('01000') survive;
# '1234A' is a non-numeric value.
customer_columns = ['Customer ID', 'Name', 'Postal code', 'Address', 'City', 'Country']
customer_rows = [
    (1, 'Alice', '12345', '123 Main St', 'City1', 'USA'),
    (2, 'Bob', '67890', '456 Oak St', 'City2', 'USA'),
    (3, 'Charlie', '54321', '789 Pine St', 'City3', 'USA'),
    (4, 'David', '1234A', '101 Maple St', 'City4', 'USA'),
    (5, 'Eve', '01000', '202 Elm St', 'City5', 'USA'),
]

# Transpose the records into the column -> values mapping pandas expects.
data = {
    column: list(values)
    for column, values in zip(customer_columns, zip(*customer_rows))
}

# Build the frame and show it as plain text.
customers = pd.DataFrame(data)
print(customers)


   Customer ID     Name Postal code       Address   City Country
0            1    Alice       12345   123 Main St  City1     USA
1            2      Bob       67890    456 Oak St  City2     USA
2            3  Charlie       54321   789 Pine St  City3     USA
3            4    David       1234A  101 Maple St  City4     USA
4            5      Eve       01000    202 Elm St  City5     USA


In [None]:
# Validate the 'Postal code' column of `customers`: each check below yields a
# boolean Series (one flag per row), and the final result is True only when
# every row passes every check.

# Condition 1: Postal code length is 5
condition_1 = customers['Postal code'].str.len() == 5

# Condition 2: every character is an ASCII digit 0-9.
# str.isnumeric() also accepts non-ASCII numerals such as '²' or '½', so a
# value like '1²³45' would pass all three checks; an explicit [0-9] pattern
# rejects it. na=False makes a missing value fail this check rather than
# yield NaN.
condition_2 = customers['Postal code'].str.fullmatch(r'[0-9]+', na=False)

# Condition 3: Postal code is strictly between '01000' and '96000'.
# This is a string comparison; for equal-length digit strings it orders the
# same way as the numeric values would.
# NOTE(review): both bounds are exclusive, so '01000' itself is rejected —
# confirm that is intended.
condition_3 = ('01000' < customers['Postal code']) & (customers['Postal code'] < '96000')

# True only if all three conditions hold on every row
all_conditions_met = (condition_1 & condition_2 & condition_3).all()

print("All conditions met:", all_conditions_met)


In [2]:
import pandas as pd

# Sample data
data = {
    'Customer ID': [1, 2, 3, 4, 5, 6, 7],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Postal code': ['12345', '67890', '54321', '1234A', '01000', '67891', '54322'],
    'Address': [
        '123 Main St',
        '456 Oak St',
        '789 Pine St',
        '101 Maple St',
        '202 Elm St',
        '303 Birch St',
        '404 Cedar St'
    ],
    'City': ['RENES', 'PERPINAN', 'PARIS LA DÉFENSE', 'PARIS', 'LYON', 'MARSEILLE', 'NICE'],
    'Country': ['France', 'France', 'France', 'France', 'France', 'France', 'France']
}

# Create DataFrame
customers = pd.DataFrame(data)

# Display unique cities before replacement
print("Unique cities before replacement:", customers['City'].unique())

# Mapping from the city spellings found in the data to the spelling to keep.
city_corrections = {
    'RENES': 'RENNES',
    'PERPINAN': 'PERPIGNAN',
    'PARIS LA DÉFENSE': 'PARIS',
}

# Apply the corrections to the 'City' column only. Calling replace() on the
# whole DataFrame would also rewrite any matching value in the other columns
# (e.g. a Name or Address equal to one of the keys). Plain assignment is used
# instead of inplace=True.
customers['City'] = customers['City'].replace(city_corrections)

# Display unique cities after replacement
print("Unique cities after replacement:", customers['City'].unique())


Unique cities before replacement: ['RENES' 'PERPINAN' 'PARIS LA DÉFENSE' 'PARIS' 'LYON' 'MARSEILLE' 'NICE']
Unique cities after replacement: ['RENNES' 'PERPIGNAN' 'PARIS' 'LYON' 'MARSEILLE' 'NICE']


In [7]:
# Small ratings table used by the duplicate-handling cells.
# Rows 0 and 1 are identical; the mixed int/float ratings end up as floats.
df = pd.DataFrame(
    data={
        'brand': ['Yum Yum'] * 2 + ['Indomie'] * 3,
        'style': ['cup'] * 3 + ['pack'] * 2,
        'rating': [4, 4, 3.5, 15, 5],
    }
)

df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [14]:
# duplicated() flags every row that exactly repeats an earlier row (the first
# occurrence is not flagged), so summing the flags counts the redundant rows.
duplicate_count = df.duplicated().sum()
print("Number of duplicates:", duplicate_count)

Number of duplicates: 1


In [4]:
# Drop rows that are identical across all columns, keeping the first
# occurrence of each (the default, spelled out here). Returns a new frame;
# df itself is not modified.
df.drop_duplicates(keep='first')

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [5]:
# Treat rows as duplicates when they share the same (brand, style) pair and
# keep only the last row of each pair. Returns a new frame; df is unchanged.
df.drop_duplicates(
    keep='last',
    subset=['brand', 'style'],
)

Unnamed: 0,brand,style,rating
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
4,Indomie,pack,5.0


In [15]:
# Parse a single timestamp string with an explicit strptime-style format.
pd.to_datetime(
    '2018-10-26 12:00:00',
    format='%Y-%m-%d %H:%M:%S',
)

Timestamp('2018-10-26 12:00:00')

In [20]:
# Interpret the integers as day offsets from a custom origin:
# 1 -> one day after 1960-01-01, and so on.
pd.to_datetime(
    [1, 2, 3],
    origin=pd.Timestamp('1960-01-01'),
    unit='D',
)


DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

In [19]:

# Same conversion starting at offset 0, so the first value is the origin
# date itself.
pd.to_datetime(
    [0, 1, 2],
    origin=pd.Timestamp('1960-01-01'),
    unit='D',
)

DatetimeIndex(['1960-01-01', '1960-01-02', '1960-01-03'], dtype='datetime64[ns]', freq=None)

In [21]:
from datetime import timedelta
# Build a duration from mixed units, listed from largest to smallest.
delta = timedelta(
    weeks=2,
    days=50,
    hours=8,
    minutes=5,
    seconds=27,
    milliseconds=29000,
    microseconds=10,
)
# The result is normalized: only days, seconds, and microseconds remain
delta

datetime.timedelta(days=64, seconds=29156, microseconds=10)

In [24]:
import pandas as pd
from datetime import datetime as dt

# Sample data: order dates are US-style 'MM/DD/YY' strings, delivery dates
# are ISO 'YYYY-MM-DD' strings.
data = {
    'Order ID': [1, 2, 3, 4, 5],
    'Product': ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard'],
    'Order date': ['01/15/17', '03/22/17', '05/10/17', '07/18/17', '09/25/17'],
    'Delivery date': ['2017-01-20', '2017-03-27', '2017-05-15', '2017-07-23', '2017-09-30'],
    'Quantity': [1, 2, 3, 1, 5],
    'Price': [1200, 800, 300, 200, 50]
}

# Create DataFrame
Sales_2017 = pd.DataFrame(data)

# Convert both date columns from strings to datetimes.
# pd.to_datetime parses the whole column in one vectorized call with an
# explicit format, and maps missing values to NaT instead of raising the way
# a row-by-row datetime.strptime does.
Sales_2017['Order date'] = pd.to_datetime(Sales_2017['Order date'], format="%m/%d/%y")
Sales_2017['Delivery date'] = pd.to_datetime(Sales_2017['Delivery date'], format="%Y-%m-%d")

# Display the DataFrame
print(Sales_2017)


   Order ID   Product Order date Delivery date  Quantity  Price
0         1    Laptop 2017-01-15    2017-01-20         1   1200
1         2     Phone 2017-03-22    2017-03-27         2    800
2         3    Tablet 2017-05-10    2017-05-15         3    300
3         4   Monitor 2017-07-18    2017-07-23         1    200
4         5  Keyboard 2017-09-25    2017-09-30         5     50


In [25]:
import pandas as pd

# Both yearly tables share the same columns; each order is written as one record.
sales_columns = ['Order ID', 'Product Id', 'Product', 'Order date', 'Quantity', 'Price']

# Sample data for Sales_2017
rows_2017 = [
    (1, 'P001', 'Laptop', '01/15/17', 1, 1200),
    (2, 'P002', 'Phone', '03/22/17', 2, 800),
    (3, 'P003', 'Tablet', '05/10/17', 3, 300),
    (4, 'P004', 'Monitor', '07/18/17', 1, 200),
    (5, 'P005', 'Keyboard', '09/25/17', 5, 50),
]
data_2017 = {
    column: list(values)
    for column, values in zip(sales_columns, zip(*rows_2017))
}

# Sample data for Sales_2018
rows_2018 = [
    (6, 'P002', 'Phone', '02/15/18', 1, 850),
    (7, 'P003', 'Tablet', '04/22/18', 2, 320),
    (8, 'P006', 'Headphones', '06/10/18', 3, 150),
    (9, 'P007', 'Charger', '08/18/18', 1, 20),
    (10, 'P008', 'Mouse', '10/25/18', 5, 25),
]
data_2018 = {
    column: list(values)
    for column, values in zip(sales_columns, zip(*rows_2018))
}

# One DataFrame per year
Sales_2017 = pd.DataFrame(data_2017)
Sales_2018 = pd.DataFrame(data_2018)

# Distinct product IDs per year; the set difference leaves the IDs that
# appear in 2018 but not in 2017.
A = set(Sales_2017['Product Id'])
B = set(Sales_2018['Product Id'])
new_products_2018 = B.difference(A)

print("Product IDs present in 2018 but not in 2017:", new_products_2018)


Product IDs present in 2018 but not in 2017: {'P008', 'P007', 'P006'}
