read_csv(): This function is used to read data from a CSV file into a DataFrame.

In [3]:
import pandas as pd

# Read data from a CSV file into a DataFrame
df = pd.read_csv("amazon_products.csv")


head(): This function is used to display the first few rows of a DataFrame.

In [5]:
# Display the first 5 rows of the DataFrame
print(df.head(5))

                                        Product Name              Rating  \
0  VW 80 cm (32 inches) Frameless Series HD Ready...  3.8 out of 5 stars   
1  VW 80 cm (32 inches) Frameless Series HD Ready...  3.8 out of 5 stars   
2  MI 80 cm (32 inches) A Series HD Ready Smart G...  4.2 out of 5 stars   
3  Samsung 163 cm (65 inches) 4K Ultra HD Smart Q...  4.3 out of 5 stars   
4  Redmi 80 cm (32 inches) F Series HD Ready Smar...  4.2 out of 5 stars   

   Customers_Rated      Price  
0            10621   Rs 7,499  
1            10621   Rs 7,499  
2            56425  Rs 12,499  
3                3  Rs 99,990  
4            60773  Rs 12,999  


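info(): This method prints a concise summary of the DataFrame, including the data type of each column and the number of non-null values. It writes the summary to the output itself and returns None, so wrapping the call in print() also prints "None" after the summary.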

In [6]:
# Get a concise summary of the DataFrame.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Product Name     34 non-null     object
 1   Rating           34 non-null     object
 2   Customers_Rated  34 non-null     int64 
 3   Price            34 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.2+ KB
None


describe(): This function generates descriptive statistics of the DataFrame, such as count, mean, standard deviation, minimum, maximum, and quartile values.

In [7]:
# Generate descriptive statistics of the DataFrame
print(df.describe())


       Customers_Rated
count        34.000000
mean       9565.823529
std       16299.618442
min           2.000000
25%         165.750000
50%        3875.000000
75%       11389.000000
max       60773.000000


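groupby(): This method is used to group the rows of a DataFrame by the values of one or more columns, which allows for aggregation and analysis of the data within each group. For example, df.groupby('Rating')['Customers_Rated'].sum() returns the total number of customer ratings for each distinct value of 'Rating'.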

In [1]:
import pandas as pd

def reindex_with_incrementing_index(df):
    """Relabel the rows of ``df`` in place with the odd numbers 1, 3, 5, ...

    The replacement index is named 'NewIndex' and carries exactly one label
    per existing row. The same DataFrame object that was passed in is
    returned.
    """
    row_count = len(df)
    # Stopping just short of 2 * row_count yields row_count odd labels.
    odd_labels = range(1, 2 * row_count, 2)
    df.index = pd.Index(odd_labels, name='NewIndex')
    return df

# Example usage:
# Assuming df is your DataFrame with columns 'A', 'B', and 'C'
df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': [9, 10, 11, 12]
})

# Reindexing the DataFrame
df = reindex_with_incrementing_index(df)
print(df)


          A  B   C
NewIndex          
1         1  5   9
3         2  6  10
5         3  7  11
7         4  8  12


In [2]:
import pandas as pd

def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    When the column has fewer than three rows, whatever is there is summed.
    Nothing is returned; the result is only written to standard output.
    """
    # Positional slice: at most the three leading entries.
    leading_values = df['Values'].iloc[:3]

    running_total = 0
    for entry in leading_values:
        running_total = running_total + entry

    print("Sum of the first three values:", running_total)

# Example usage:
# Assuming df is your DataFrame with a column named 'Values'
df = pd.DataFrame({'Values': [1, 2, 3, 4, 5]})

# Calling the function
sum_first_three_values(df)


Sum of the first three values: 6


In [3]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column holding the number of words in 'Text'.

    Words are the whitespace-separated tokens produced by ``str.split()``.
    Cells that are not strings (for example missing values such as ``None``
    or ``NaN``) count as zero words instead of raising ``AttributeError``.

    The column is added to ``df`` in place and the same object is returned.
    """
    def _count_words(text):
        # Missing cells arrive as None/NaN, which have no .split() method.
        if not isinstance(text, str):
            return 0
        return len(text.split())

    df['Word_Count'] = df['Text'].apply(_count_words)
    return df

# Example usage:
# Assuming df is your DataFrame with a column named 'Text'
df = pd.DataFrame({'Text': ['This is a sentence.', 'Another sentence here.', 'A third sentence.']})

# Adding the 'Word_Count' column
df = add_word_count_column(df)
print(df)


                     Text  Word_Count
0     This is a sentence.           4
1  Another sentence here.           3
2       A third sentence.           3


DataFrame.size:

DataFrame.size is an attribute (not a method) that returns the total number of elements (cells) in the DataFrame as a single integer.
It equals the number of rows (DataFrame.shape[0]) multiplied by the number of columns (DataFrame.shape[1]); for example, a DataFrame with 34 rows and 4 columns has a size of 136.
DataFrame.shape:

DataFrame.shape returns a tuple representing the dimensions of the DataFrame.
It returns a tuple in the format (number of rows, number of columns), indicating the number of rows and the number of columns in the DataFrame.
The shape attribute returns a tuple with two elements, providing information about the DataFrame's structure.


In [5]:
import pandas as pd

# Read data from an Excel file into a DataFrame
df = pd.read_excel('LUSID Excel - Setting up your market data.xlsx')

# Display the DataFrame
print(df)


    Unnamed: 0  Unnamed: 1  Unnamed: 2  \
0          NaN         NaN         NaN   
1          NaN         NaN         NaN   
2          NaN         NaN         NaN   
3          NaN         NaN         NaN   
4          NaN         NaN         NaN   
5          NaN         NaN         NaN   
6          NaN         NaN         NaN   
7          NaN         NaN         NaN   
8          NaN         NaN         NaN   
9          NaN         NaN         NaN   
10         NaN         NaN         NaN   
11         NaN         NaN         NaN   
12         NaN         NaN         NaN   
13         NaN         NaN         NaN   
14         NaN         NaN         NaN   
15         NaN         NaN         NaN   
16         NaN         NaN         NaN   
17         NaN         NaN         NaN   
18         NaN         NaN         NaN   
19         NaN         NaN         NaN   
20         NaN         NaN         NaN   
21         NaN         NaN         NaN   
22         NaN         NaN        

In [6]:
import pandas as pd

def extract_username(df):
    """Add a 'Username' column containing the part of 'Email' before '@'.

    ``df`` is modified in place and the same object is returned.
    """
    email_parts = df['Email'].str.split('@')
    # Element 0 of each split is the text preceding the first '@'.
    df['Username'] = email_parts.str.get(0)
    return df

# Example usage:
# Assuming df is your DataFrame with a column named 'Email'
df = pd.DataFrame({'Email': ['john.doe@example.com', 'jane.smith@example.com']})

# Calling the function to extract usernames
df = extract_username(df)
print(df)


                    Email    Username
0    john.doe@example.com    john.doe
1  jane.smith@example.com  jane.smith


In [1]:
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is greater than 5 and 'B' is less than 10."""
    a_above_five = df['A'] > 5
    b_below_ten = df['B'] < 10
    # Both conditions must hold for a row to be kept.
    return df[a_above_five & b_below_ten]

# Example usage:
# Assuming df is your DataFrame with columns 'A', 'B', and 'C'
df = pd.DataFrame({
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
})

selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [2]:
import pandas as pd

def calculate_stats(df):
    """Summarise the 'Values' column as a dict.

    The returned dict has the keys 'Mean', 'Median' and 'Standard Deviation',
    in that order, each computed with the corresponding pandas Series method
    using its default arguments.
    """
    values = df['Values']
    summary = {}
    summary['Mean'] = values.mean()
    summary['Median'] = values.median()
    summary['Standard Deviation'] = values.std()
    return summary

# Example usage:
# Assuming df is your DataFrame with a column 'Values'
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})

result = calculate_stats(df)
print(result)


{'Mean': 30.0, 'Median': 30.0, 'Standard Deviation': 15.811388300841896}


In [3]:
{'Mean': 30.0, 'Median': 30.0, 'Standard Deviation': 15.811388300841896}


{'Mean': 30.0, 'Median': 30.0, 'Standard Deviation': 15.811388300841896}

In [4]:
import pandas as pd

def add_weekday_column(df):
    """Parse 'Date' as datetimes and add the matching day name as 'Weekday'.

    Both columns are written to ``df`` in place; the same object is returned.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates

    # '%A' formats each timestamp as its full weekday name, e.g. 'Sunday'.
    df['Weekday'] = parsed_dates.dt.strftime('%A')

    return df

# Example usage:
# Assuming df is your DataFrame with a column 'Date'
df = pd.DataFrame({'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']})

# Add the 'Weekday' column
df = add_weekday_column(df)
print(df)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


In [6]:
import pandas as pd

def select_rows_between_dates(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows of ``df`` whose 'Date' falls in an inclusive date range.

    Args:
        df: DataFrame with a 'Date' column that ``pd.to_datetime`` can parse.
        start_date: First date to keep (inclusive). Defaults to '2023-01-01'.
        end_date: Last date to keep (inclusive). Defaults to '2023-01-31'.

    Returns:
        The rows whose date satisfies ``start_date <= Date <= end_date``.

    Note:
        As a side effect, the 'Date' column of ``df`` is converted to
        datetime in place.
    """
    # Convert the 'Date' column to datetime format so comparisons are
    # chronological rather than lexical.
    df['Date'] = pd.to_datetime(df['Date'])

    # between() is inclusive on both ends by default, i.e. it evaluates
    # (Date >= start_date) & (Date <= end_date).
    in_range = df['Date'].between(start_date, end_date)

    return df[in_range]

# Example usage:
# Assuming df is your DataFrame with a column 'Date'
df = pd.DataFrame({'Date': ['2023-01-01', '2023-01-15', '2023-01-31', '2023-02-05', '2023-02-10']})

# Select rows between '2023-01-01' and '2023-01-31'
selected_df = select_rows_between_dates(df)
print(selected_df)


        Date
0 2023-01-01
1 2023-01-15
2 2023-01-31


In [7]:
import pandas as pd
