In [3]:
import pandas as pd


# Q1. List any five functions of the pandas library with execution.

In [4]:

# Create a small DataFrame with two integer columns to demonstrate on.
data = {'A': [1, 2, 3],
        'B': [4, 5, 6]}
df = pd.DataFrame(data)

# Functions:
print(df.head())     # Display the first 5 rows of the DataFrame
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()            # Display information about the DataFrame
print(df.describe()) # Generate summary statistics of the DataFrame
print(df.shape)      # Get the shape of the DataFrame (number of rows and columns)
print(df.columns)    # Get the column names of the DataFrame


   A  B
0  1  4
1  2  5
2  3  6
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
dtypes: int64(2)
memory usage: 112.0 bytes
None
         A    B
count  3.0  3.0
mean   2.0  5.0
std    1.0  1.0
min    1.0  4.0
25%    1.5  4.5
50%    2.0  5.0
75%    2.5  5.5
max    3.0  6.0
(3, 2)
Index(['A', 'B'], dtype='object')


# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [5]:

def reindex_dataframe(df):
    """Return a copy of ``df`` whose index is 1, 3, 5, ... (one label per row).

    The input DataFrame is left untouched; only the copy receives the new
    index.
    """
    row_count = len(df)
    result = df.copy()
    # Odd labels starting at 1: a step of 2 with one label for each row.
    result.index = range(1, row_count * 2 + 1, 2)
    return result

# Example usage: build a three-row frame with columns A, B and C, re-index it
# with reindex_dataframe, and print the re-indexed copy.
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}
df = pd.DataFrame(data)
df_reindexed = reindex_dataframe(df)
print(df_reindexed)


   A  B  C
1  1  4  7
3  2  5  8
5  3  6  9


# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.
For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should
calculate and print the sum of the first three values, which is 60.

In [6]:

def sum_first_three(df):
    """Print the sum of the first three entries of the 'Values' column.

    Fewer than three rows are summed as-is. Nothing is returned; the result
    is only written to the console.
    """
    total = 0
    # Walk over at most the first three values and accumulate them.
    for value in df['Values'].head(3):
        total = total + value
    print("Sum of the first three values:", total)

# Example usage: a five-value 'Values' column; the call prints the sum of the
# first three entries (10 + 20 + 30).
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
sum_first_three(df)


Sum of the first three values: 60


# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [7]:

def add_word_count(df):
    """Add a 'Word_Count' column holding the number of words in 'Text'.

    Words are whitespace-separated tokens, as produced by ``str.split()`` with
    no separator. Rows whose text is missing (``None``/``NaN``) get a count of
    0 instead of raising.

    The column is added to ``df`` in place, and the same DataFrame object is
    returned for convenience.
    """
    # Vectorized split; missing entries stay missing rather than raising.
    word_lists = df['Text'].str.split()
    # Missing text has no words: fill with 0 so the column stays integer-typed.
    df['Word_Count'] = word_lists.str.len().fillna(0).astype('int64')
    return df

# Example usage: three short sentences; add_word_count adds the 'Word_Count'
# column and the resulting frame is printed.
data = {'Text': ['This is a sample text.', 'Another example.', 'Just a few words.']}
df = pd.DataFrame(data)
df_with_word_count = add_word_count(df)
print(df_with_word_count)


                     Text  Word_Count
0  This is a sample text.           5
1        Another example.           2
2       Just a few words.           4


# Q5. How are DataFrame.size() and DataFrame.shape() different?

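Both `DataFrame.size` and `DataFrame.shape` are attributes (properties), not methods, so they are accessed without parentheses.
`DataFrame.size` returns a single integer: the total number of elements in the DataFrame (rows * columns).
`DataFrame.shape` returns a tuple representing the dimensions of the DataFrame: (rows, columns).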

# Q6. Which function of pandas do we use to read an excel file?

In [None]:

# Read an Excel file into a DataFrame with pandas.read_excel.
# 'file.xlsx' is a relative path, so it is resolved against the notebook's
# current working directory.
df = pd.read_excel('file.xlsx')

# Example usage: print the loaded DataFrame.
print(df)


# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. 
Write a Python function that creates a new column
'Username' in df that contains only the username part of each email address.
The username is the part of the email address that appears before the '@' symbol. For example, if the
email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your
function should extract the username from each email address and store it in the new 'Username'
column.

In [8]:

def extract_username(df):
    """Add a 'Username' column with the part of 'Email' before the first '@'.

    An address without an '@' is copied through unchanged. Missing addresses
    (``None``/``NaN``) produce a missing username instead of raising.

    The column is added to ``df`` in place, and the same DataFrame object is
    returned for convenience.
    """
    # Split once on the first '@' and keep the left-hand piece. The .str
    # accessor propagates missing values, where a per-row lambda would fail.
    df['Username'] = df['Email'].str.split('@', n=1).str[0]
    return df

# Example usage: two sample addresses; extract_username adds the 'Username'
# column and the resulting frame is printed.
data = {'Email': ['john.doe@example.com', 'jane.smith@example.com']}
df = pd.DataFrame(data)
df_with_username = extract_username(df)
print(df_with_username)


                    Email    Username
0    john.doe@example.com    john.doe
1  jane.smith@example.com  jane.smith


# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'.
Write a Python function that selects
all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
function should return a new DataFrame that contains only the selected rows.
For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2
Your function should select the following rows (every row with A > 5 and B < 10):
A B C
1 8 2 7
2 6 9 4
4 9 1 2
The function should return a new DataFrame that contains only the selected rows.

In [9]:

def select_rows(df):
    """Return the rows of ``df`` where 'A' is above 5 and 'B' is below 10.

    Both comparisons are strict. The original row labels are kept on the
    selected rows.
    """
    a_above_five = df['A'] > 5
    b_below_ten = df['B'] < 10
    # A row must satisfy both conditions to be kept.
    return df[a_above_five & b_below_ten]

# Example usage: the five-row frame from the question; the rows returned by
# select_rows are printed.
data = {'A': [3, 8, 6, 2, 9],
        'B': [5, 2, 9, 3, 1],
        'C': [1, 7, 4, 5, 2]}
df = pd.DataFrame(data)
selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [10]:

def calculate_statistics(df):
    """Return ``(mean, median, standard deviation)`` of the 'Values' column.

    The three figures come from the Series methods ``mean()``, ``median()``
    and ``std()`` with their default arguments.
    """
    values = df['Values']
    return values.mean(), values.median(), values.std()

# Example usage: compute the three statistics for a five-value column and
# print each one on its own line.
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
mean, median, std = calculate_statistics(df)
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [11]:

def calculate_moving_average(df, window_size=7):
    """Add a 'MovingAverage' column: the trailing mean of 'Sales'.

    The window covers the current row and the ``window_size - 1`` rows before
    it. It counts rows and does not consult the 'Date' column. With
    ``min_periods=1`` the first rows average over however many rows are
    available, so no value is left empty at the start.

    The column is added to ``df`` in place and the same object is returned.
    """
    trailing_sales = df['Sales'].rolling(window=window_size, min_periods=1)
    df['MovingAverage'] = trailing_sales.mean()
    return df

# Example usage: ten consecutive daily dates with sales figures; the frame is
# printed after calculate_moving_average adds the 'MovingAverage' column
# (default window of 7).
data = {'Date': pd.date_range(start='2023-01-01', periods=10),
        'Sales': [100, 120, 130, 110, 150, 160, 140, 130, 170, 180]}
df = pd.DataFrame(data)
df_with_moving_average = calculate_moving_average(df)
print(df_with_moving_average)


        Date  Sales  MovingAverage
0 2023-01-01    100     100.000000
1 2023-01-02    120     110.000000
2 2023-01-03    130     116.666667
3 2023-01-04    110     115.000000
4 2023-01-05    150     122.000000
5 2023-01-06    160     128.333333
6 2023-01-07    140     130.000000
7 2023-01-08    130     134.285714
8 2023-01-09    170     141.428571
9 2023-01-10    180     148.571429


# Q11. You have a Pandas DataFrame df with a column 'Date'. 
Write a Python function that creates a new
column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
Monday, Tuesday) corresponding to each date in the 'Date' column.
For example, if df contains the following values:
Date
0 2023-01-01
1 2023-01-02
2 2023-01-03
3 2023-01-04
4 2023-01-05
Your function should create the following DataFrame:

Date Weekday
0 2023-01-01 Sunday
1 2023-01-02 Monday
2 2023-01-03 Tuesday
3 2023-01-04 Wednesday
4 2023-01-05 Thursday
The function should return the modified DataFrame.

In [12]:

def add_weekday_column(df):
    """Add a 'Weekday' column with the English day name of each 'Date'.

    'Date' may already be datetime-typed or may hold date strings; it is
    parsed with ``pd.to_datetime`` before the name is derived. Only the
    derived names are stored, so the 'Date' column itself is left as it was.

    The column is added to ``df`` in place and the same object is returned.
    """
    dates = pd.to_datetime(df['Date'])
    # day_name() returns English names by default, whereas strftime('%A')
    # follows the process locale and can yield non-English names.
    df['Weekday'] = dates.dt.day_name()
    return df

# Example usage: five consecutive dates starting 2023-01-01; the frame is
# printed after add_weekday_column adds the 'Weekday' column.
data = {'Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])}
df = pd.DataFrame(data)
df_with_weekday = add_weekday_column(df)
print(df_with_weekday)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [13]:

def select_rows_between_dates(df, start_date, end_date):
    """Return the rows whose 'Date' lies in ``[start_date, end_date]``.

    Both bounds are inclusive, and the original row labels are kept.

    NOTE(review): the bounds are compared exactly as given. A date-only
    ``end_date`` presumably means midnight at the start of that day, so
    timestamps later on the end day would be excluded. Confirm whether
    callers expect the whole end day to be included.
    """
    in_range = df['Date'].between(start_date, end_date)
    return df[in_range]

# Example usage: four dates, one of them in February; the rows that fall
# within January 2023 (bounds given as date strings) are printed.
data = {'Date': pd.to_datetime(['2023-01-15', '2023-01-20', '2023-02-05', '2023-01-10'])}
df = pd.DataFrame(data)
start_date = '2023-01-01'
end_date = '2023-01-31'
selected_df = select_rows_between_dates(df, start_date, end_date)
print(selected_df)


        Date
0 2023-01-15
1 2023-01-20
3 2023-01-10


# Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?



In [14]:
import pandas as pd
