# Q1. List any five functions of the pandas library with execution.

A1. here are five common functions of the Pandas library with examples:


1. head(): This function is used to display the first few rows of a DataFrame.

In [1]:
import pandas as pd

# create a DataFrame
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Sophie'],
        'Age': [28, 25, 33, 42, 19],
        'Country': ['USA', 'UK', 'Germany', 'France', 'Canada']}
df = pd.DataFrame(data)

# display the first three rows of the DataFrame
print(df.head(3))


    Name  Age  Country
0   John   28      USA
1   Anna   25       UK
2  Peter   33  Germany


2. merge(): This function is used to merge two DataFrames based on a common column.

In [2]:
import pandas as pd

# create two DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': [1, 2, 3, 4]})
df2 = pd.DataFrame({'key': ['B', 'D', 'E', 'F'],
                    'value': [5, 6, 7, 8]})

# merge the DataFrames based on the 'key' column
merged_df = pd.merge(df1, df2, on='key')

# print the result
print(merged_df)


  key  value_x  value_y
0   B        2        5
1   D        4        6


3. describe(): This function is used to generate descriptive statistics of a DataFrame.

In [3]:
import pandas as pd

# create a DataFrame
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Sophie'],
        'Age': [28, 25, 33, 42, 19],
        'Country': ['USA', 'UK', 'Germany', 'France', 'Canada']}
df = pd.DataFrame(data)

# generate descriptive statistics of the DataFrame
print(df.describe())


             Age
count   5.000000
mean   29.400000
std     8.677557
min    19.000000
25%    25.000000
50%    28.000000
75%    33.000000
max    42.000000


4. groupby(): This function is used to group a DataFrame by one or more columns and perform some analysis on each group.

In [4]:
import pandas as pd

# create a DataFrame with employee data
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Sophie'],
        'Age': [28, 25, 33, 42, 19],
        'Country': ['USA', 'UK', 'Germany', 'France', 'Canada'],
        'Salary': [60000, 45000, 55000, 70000, 50000]}
df = pd.DataFrame(data)

# calculate the average salary for each country
avg_salary_by_country = df.groupby('Country')['Salary'].mean()

# print the result
print(avg_salary_by_country)


Country
Canada     50000.0
France     70000.0
Germany    55000.0
UK         45000.0
USA        60000.0
Name: Salary, dtype: float64


5. fillna(): This function is used to fill missing values in a DataFrame with a specified value.

In [6]:
import pandas as pd


# create a DataFrame with missing values
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Sophie'],
        'Age': [28, np.nan, 33, 42, 19],
        'Country': ['USA', 'UK', None, 'France', 'Canada'],
        'Salary': [60000, 45000, None, 70000, None]}
df = pd.DataFrame(data)

# fill missing values with 0
df.fillna(0, inplace=True)

# print the result
print(df)


     Name   Age Country   Salary
0    John  28.0     USA  60000.0
1    Anna   0.0      UK  45000.0
2   Peter  33.0       0      0.0
3   Linda  42.0  France  70000.0
4  Sophie  19.0  Canada      0.0


# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [1]:
import pandas as pd

def reindex_dataframe(df):
    # get the number of rows in the DataFrame
    num_rows = df.shape[0]

    new_index= list(range(1,num_rows*2,2))

    # create a new index starting from 1 and incrementing by 2
    #new_index = pd.RangeIndex(start=1, stop=num_rows*2, step=2)

    # reindexing with new index values
    df=df.reindex(list(new_index))
    return df

# create a datafrome
data = {'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8], 'C': [9, 10, 11, 12]}
df = pd.DataFrame(data)
df_reindex = reindex_dataframe(df)
# print the result
print(df_reindex)

     A    B     C
1  2.0  6.0  10.0
3  4.0  8.0  12.0
5  NaN  NaN   NaN
7  NaN  NaN   NaN


# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.
# For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should calculate and print the sum of the first three values, which is 60.

In [51]:
import pandas as pd

def sum_first_three_values(df):    
    addition = df.head(3)['Values'].sum()    
    print(f"Sum of first three value is {addition}")

# create a datafrome
data = {'Values': [10, 20, 30, 40,50]}
df = pd.DataFrame(data)
sum_first_three_values(df)

Sum of first three value is 60


# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [52]:
import pandas as pd

def add_word_count_column(df):
    # split each row of the 'Text' column by whitespace and count the resulting list
    word_count = df['Text'].str.split().str.len()
    # add the 'Word_Count' column to the DataFrame
    df['Word_Count'] = word_count
    # return the updated DataFrame
    return df

    
# create a datafrome
data ={'Text': ['Working with Pandas is fun!', 'Dheer Singh Rajpoot', 'I Love My India', 'Linda Pinda', 'Sophie']}
df = pd.DataFrame(data)

# add the 'Word_Count' column using the function
df = add_word_count_column(df)

# print the result
print(df)

                          Text  Word_Count
0  Working with Pandas is fun!           5
1          Dheer Singh Rajpoot           3
2              I Love My India           4
3                  Linda Pinda           2
4                       Sophie           1


# Q5. How are DataFrame.size() and DataFrame.shape() different?

A5. In Pandas, DataFrame.size() and DataFrame.shape() are two different methods with different functionalities.

The DataFrame.size() method returns the number of elements in the DataFrame, which is equal to the product of the number of rows and the number of columns in the DataFrame. In other words, DataFrame.size() returns the total number of values in the DataFrame.

On the other hand, the DataFrame.shape() method returns a tuple of two values: the number of rows and the number of columns in the DataFrame. The first value in the tuple represents the number of rows in the DataFrame, while the second value represents the number of columns in the DataFrame.

Here's an example to illustrate the difference between these two methods:

In [53]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# print the size of the DataFrame
print("Size of the DataFrame:", df.size)
# Output: Size of the DataFrame: 6

# print the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)
# Output: Shape of the DataFrame: (3, 2)


Size of the DataFrame: 6
Shape of the DataFrame: (3, 2)


In this example, the DataFrame df has 6 elements (i.e., 2 columns x 3 rows), so the output of df.size is 6. The output of df.shape is a tuple of two values: 3 (the number of rows in df) and 2 (the number of columns in df).

# Q6. Which function of pandas do we use to read an excel file?

A6. To read an Excel file in Pandas, we can use the read_excel() function.

Here's an example:

In [None]:
import pandas as pd

# read the Excel file into a DataFrame
df = pd.read_excel('example.xlsx')

# print the first few rows of the DataFrame
print(df.head())

In this example, the read_excel() function reads an Excel file named example.xlsx and stores its contents in a Pandas DataFrame called df. We can then use various DataFrame methods and functions to manipulate and analyze the data in the Excel file.

# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.
# The username is the part of the email address that appears before the '@' symbol. For example, if the email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your function should extract the username from each email address and store it in the new 'Username' column.

In [68]:
import pandas as pd

def add_username_column(df):
    # split each row of the 'Text' column by @
    user_name = df['Email'].str.split('@').str[0]
    
    # add the 'Username' column to the DataFrame
    df['Username'] = user_name
    # return the updated DataFrame
    return df

    
# create a datafrome
data ={'Email': ['drajpo@sapient.com', 'drajpoot@infosys.com', 'ybisen@optum.com', 'kbisen@uhg.com', 'abisen@lawyer.com', 'john.doe@example.com']}
df = pd.DataFrame(data)

# add the 'User Name' column using the function
df = add_username_column(df)

# print the result
print(df)

                  Email  Username
0    drajpo@sapient.com    drajpo
1  drajpoot@infosys.com  drajpoot
2      ybisen@optum.com    ybisen
3        kbisen@uhg.com    kbisen
4     abisen@lawyer.com    abisen
5  john.doe@example.com  john.doe


![image.png](attachment:918248fb-7556-4397-a1de-c8f8931d9724.png)

In [15]:
import pandas as pd

def filter_dataframe(df):
    return df[ (df['A']>5 )  & (df['B'] < 10) ]    
    
# create a datafrome
df = pd.DataFrame({'A': [3, 8, 6, 2 ,9], 'B': [5, 2, 9, 3, 1], 'C':[1, 7, 4, 5, 2]})

# filter dataframe using the function
df = filter_dataframe(df)

# print the result
print(df)

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [20]:
import pandas as pd

def calculate_stats(df, column):
    mean = df[column].mean()
    median = df[column].median()
    std = df[column].std()
    return mean, median, std

# create a sample DataFrame
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})

# call the function to calculate stats
mean, median, std = calculate_stats(df, 'Values')

# print the results
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

To calculate the moving average, we can use the rolling() method of Pandas DataFrame, which creates a rolling window of a given size and applies a specified function to the data within that window. We can use the rolling() method to create a rolling window of size 7 and apply the mean() function to calculate the moving average.

Here is the Python function to create the 'MovingAverage' column:

In [21]:
import pandas as pd

def calculate_moving_average(df):
    df['MovingAverage'] = df['Sales'].rolling(window=7, min_periods=1).mean()
    return df


This function takes a Pandas DataFrame as input and returns the same DataFrame with a new column 'MovingAverage' that contains the moving average of the 'Sales' column. We use the rolling() method with the window parameter set to 7 to create a rolling window of size 7, and the min_periods parameter set to 1 to include the current day. We then apply the mean() function to calculate the moving average and store it in the 'MovingAverage' column.

To use this function, we can create a sample DataFrame and call the function like this:

In [22]:
# create a sample DataFrame
df = pd.DataFrame({'Date': pd.date_range('2022-01-01', periods=14), 'Sales': [10, 20, 15, 30, 25, 35, 40, 50, 45, 55, 60, 50, 40, 30]})

# calculate the moving average
df = calculate_moving_average(df)

# print the result
print(df)


         Date  Sales  MovingAverage
0  2022-01-01     10      10.000000
1  2022-01-02     20      15.000000
2  2022-01-03     15      15.000000
3  2022-01-04     30      18.750000
4  2022-01-05     25      20.000000
5  2022-01-06     35      22.500000
6  2022-01-07     40      25.000000
7  2022-01-08     50      30.714286
8  2022-01-09     45      34.285714
9  2022-01-10     55      40.000000
10 2022-01-11     60      44.285714
11 2022-01-12     50      47.857143
12 2022-01-13     40      48.571429
13 2022-01-14     30      47.142857


![image.png](attachment:27efbd1e-0604-45ac-9d9d-60e6702e9a76.png)

In [33]:
import pandas as pd
import datetime

def add_weekday_column(df):

    weekday = pd.to_datetime(df['Date']).dt.day_name()
    
    # add the 'Weekday column to the DataFrame
    df['Weekday'] = weekday
    
    # return the updated DataFrame
    return df

    
# create a datafrome
data ={'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']}
df = pd.DataFrame(data)

# add the 'Weekday' column using the function
df = add_weekday_column(df)

# print the result
print(df)

         Date    Weekday
0  2023-01-01     Sunday
1  2023-01-02     Monday
2  2023-01-03    Tuesday
3  2023-01-04  Wednesday
4  2023-01-05   Thursday


# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [39]:
import pandas as pd

def selectrows(df):
    # convert Timestamp column to datetime format
    df['Date'] = pd.to_datetime(df['Date'])
    # select rows where date is between '2023-01-01' and '2023-01-31'
    start_date = pd.to_datetime('2023-01-01')
    end_date = pd.to_datetime('2023-01-31')
    selected_rows = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
    return selected_rows


# create a sample DataFrame with timestamps
df = pd.DataFrame({
    'Date': [
        '2023-04-01 10:00:00',
        '2023-01-05 12:00:00',
        '2023-01-10 14:00:00',
        '2023-02-15 16:00:00',
        '2023-03-20 18:00:00'
    ]    
})
df = selectrows(df)
print(df)

                 Date
1 2023-01-05 12:00:00
2 2023-01-10 14:00:00


# Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

To use the basic functions of pandas, the first and foremost necessary library that needs to be imported is the pandas library itself. The pandas library provides powerful and easy-to-use tools for data analysis and manipulation. The library can be imported in Python using the following line of code:

In [40]:
import pandas as pd

Here, we are importing the pandas library and using "pd" as an alias, which is a common convention in the Python community.