In [2]:
import numpy as np

#### 1. How to import pandas and check the version?

In [3]:
import pandas as pd
print(pd.__version__)

2.0.3


#### 2. Create a pandas series from each of the items below: a list, numpy and a dictionary

Input

```
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
```

In [4]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Create pandas Series
series_from_list = pd.Series(mylist)
series_from_numpy = pd.Series(myarr)
series_from_dict = pd.Series(mydict)

# Print the Series
print("Series from List:\n", series_from_list)
print("\nSeries from NumPy array:\n", series_from_numpy)
print("\nSeries from Dictionary:\n", series_from_dict)


Series from List:
 0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object

Series from NumPy array:
 0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
dtype: int32

Series from Dictionary:
 a     0
b     1
c     2
e     3
d     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int32


#### 3. Convert the series ser into a dataframe with its index as another column on the dataframe.

Input

```
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)
```

In [5]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Convert Series to DataFrame with index as another column
df = ser.reset_index()

# Rename the columns
df.columns = ['index', 'value']

# Print the DataFrame
print(df)

   index  value
0      a      0
1      b      1
2      c      2
3      e      3
4      d      4
5      f      5
6      g      6
7      h      7
8      i      8
9      j      9
10     k     10
11     l     11
12     m     12
13     n     13
14     o     14
15     p     15
16     q     16
17     r     17
18     s     18
19     t     19
20     u     20
21     v     21
22     w     22
23     x     23
24     y     24
25     z     25


#### 4. Combine ser1 and ser2 to form a dataframe.

Input:

```
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
```

In [6]:

ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Combine ser1 and ser2 into a DataFrame
df = pd.DataFrame({'col1': ser1, 'col2': ser2})

# Print the DataFrame
print(df)

   col1  col2
0     a     0
1     b     1
2     c     2
3     e     3
4     d     4
5     f     5
6     g     6
7     h     7
8     i     8
9     j     9
10    k    10
11    l    11
12    m    12
13    n    13
14    o    14
15    p    15
16    q    16
17    r    17
18    s    18
19    t    19
20    u    20
21    v    21
22    w    22
23    x    23
24    y    24
25    z    25


####  5. Get all items of ser1 and ser2 not common to both.

Input

```
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
```

In [7]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Get items not common to both
not_common_items = pd.concat([ser1[~ser1.isin(ser2)], ser2[~ser2.isin(ser1)]])

# Print the result
print(not_common_items)

0    1
1    2
2    3
2    6
3    7
4    8
dtype: int64


#### 6. Compute the minimum, 25th percentile, median, 75th, and maximum of ser.

Input

```
ser = pd.Series(np.random.normal(10, 5, 25))

```

In [8]:
ser = pd.Series(np.random.normal(10, 5, 25))

# Compute summary statistics
summary_stats = ser.describe(percentiles=[.25, .5, .75])

# Extract specific percentiles
min_value = summary_stats['min']
percentile_25 = summary_stats['25%']
median = summary_stats['50%']
percentile_75 = summary_stats['75%']
max_value = summary_stats['max']

# Print the results
print(f"Minimum: {min_value}")
print(f"25th Percentile: {percentile_25}")
print(f"Median: {median}")
print(f"75th Percentile: {percentile_75}")
print(f"Maximum: {max_value}")

Minimum: -1.07402863919752
25th Percentile: 5.922990583050199
Median: 9.53526643157061
75th Percentile: 12.997017729326785
Maximum: 24.99723237038814


#### Problem 7. Calculte the frequency counts of each unique value ser.

Input

```
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
```

In [9]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

# Calculate frequency counts
value_counts = ser.value_counts()

# Print the result
print(value_counts)

c    5
f    5
b    4
d    4
h    4
g    3
a    3
e    2
Name: count, dtype: int64


#### 8. From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’.

Input

```

np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
```

In [10]:
np.random.seed(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

# Find the top 2 most frequent items
top_items = ser.value_counts().nlargest(2).index

# Replace everything else as 'Other'
ser = ser.apply(lambda x: x if x in top_items else 'Other')

# Print the result
print(ser)

0         1
1         1
2         4
3         4
4         4
5         4
6         1
7     Other
8     Other
9         1
10    Other
11    Other
dtype: object


#### 9. Bin the series ser into 10 equal deciles and replace the values with the bin name.

Input
```
ser = pd.Series(np.random.random(20))
```
Desired Output

```
# First 5 items
0    7th
1    9th
2    7th
3    3rd
4    8th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]
```

In [11]:
np.random.seed(42)
ser = pd.Series(np.random.random(20))

# Bin the series into 10 equal deciles
bins = pd.qcut(ser, q=10, labels=[f'{i}th' for i in range(1, 11)])

# Print the result for the first 5 items
print(bins.head())

0     5th
1    10th
2     8th
3     7th
4     2th
dtype: category
Categories (10, object): ['1th' < '2th' < '3th' < '4th' ... '7th' < '8th' < '9th' < '10th']


#### 10. Reshape the series ser into a dataframe with 7 rows and 5 columns

Input

```
ser = pd.Series(np.random.randint(1, 10, 35))
```

In [12]:
np.random.seed(42)
ser = pd.Series(np.random.randint(1, 10, 35))

# Reshape the series into a DataFrame with 7 rows and 5 columns
df = pd.DataFrame(ser.values.reshape(7, 5))

# Print the result
print(df)

   0  1  2  3  4
0  7  4  8  5  7
1  3  7  8  5  4
2  8  8  3  6  5
3  2  8  6  2  5
4  1  6  9  1  3
5  7  4  9  3  5
6  3  7  5  9  7


#### 11. Find the positions of numbers that are multiples of 3 from ser.

Input

```
ser = pd.Series(np.random.randint(1, 10, 7))
```

In [13]:
# Creating the series
ser = pd.Series(np.random.randint(1, 10, 7))

# Finding positions of multiples of 3
positions = ser.index[ser % 3 == 0].tolist()

print("Original Series:")
print(ser)
print("\nPositions of multiples of 3:")
print(positions)

Original Series:
0    2
1    4
2    9
3    2
4    9
5    5
6    2
dtype: int32

Positions of multiples of 3:
[2, 4]


#### 12. Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

Input

```
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
```

In [14]:
# Input Series
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical stacking (concatenation along the rows)
vertical_stack = pd.concat([ser1, ser2], axis=0, ignore_index=True)

# Horizontal stacking (concatenation along the columns)
horizontal_stack = pd.concat([ser1, ser2], axis=1, keys=['ser1', 'ser2'])

# Display the results
print("Vertical Stack:")
print(vertical_stack)

print("\nHorizontal Stack:")
print(horizontal_stack)

Vertical Stack:
0    0
1    1
2    2
3    3
4    4
5    a
6    b
7    c
8    d
9    e
dtype: object

Horizontal Stack:
   ser1 ser2
0     0    a
1     1    b
2     2    c
3     3    d
4     4    e


#### 13. Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

Input
```
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
```

In [15]:

# Input Series
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical stacking (concatenation along the rows)
vertical_stack = pd.concat([ser1, ser2], axis=0, ignore_index=True)

# Horizontal stacking (concatenation along the columns)
horizontal_stack = pd.concat([ser1, ser2], axis=1, keys=['ser1', 'ser2'])

# Display the results
print("Vertical Stack:")
print(vertical_stack)

print("\nHorizontal Stack:")
print(horizontal_stack)

Vertical Stack:
0    0
1    1
2    2
3    3
4    4
5    a
6    b
7    c
8    d
9    e
dtype: object

Horizontal Stack:
   ser1 ser2
0     0    a
1     1    b
2     2    c
3     3    d
4     4    e


#### 14. Get the day of month, week number, day of year and day of week from ser.

Input
```
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
```
Desired output
```
Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day num of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']
```

In [16]:
# Input Series
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Convert strings to datetime objects
ser_date = pd.to_datetime(ser)

# Extract information
day_of_month = ser_date.dt.day.tolist()
week_number = ser_date.dt.week.tolist()  # Note: This is deprecated in recent versions, use weekofyear instead
day_of_year = ser_date.dt.dayofyear.tolist()
day_of_week = ser_date.dt.day_name().tolist()

# Display the results
print("Date:", day_of_month)
print("Week number:", week_number)
print("Day num of year:", day_of_year)
print("Day of week:", day_of_week)

ValueError: time data "02-02-2011" doesn't match format "%d %b %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

#### 15. From ser, extract words that contain atleast 2 vowels.

Input
```
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
```

Desired Output
```
0     Apple
1    Orange
4     Money
dtype: object
```

In [None]:
# Input Series
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Function to count vowels in a word
def count_vowels(word):
    vowels = set("AEIOUaeiou")
    return sum(1 for char in word if char in vowels)

# Filter words with at least 2 vowels
result = ser[ser.apply(lambda x: count_vowels(x) >= 2)]

# Display the result
print(result)

0     Apple
1    Orange
4     Money
dtype: object


#### 16. Get the first column (a) in df as a dataframe (rather than as a Series).

Input

```
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
```

In [17]:
# Create the DataFrame
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))

# Extract the first column 'a' as a DataFrame
first_column_df = df[['a']]

# Print the result
print(first_column_df)

    a
0   0
1   5
2  10
3  15


#### 17.

1 - In df, interchange columns 'a' and 'c'.

2 - Create a generic function to interchange two columns, without hardcoding column names.

3 - Sort the columns in reverse alphabetical order, that is colume 'e' first through column 'a' last.

Input

```

df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
```

#### 

In [18]:
def interchange_columns(df, col1, col2):
    # Interchange two columns in a DataFrame
    df[[col1, col2]] = df[[col2, col1]]
    return df

# Example usage
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df = interchange_columns(df, 'a', 'c')
print(df)

    a   b   c   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19


#### 18. Find the row position of the 5th largest value of column 'a' in df.

Input

```
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
```

In [19]:
# Create the DataFrame
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10, -1), columns=list('abc'))

# Find the row position of the 5th largest value in column 'a'
row_position_5th_largest = df['a'].nlargest(5).idxmax()

print("Row position of the 5th largest value in column 'a': {}".format(row_position_5th_largest))

Row position of the 5th largest value in column 'a': 2


#### 19. In ser, find the position of the 2nd largest value greater than the mean.

Input
```
ser = pd.Series(np.random.randint(1, 100, 15))
```

In [20]:
# Create the Series
ser = pd.Series(np.random.randint(1, 100, 15))

# Find the position of the 2nd largest value greater than the mean
mean_value = ser.mean()
filtered_values = ser[ser > mean_value].nlargest(2)
position_2nd_largest_gt_mean = filtered_values.idxmax()

print("Position of the 2nd largest value greater than the mean: {}".format(position_2nd_largest_gt_mean))

Position of the 2nd largest value greater than the mean: 0


#### 20. Get one-hot encodings for column 'a' in the dataframe df and append it as columns.

Input

```
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
```
Output

```
   0  5  10  15  20   b   c   d   e
0  1  0   0   0   0   1   2   3   4
1  0  1   0   0   0   6   7   8   9
2  0  0   1   0   0  11  12  13  14
3  0  0   0   1   0  16  17  18  19
4  0  0   0   0   1  21  22  23  24
```

In [21]:

# Create the DataFrame
df = pd.DataFrame(np.arange(25).reshape(5, -1), columns=list('abcde'))

# Get one-hot encodings for column 'a'
one_hot_encoding = pd.get_dummies(df['a'], prefix='a')

# Concatenate the one-hot encoding with the original DataFrame
df = pd.concat([one_hot_encoding, df.drop('a', axis=1)], axis=1)

# Reorder columns to have one-hot encoding first
column_order = list(one_hot_encoding.columns) + list(df.columns.difference(one_hot_encoding.columns))
df = df[column_order]

# Print the result
print(df)

     a_0    a_5   a_10   a_15   a_20   b   c   d   e
0   True  False  False  False  False   1   2   3   4
1  False   True  False  False  False   6   7   8   9
2  False  False   True  False  False  11  12  13  14
3  False  False  False   True  False  16  17  18  19
4  False  False  False  False   True  21  22  23  24
