### 101 Pandas Exercises for Data Analysis
https://www.machinelearningplus.com/python/101-pandas-exercises-python/

In [1]:
import numpy as np
import pandas as pd

### 21. How to convert a series of date-strings to a timeseries?

In [2]:
# The same kind of value (a date) written in six different textual layouts
date_strings = [
    '01 Jan 2010',
    '02-02-2011',
    '20120303',
    '2013/04/04',
    '2014-05-05',
    '2015-06-06T12:20',
]
ser = pd.Series(date_strings)

In [3]:
# Solution 1: run dateutil's parser on every element of the series
from dateutil.parser import parse

ser.map(parse)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [4]:
# Solution 2
# The strings in `ser` do not share one layout. Since pandas 2.0, to_datetime
# infers a single format from the first element and raises on the rest, so we
# ask it to infer the format per element instead (format="mixed" needs pandas >= 2.0).
pd.to_datetime(ser, format="mixed")

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

### 22. Get the day of month, week number, day of year and day of week from ser.

In [5]:
# Date strings in mixed layouts, used below to extract calendar components
raw_dates = (
    '01 Jan 2010',
    '02-02-2011',
    '20120303',
    '2013/04/04',
    '2014-05-05',
    '2015-06-06T12:20',
)
ser = pd.Series(list(raw_dates))

In [6]:
from dateutil.parser import parse

# Parse every string individually, then make sure the result is a datetime
# series so that the .dt accessor is available.
ser_ts = pd.to_datetime(ser.map(parse))

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number (ISO 8601 week); replaces Series.dt.weekofyear, which was removed from pandas
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week; replaces Series.dt.weekday_name, which was removed from pandas
print("Day of week: ", ser_ts.dt.day_name().tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


### 23. Convert each month-year string in ser to the date of the 4th day of that month.

In [7]:
# Month-year strings without a day component
month_year = ['Jan 2010', 'Feb 2011', 'Mar 2012']
ser = pd.Series(month_year)

In [9]:
# Solution 1
from dateutil.parser import parse

# Turn the month-year strings into timestamps
ser_ts = ser.map(parse)

# Assemble "<year>-<month>-04" text from the parsed year and month
ser_datestr = (
    ser_ts.dt.year.astype('str')
    + '-'
    + ser_ts.dt.month.astype('str')
    + '-'
    + '04'
)

# Parse the assembled text again and render it as zero-padded YYYY-MM-DD
[parse(date_text).strftime('%Y-%m-%d') for date_text in ser_datestr]

['2010-01-04', '2011-02-04', '2012-03-04']

In [10]:
# Solution 2: put the day number in front of every string, then parse the result
('04 ' + ser).map(parse)

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

### 24. From ser, extract words that contain at least 2 vowels.

In [12]:
# Words to be filtered by vowel count
words = ['Apple', 'Orange', 'Plan', 'Python', 'Money']
ser = pd.Series(words)

In [16]:
from collections import Counter


def _has_two_vowels(word):
    """Return True when `word` holds at least two of a/e/i/o/u, ignoring case."""
    tally = Counter(word.lower())
    # Counter yields 0 for characters that never occurred
    return sum(tally[vowel] for vowel in 'aeiou') >= 2


mask = ser.map(_has_two_vowels)
ser.loc[mask]

0     Apple
1    Orange
4     Money
dtype: object

### 25. Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference.

In [17]:
# One free-text entry followed by three address-like entries
candidates = [
    'buying books at amazom.com',
    'rameses@egypt.com',
    'matt@t.co',
    'narendra@modi.com',
]
emails = pd.Series(candidates)

In [18]:
# Solution 1 (as series of strings)
import re

pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
email_re = re.compile(pattern)

# match() only tests from the beginning of each string; keep the rows where it succeeds
mask = emails.map(lambda text: email_re.match(text) is not None)
emails.loc[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [19]:
# Solution 2 (as series of list)
# For every element, collect all substrings matching the pattern (case-insensitively)
emails.str.findall(pat=pattern, flags=re.I)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

In [23]:
# Solution 3 (as list)
# Lazily find all matches per element, then keep the first match wherever any exist
matches_per_email = (re.findall(pattern, email) for email in emails)
[found[0] for found in matches_per_email if found]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

### 26. Compute the mean of weights of each fruit.

In [28]:
# Draw the labels from a seeded generator so the sample is identical on every run
# (the unseeded draw made the results below change with each execution).
rng = np.random.default_rng(42)
fruit = pd.Series(rng.choice(['apple', 'banana', 'carrot'], 10))

# Ten evenly spaced weights from 1 to 10, one per label
weights = pd.Series(np.linspace(1, 10, 10))

In [29]:
# Inspect the randomly sampled labels
fruit

0    banana
1    carrot
2    banana
3    banana
4    banana
5    banana
6    carrot
7     apple
8    carrot
9     apple
dtype: object

In [30]:
# Inspect the weights (ten evenly spaced values from 1 to 10)
weights

0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     6.0
6     7.0
7     8.0
8     9.0
9    10.0
dtype: float64

In [31]:
# Solution: use the fruit labels as the grouping key for the weights, then average each group
weights.groupby(by=fruit).mean()

apple     9.0
banana    3.8
carrot    6.0
dtype: float64

### 27. Compute the euclidean distance between series (points) p and q, without using a packaged formula.

In [33]:
# p counts up from 1 to 10, q counts down from 10 to 1
p = pd.Series(range(1, 11))
q = pd.Series(range(10, 0, -1))

In [34]:
# Square root of the summed squared coordinate differences
squared_gap = (p - q) ** 2
squared_gap.sum() ** 0.5

18.16590212458495

In [35]:
# Cross-check: norm of the difference vector, computed by numpy
np.linalg.norm(p.sub(q))

18.16590212458495

### 28. Get the positions of peaks (values surrounded by smaller values on both sides) in ser.

In [36]:
# Sample values for the peak search
values = [2, 10, 3, 4, 9, 10, 2, 7, 3]
ser = pd.Series(values)

In [46]:
# Step 1: direction of each consecutive change (+1 rising, -1 falling, 0 flat)
np.sign(np.diff(ser, n=1))

array([ 1, -1,  1,  1,  1, -1,  1, -1])

In [47]:
# Step 2: change in direction between neighbouring steps; -2 means a rise followed by a fall
np.diff(np.sign(np.diff(ser, n=1)), n=1)

array([-2,  2,  0,  0, -2,  2, -2])

In [37]:
# Direction of every consecutive step, then how that direction changes
slope_sign = np.sign(np.diff(ser))
dd = np.diff(slope_sign)

# A value of -2 is a switch from +1 (rising) to -1 (falling).
# Adding 1 converts the index in dd to the position of the peak in ser.
peak_locs = np.flatnonzero(dd == -2) + 1
peak_locs

array([1, 5, 7], dtype=int64)

### 29. Replace the spaces in my_str with the least frequent character.

In [43]:
# Input text whose spaces are to be replaced
my_str = 'dbc deb abed gade'

In [44]:
# Work on the characters of my_str (rather than a second copy of the literal)
ser = pd.Series(list(my_str))

# Occurrences of every character, most frequent first
freq = ser.value_counts()
print(freq)

# Leave the space out of the candidates: otherwise, when the space is itself
# the rarest character, spaces would be "replaced" by spaces.
# value_counts sorts in descending order, so the last label is a least frequent one.
candidates = freq[freq.index != ' ']
least_freq = candidates.index[-1]

"".join(ser.replace(' ', least_freq))

d    4
b    3
     3
e    3
a    2
c    1
g    1
dtype: int64
d    4
b    3
     3
e    3
a    2
c    1
g    1
dtype: int64


'dbcgdebgabedggade'

### 30. How to create a TimeSeries of 10 consecutive weekend days (Saturdays) starting at ‘2000-01-01’, with random numbers as values?

In [45]:
# Seeded generator so the random values are identical on every run
rng = np.random.default_rng(42)

# Ten weekly dates anchored on Saturdays, beginning at 2000-01-01
saturdays = pd.date_range('2000-01-01', periods=10, freq='W-SAT')

# Random integers from 1 to 9 (upper bound 10 is exclusive), indexed by those dates
ser = pd.Series(rng.integers(1, 10, 10), index=saturdays)
ser

2000-01-01    2
2000-01-08    3
2000-01-15    2
2000-01-22    6
2000-01-29    4
2000-02-05    7
2000-02-12    5
2000-02-19    1
2000-02-26    9
2000-03-04    8
Freq: W-SAT, dtype: int32