#### 1. Import pandas and check the version

In [1]:
import numpy as np
import pandas as pd
pd.__version__

'1.4.2'

#### 2. Create a pandas series from each of the items below: a list, numpy and a dictionary

In [2]:
a = np.array([1,2,3])
pd.Series(a,index=["a","b","c"])

a    1
b    2
c    3
dtype: int32

In [3]:
a = ['a','b','c']
pd.Series(a,index=[1,2,3])

1    a
2    b
3    c
dtype: object

In [4]:
a = [{"name":"Amir","age":19},{"name":"Khagani", "age":20}]
a
pd.Series(a,index=["man1","man2"])

man1       {'name': 'Amir', 'age': 19}
man2    {'name': 'Khagani', 'age': 20}
dtype: object

#### 3. Convert the series ser into a dataframe with its index as another column on the dataframe

In [5]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)
pd.DataFrame(ser).T

Unnamed: 0,a,b,c,e,d,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25


#### 4. Combine ser1 and ser2 to form a dataframe

In [6]:
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
pd.DataFrame(ser1,index=ser2).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,a,b,c,e,d,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z


#### 5. Give a name to the series ser calling it ‘alphabets’

In [7]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'),name='alphabets')
ser

0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
Name: alphabets, dtype: object

#### 6. From ser1 remove items present in ser2

In [8]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
a = pd.Series(list(set(ser1).intersection(set(ser2))))
res = ser1[~np.isin(ser1, ser2)]
print(list(res))

[1, 2, 3]


#### 7. Get all items of ser1 and ser2 not common to both

In [9]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
union = np.union1d(ser1,ser2)
intersection = np.intersect1d(ser1,ser2)
subtract = np.isin(union,intersection)
res = union[~subtract]
res

array([1, 2, 3, 6, 7, 8], dtype=int64)

#### 8. Compute the minimum, 25th percentile, median, 75th, and maximum of ser

In [10]:
ser = pd.Series(np.random.normal(10, 5, 25))
print(ser.min(),np.percentile(ser, 25),np.median(ser),np.percentile(ser, 75),ser.max(),sep="\n")

-4.144092594500041
6.131894593971666
9.241186866135044
12.02974611813686
17.346762287764868


#### 9. Calculte the frequency counts of each unique value ser

In [11]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
counts = ser.value_counts().to_dict()
counts

{'c': 7, 'e': 6, 'a': 4, 'g': 4, 'b': 3, 'f': 3, 'd': 2, 'h': 1}

#### 10. From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’

In [12]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

0         3
1         1
2         1
3         1
4         1
5         1
6         3
7         3
8     Other
9         3
10    Other
11        3
dtype: object

#### 11. Bin the series ser into 10 equal deciles and replace the values with the bin name

In [13]:
ser = pd.Series(np.random.random(20))
pd.qcut(ser, q=10)

0                    (0.289, 0.331]
1                    (0.626, 0.857]
2                    (0.586, 0.626]
3                    (0.586, 0.626]
4                    (0.163, 0.258]
5     (0.013600000000000001, 0.163]
6                    (0.466, 0.543]
7                    (0.163, 0.258]
8                    (0.857, 0.919]
9                    (0.331, 0.466]
10    (0.013600000000000001, 0.163]
11                   (0.543, 0.586]
12                   (0.258, 0.289]
13                   (0.258, 0.289]
14                   (0.466, 0.543]
15                   (0.289, 0.331]
16                   (0.543, 0.586]
17                   (0.331, 0.466]
18                   (0.857, 0.919]
19                   (0.626, 0.857]
dtype: category
Categories (10, interval[float64, right]): [(0.013600000000000001, 0.163] < (0.163, 0.258] < (0.258, 0.289] < (0.289, 0.331] ... (0.543, 0.586] < (0.586, 0.626] < (0.626, 0.857] < (0.857, 0.919]]

#### 12. Reshape the series ser into a dataframe with 7 rows and 5 columns

In [14]:
ser = pd.Series(np.random.randint(1, 10, 35))
pd.DataFrame(ser.values.reshape(7,5))

Unnamed: 0,0,1,2,3,4
0,5,5,9,9,2
1,1,5,1,7,6
2,5,5,9,5,8
3,8,6,6,7,8
4,1,1,7,6,6
5,8,9,5,4,5
6,8,3,9,5,8


#### 13. Find the positions of numbers that are multiples of 3 from ser

In [15]:
# 1st way
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)
[i  for i in ser.index if ser[i]%3==0]

0    4
1    7
2    4
3    3
4    2
5    1
6    7
dtype: int32


[3]

In [16]:
# 2nd way
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)
np.where(ser%3==0)

0    6
1    7
2    1
3    5
4    8
5    6
6    9
dtype: int32


(array([0, 5, 6], dtype=int64),)

#### 14. From ser, extract the items at positions in list pos

In [17]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
# 1st way ser.take(pos)
np.take(ser,pos) # 2nd way

0     a
4     e
8     i
14    o
20    u
dtype: object

#### 15. Stack ser1 and ser2 vertically and horizontally (to form a dataframe)

In [18]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
# print(ser1,ser2,sep='\n')
# print(pd.DataFrame([ser1,ser2],columns=ser1),pd.DataFrame([ser1,ser2],columns=ser1).T,sep='\n')
print(pd.DataFrame(ser2, ser1),pd.DataFrame(ser2,ser1).T,sep='\n')

   0
0  a
1  b
2  c
3  d
4  e
   0  1  2  3  4
0  a  b  c  d  e


#### 16. Get the positions of items of ser2 in ser1 as a list

In [19]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

#### 17. Compute the mean squared error of truth and pred series

In [20]:
from sklearn.metrics import mean_squared_error
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
mean_squared_error(pred, truth)

0.5507420866719686

#### 18. Change the first character of each word to upper case in each word of ser

In [21]:
ser = pd.Series(['how', 'to', 'knock', 'down?'])
ser.str.capitalize()

0      How
1       To
2    Knock
3    Down?
dtype: object

#### 19. Calculate the number of characters in each word in a series

In [22]:
ser = pd.Series(['how', 'to', 'knock', 'down?'])

#### 20. Compute difference of differences between consecutive numbers of ser

In [23]:
# 1st way
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
def differences(array):
    return pd.Series([array[i]-array[i-1] for i in array.index if(i>=1)])
differences(differences(ser))   

0    1
1    1
2    1
3    1
4    0
5    2
dtype: int64

In [24]:
# 2nd way
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
ser.diff().diff().dropna()

2    1.0
3    1.0
4    1.0
5    1.0
6    0.0
7    2.0
dtype: float64

#### 21. Convert a series of date-strings to a timeseries

In [25]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

#### 22. Get the day of month, week number, day of year and day of week from ser

In [26]:
from dateutil.parser import parse
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser = ser.map(lambda x: parse(x))
# ser.dt.day
# ser.dt.dayofyear
# ser.dt.isocalendar().week
ser.dt.day_name()

0       Friday
1    Wednesday
2     Saturday
3     Thursday
4       Monday
5     Saturday
dtype: object

#### 23. Change ser to dates that start with 4th of the respective months

In [27]:
import datetime
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
times = pd.Index(pd.to_datetime(ser))
times += datetime.timedelta(days=3)
times

DatetimeIndex(['2010-01-04', '2011-02-04', '2012-03-04'], dtype='datetime64[ns]', freq=None)

#### 24. From ser, extract words that contain atleast 2 vowels

In [28]:
from collections import Counter 
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

#### 25. Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference

In [29]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
emails[emails.str.match(pattern)]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

#### 26. Compute the mean of weights of each fruit

In [30]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
df = pd.DataFrame([pd.Index(weights)],columns=fruit)
df.groupby(df.columns, axis=1).mean()

Unnamed: 0,apple,banana,carrot
0,8.5,4.714286,5.0


#### 27. Compute the euclidean distance between series (points) p and q, without using a packaged formula

In [31]:
# 1st way
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
np.sqrt((((np.array(q)-np.array(p))**2).sum()))

18.16590212458495

In [32]:
# 2nd way
dist = np.sqrt(np.sum([(a-b)*(a-b) for a, b in zip(p, q)]))
dist

18.16590212458495

In [33]:
# 3rd way
p1 = np.sum([(a * a) for a in p])
p2 = np.sum([(b * b) for b in q])
p3 = -1 * np.sum([(2 * a*b) for (a, b) in zip(p, q)])
dist = np.sqrt(np.sum(p1 + p2 + p3))
dist

18.16590212458495

#### 28. Get the positions of peaks (values surrounded by smaller values on both sides) in ser

In [34]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7], dtype=int64)

#### 29. Replace the spaces in my_str with the least frequent character

In [35]:
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

'dbcgdebgabedggade'

#### 30. Create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values

In [36]:
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    9
2000-01-08    2
2000-01-15    8
2000-01-22    9
2000-01-29    9
2000-02-05    2
2000-02-12    7
2000-02-19    3
2000-02-26    9
2000-03-04    6
Freq: W-SAT, dtype: int32