# 1. How to import pandas and check the version?

In [1]:
import pandas as pd
import numpy as np


In [2]:
pd.show_versions(as_json=True)

{'system': {'commit': None, 'python': '3.7.3.final.0', 'python-bits': 64, 'OS': 'Darwin', 'OS-release': '18.2.0', 'machine': 'x86_64', 'processor': 'i386', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'en_GB.UTF-8', 'LOCALE': 'en_GB.UTF-8'}, 'dependencies': {'pandas': '0.24.2', 'pytest': '4.6.2', 'pip': '19.1.1', 'setuptools': '41.0.1', 'Cython': '0.29.10', 'numpy': '1.16.4', 'scipy': '1.2.1', 'pyarrow': None, 'xarray': None, 'IPython': '7.5.0', 'sphinx': '2.1.0', 'patsy': '0.5.1', 'dateutil': '2.8.0', 'pytz': '2019.1', 'blosc': None, 'bottleneck': '1.2.1', 'tables': '3.5.2', 'numexpr': '2.6.9', 'feather': None, 'matplotlib': '3.1.0', 'openpyxl': '2.6.2', 'xlrd': '1.2.0', 'xlwt': '1.3.0', 'xlsxwriter': '1.1.8', 'lxml.etree': '4.3.3', 'bs4': '4.7.1', 'html5lib': '1.0.1', 'sqlalchemy': '1.3.4', 'pymysql': None, 'psycopg2': None, 'jinja2': '2.10.1', 's3fs': None, 'fastparquet': None, 'pandas_gbq': None, 'pandas_datareader': '0.7.0', 'gcsfs': None}}


# How to create a series from a list, numpy array and dict?


In [3]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

In [4]:
ser_mylist = pd.Series(mylist)
ser_myarr = pd.Series(myarr)
ser_mydict = pd.Series(mydict)
ser_mydict.head(3)

a    0
b    1
c    2
dtype: int64

# How to convert the index of a series into a column?

### Convert the series ser into a dataframe with its index as another column on the dataframe.

In [5]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

In [6]:
ser.head()

a    0
b    1
c    2
e    3
d    4
dtype: int64

In [7]:
# Convert series to dataframe and then reset the index by chaining methods
df = ser.to_frame().reset_index()
df.head()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


# How to combine many series to form a dataframe?

In [8]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Pass the two series as a dictionary with column names as keys
df = pd.DataFrame({"Series 1": ser1, "Series 2" : ser2})
df.head(3)

Unnamed: 0,Series 1,Series 2
0,a,0
1,b,1
2,c,2


# 5. How to assign name to the series’ index?

Give a name to the series ser calling it ‘alphabets’.

In [9]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

In [10]:
ser.name = "alphabets"
ser.name

'alphabets'

# 6. How to get the items of series A not present in series B?

From ser1 remove items present in ser2.


In [11]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [12]:
# ~ser1.isin(ser2) works as a mask / boolean. ~ means it will only retrieve values that return false, i.e not present
# when using the .isin() method
ser1 = ser1[~ser1.isin(ser2)]
ser1

0    1
1    2
2    3
dtype: int64

# 7. How to get the items not common to both series A and series B?
### Difficulty Level: L2

Get all items of ser1 and ser2 not common to both.

In [314]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [315]:
# Create a third series that removes all values that are not present in series two
ser3 = ser1[~ser1.isin(ser2)]
ser3

0    1
1    2
2    3
dtype: int64

In [316]:
# Create a fourth series that removes all values for ser 2 that are not present in series 1
ser4 = ser2[~ser2.isin(ser1)]
ser4

2    6
3    7
4    8
dtype: int64

In [317]:
# Append the lists together to get all the uncommon values from each list
ser3 = ser3.append(ser4)
ser3.values

array([1, 2, 3, 6, 7, 8])

In [17]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [18]:
# Alternative solution

# Join the two series in 1 dimension
ser_u = pd.Series(np.union1d(ser1, ser2))  # union

# Find the values that are present i.e intersect in both series and save them to a third series
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect

# Filter original series removing values that are intersecting values
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

# 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

Difficuty Level: L2

Compute the minimum, 25th percentile, median, 75th, and maximum of ser.

In [19]:
# Many different methods available for this
ser = pd.Series(np.random.normal(10, 5, 25))

In [20]:
ser.describe()

count    25.000000
mean     10.674539
std       4.568551
min       2.712828
25%       8.231374
50%      10.933799
75%      12.198995
max      21.389436
dtype: float64

In [21]:
ser.min()

2.7128279956912573

In [22]:
ser.quantile(0.25)

8.23137382217076

In [23]:
ser.quantile(0.75)

12.198994501925558

In [24]:
ser.max()

21.389435609367894

In [25]:
ser.quantile(0.5)

10.933798925904615

In [26]:
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 2.712828  ,  8.23137382, 10.93379893, 12.1989945 , 21.38943561])

# 9. How to get frequency counts of unique items of a series?
Difficulty Level: L1

Calculte the frequency counts of each unique value ser.



In [318]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser.head()

0    e
1    b
2    f
3    h
4    a
dtype: object

In [28]:
# use .value_counts() method to total each unique value
ser.value_counts()

b    6
g    5
h    5
e    5
d    3
a    3
c    2
f    1
dtype: int64

# 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?
Difficulty Level: L2

From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’.



In [323]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

In [324]:
ser.head(3)

0    2
1    3
2    3
dtype: int64

In [325]:
ser.value_counts()

4    4
3    4
2    4
dtype: int64

In [326]:
# Slice the two top values
ser.value_counts().index[:2]

Int64Index([4, 3], dtype='int64')

In [327]:
# Filter series by two most occuring valeus
ser[ser.value_counts().index[:2]]

4    4
3    3
dtype: int64

In [328]:
# Change values that are not present in the slice to "Other"
ser[~ser.isin(ser.value_counts().index[:2])] = "Other"

In [329]:
ser.head()

0    Other
1        3
2        3
3        3
4        4
dtype: object

# 11. How to bin a numeric series to 10 groups of equal size?
Difficulty Level: L2

Bin the series ser into 10 equal deciles and replace the values with the bin name.

In [36]:
ser = pd.Series(np.random.random(20))

In [37]:
ser.head()

0    0.112834
1    0.771923
2    0.105996
3    0.328869
4    0.163773
dtype: float64

In [38]:
# use q.cut method to cut series into deciles, then provide labels for each one
pd.qcut(ser, q=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
       labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0    3rd
1    8th
2    2nd
3    4th
4    4th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

# 12. How to convert a numpy array to a dataframe of given shape? (L1)
Difficulty Level: L1

Reshape the series ser into a dataframe with 7 rows and 5 columns

In [39]:
ser = pd.Series(np.random.randint(1, 10, 35))

In [40]:
ser.values

array([3, 2, 7, 9, 3, 2, 9, 8, 1, 4, 5, 6, 4, 7, 2, 4, 8, 3, 7, 4, 2, 8,
       4, 4, 7, 3, 3, 5, 2, 3, 3, 7, 6, 7, 7])

In [41]:
# Produce series values to an array by accessing .values attribute.
# Then use .reshape() method supplying number of rows by number columns
df = pd.DataFrame(ser.values.reshape(7,5))

print(df)

   0  1  2  3  4
0  3  2  7  9  3
1  2  9  8  1  4
2  5  6  4  7  2
3  4  8  3  7  4
4  2  8  4  4  7
5  3  3  5  2  3
6  3  7  6  7  7


# 13. How to find the positions of numbers that are multiples of 3 from a series?
Difficulty Level: L2

Find the positions of numbers that are multiples of 3 from ser.



In [330]:
ser = pd.Series(np.random.randint(1, 10, 7))

In [331]:
ser

0    9
1    2
2    4
3    4
4    2
5    1
6    8
dtype: int64

In [336]:
# Create a boolean with modulo and filter series
ser[ser % 3 == 0]

0    9
dtype: int64

In [338]:
# Extract the exact number by accessing values attribute and then supply the key
ser[ser % 3 == 0].values[0]

9

# 14. How to extract items at given positions from a series
Difficulty Level: L1

From ser, extract the items at positions in list pos.

In [341]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

In [342]:
# Both methods work, values attribute can also be accessed
ser.get(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

In [343]:
ser.take(pos).values

array(['a', 'e', 'i', 'o', 'u'], dtype=object)

# 15. How to stack two series vertically and horizontally ?
Difficulty Level: L1

Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

In [349]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [350]:
# Concatenate the series and choose columns as the axis to do so i.e axis = 1
df = pd.concat([ser1, ser2], axis=1)
df

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


# 16. How to get the positions of items of series A in another series B?
Difficulty Level: L2

Get the positions of items of ser2 in ser1 as a list.

In [53]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [54]:
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect

In [55]:
ser_i

0     1
1     3
2    10
3    13
dtype: int64

In [56]:
ser_i.values

array([ 1,  3, 10, 13])

In [57]:
# Filter ser1 by values that intersect both series i.e ser_i
ser1[ser1.isin(ser_i)]

0    10
4     3
5     1
8    13
dtype: int64

In [58]:
ser1[ser1.isin(ser_i)].index

Int64Index([0, 4, 5, 8], dtype='int64')

In [59]:
ser1[ser1.isin(ser_i)].index.tolist()

[0, 4, 5, 8]

In [60]:
# Alternate solutions, difficult one here
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

[5, 4, 0, 8]

In [61]:
# locate the index of values in ser1 that are equal to the values in ser2
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

# 17. How to compute the mean squared error on a truth and predicted series?
Difficulty Level: L2

Compute the mean squared error of truth and pred series.

In [62]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

In [351]:
truth.head()

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [352]:
pred.head()

0    0.542031
1    1.336083
2    2.532167
3    3.758965
4    4.782521
dtype: float64

In [65]:
mse = np.mean((truth - pred)**2)
mse

0.2974813350313152

# 18. How to convert the first character of each element in a series to uppercase?
Difficulty Level: L2

Change the first character of each word to upper case in each word of ser.



In [66]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [67]:
# Use the .str accessor to then use the .title() method to capitalize all the strings
ser.str.title()

0     How
1      To
2    Kick
3    Ass?
dtype: object

# 19. How to calculate the number of characters in each word in a series?
Difficulty Level: L2

In [68]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [69]:
# Use the .str accessor to then use the .len() method to measure length of all the strings
ser.str.len()

0    3
1    2
2    4
3    4
dtype: int64

# 20. How to compute difference of differences between consecutive numbers of a series?
Difficulty Level: L1

Difference of differences between the consecutive numbers of ser.

In [70]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [71]:
# Use numpy .diff() method to find difference between consecutive numbers
np.diff(ser)

array([2, 3, 4, 5, 6, 6, 8])

In [72]:
# Use numpy .diff() method and pass argument for n, i.e 2 is difference of initial differences
np.diff(ser, n = 2)
np.diff(ser, n = 2).tolist()

[1, 1, 1, 1, 0, 2]

# 21. How to convert a series of date-strings to a timeseries?
Difficiulty Level: L2

In [73]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [74]:
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

# 22. How to get the day of month, week number, day of year and day of week from a series of date strings?
Difficiulty Level: L2

Get the day of month, week number, day of year and day of week from ser.

In [75]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [76]:
ser = pd.to_datetime(ser)
ser

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [77]:
ser.dt.day.tolist()

[1, 2, 3, 4, 5, 6]

In [78]:
ser.dt.week.tolist()

[53, 5, 9, 14, 19, 23]

In [79]:
ser.dt.year.tolist()

[2010, 2011, 2012, 2013, 2014, 2015]

In [80]:
ser.dt.day_name().tolist()

['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']

# 23. How to convert year-month string to dates corresponding to the 4th day of the month?
Difficiulty Level: L2

Change ser to dates that start with 4th of the respective months.

In [81]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

In [82]:
pd.to_datetime(ser)

0   2010-01-01
1   2011-02-01
2   2012-03-01
dtype: datetime64[ns]

In [83]:
ser = pd.to_datetime(ser)

In [84]:
# Use anonymous lambda function and dt accessor to replace the day with 4
ser = ser.apply(lambda dt: dt.replace(day=4))

In [85]:
ser

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

# 24. How to filter words that contain at least 2 vowels from a series?
Difficiulty Level: L3

From ser, extract words that contain atleast 2 vowels.

In [86]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [87]:
# import Counter function
from collections import Counter
# Use .map() method
# Create a filter. Convert all characters in each item in ser to lowercase.
# Counter creates a dictionary of all values and how many vowels they contain, and this is then totalled with sum()
# Create a boolean that checks if the sum is greater than or equal to 2
# Apply this filter to ser

mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

# 25. How to filter valid emails from a series?
Difficiulty Level: L3

Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference.

In [88]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [89]:
df = pd.DataFrame(emails, columns = ["Emails"])

In [90]:
df

Unnamed: 0,Emails
0,buying books at amazom.com
1,rameses@egypt.com
2,matt@t.co
3,narendra@modi.com


In [91]:
df["Emails"].str.contains(pattern)

0    False
1     True
2     True
3     True
Name: Emails, dtype: bool

In [92]:
mask = df["Emails"].str.contains(pattern)

In [93]:
df[mask]

Unnamed: 0,Emails
1,rameses@egypt.com
2,matt@t.co
3,narendra@modi.com


In [94]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [95]:
emails.str.contains(pattern)

0    False
1     True
2     True
3     True
dtype: bool

In [96]:
emails[emails.str.contains(pattern)]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [97]:
emails[emails.str.contains(pattern)].tolist()

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

# 26. How to get the mean of a series grouped by another series?
Difficiulty Level: L2

Compute the mean of weights of each fruit.

In [98]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['carrot', 'banana', 'carrot', 'carrot', 'banana', 'banana', 'banana', 'apple', 'banana', 'carrot']


In [99]:
data = pd.DataFrame({'Fruit': fruit,'Weights': weights})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
Fruit      10 non-null object
Weights    10 non-null float64
dtypes: float64(1), object(1)
memory usage: 240.0+ bytes


In [100]:
data["Fruit"].astype("category")

0    carrot
1    banana
2    carrot
3    carrot
4    banana
5    banana
6    banana
7     apple
8    banana
9    carrot
Name: Fruit, dtype: category
Categories (3, object): [apple, banana, carrot]

In [101]:
data

Unnamed: 0,Fruit,Weights
0,carrot,1.0
1,banana,2.0
2,carrot,3.0
3,carrot,4.0
4,banana,5.0
5,banana,6.0
6,banana,7.0
7,apple,8.0
8,banana,9.0
9,carrot,10.0


In [102]:
totals = data.groupby("Fruit")
totals["Weights"].mean()


Fruit
apple     8.0
banana    5.8
carrot    4.5
Name: Weights, dtype: float64

In [103]:
# Other solution, quicker
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
weights.groupby(fruit).mean()

apple     4.4
banana    7.5
carrot    3.0
dtype: float64

# 27. How to compute the euclidean distance between two series?
Difficiulty Level: L2

Compute the euclidean distance between series (points) p and q, without using a packaged formula.



In [104]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [105]:
# Works like pythagoras but in this case it goes up to 10 dimensions. Square root it at the end
euc = (((p - q)**2).sum())** 0.5

In [106]:
euc

18.16590212458495

In [107]:
sum((p-q)**2)**0.5

18.16590212458495

# 28. How to find all the local maxima (or peaks) in a numeric series?
Difficiulty Level: L3

Get the positions of peaks (values surrounded by smaller values on both sides) in ser.



In [108]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

In [109]:
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7])

In [110]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

In [111]:
# gets the difference between each value in series
np.diff(ser)

array([ 8, -7,  1,  5,  1, -8,  5, -4])

In [112]:
# The sign function returns -1 if x < 0, 0 if x==0, 1 if x > 0. nan is returned for nan inputs.
np.sign(np.diff(ser))

array([ 1, -1,  1,  1,  1, -1,  1, -1])

In [113]:
# Finds the difference between each value again.
np.diff(np.sign(np.diff(ser)))

array([-2,  2,  0,  0, -2,  2, -2])

In [114]:
# Locate -2 and find index in array as this is an indicator of a peak
np.where(dd == -2)

(array([0, 4, 6]),)

In [115]:
# Increase index by 1 as arrays have decresed in size by one, first index cannot be a peak as it does not 
# have values both sides of it
np.where(dd == -2)[0] + 1

array([1, 5, 7])

In [116]:
# Filter series with this condition
ser[np.where(dd == -2)[0] + 1]

1    10
5    10
7     7
dtype: int64

# 29. How to replace missing spaces in a string with the least frequent character?

In [117]:
my_str = 'dbc deb abed gade'
c = Counter(my_str)
c

Counter({'d': 4, 'b': 3, 'c': 1, ' ': 3, 'e': 3, 'a': 2, 'g': 1})

In [118]:
# Both c and g are least common but as a dictionary is unordered either could be returned
least_common = c.most_common()[-1]
least_common

('g', 1)

In [119]:
least_common[0][0]

'g'

In [120]:
char = least_common[0][0]
my_str.replace(" ", char)

'dbcgdebgabedggade'

In [121]:
# Alternative solution
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64


'dbccdebcabedcgade'

# 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?
Difficiulty Level: L2

In [122]:
# Create a series with a random array as the data, and generate the timeseries as the index
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    3
2000-01-08    3
2000-01-15    6
2000-01-22    5
2000-01-29    5
2000-02-05    6
2000-02-12    3
2000-02-19    3
2000-02-26    2
2000-03-04    4
Freq: W-SAT, dtype: int64

# 31. How to fill an intermittent time series so all missing dates show up with values of previous non-missing date?
Difficiulty Level: L2

ser has missing dates and values. Make all missing dates appear and fill up with value from previous date.



In [123]:
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64


In [124]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html
ser.resample('D').ffill()  # fill with previous value

2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64

# 32. How to compute the autocorrelations of a numeric series?
Difficiulty Level: L3

Compute autocorrelations for the first 10 lags of ser. Find out which lag has the largest correlation.

In [125]:
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))

In [126]:
ser

0    -11.391143
1      1.736310
2     16.596568
3     17.627690
4     17.083819
5     15.538603
6      0.158542
7     17.888381
8     17.780808
9     25.735943
10    -4.021192
11     8.924403
12    10.317589
13     9.479998
14    28.763603
15     2.393728
16    10.808853
17    16.058113
18    11.221448
19    19.821017
dtype: float64

In [127]:
autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]
print(autocorrelations[1:])
print('Lag having highest correlation: ', np.argmax(np.abs(autocorrelations[1:]))+1)

[-0.06, -0.18, -0.27, -0.2, 0.47, 0.16, -0.2, -0.51, -0.11, 0.63]
Lag having highest correlation:  10


# 33. How to import only every nth row from a csv file to create a dataframe?
Difficiulty Level: L2

Import every 50th row of BostonHousing dataset as a dataframe.

In [353]:
import csv          
with open('BostonHousing.csv', 'r') as f:
    reader = csv.reader(f)
    out = []
    # for index and row in function altered csv
    for i, row in enumerate(reader):
        # if the index is a multiple of 50
        if i%50 == 0:
            # append that sepcific row to the list "out"
            out.append(row)
# Use from second row appended to list as data and the first row appended as column headers
df2 = pd.DataFrame(out[1:], columns=out[0])
print(df2.head())

FileNotFoundError: [Errno 2] No such file or directory: 'BostonHousing.csv'

# 34. How to change column values when importing csv to a dataframe?
Difficulty Level: L2

Import the boston housing dataset, but while importing change the 'medv' (median house value) column so that values < 25 becomes ‘Low’ and > 25 becomes ‘High’.

In [355]:
# Solution 1: Using converter parameter
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})

# Use converters parameter to pass through condition that will change value dependant on said condition
df.head(3)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High


In [None]:
# Solution 2: Using csv reader
import csv
with open('BostonHousing.csv', 'r') as f:
    reader = csv.reader(f)
    out = []
    for i, row in enumerate(reader):
        if i > 0:
            # row[13] is the location of the medv value. Convert to float so data is read correctly
            row[13] = 'High' if float(row[13]) > 25 else 'Low'
        out.append(row)

        # Use from second row appended to list as data and the first row appended as column headers
df = pd.DataFrame(out[1:], columns=out[0])
print(df.head())

# 35. How to create a dataframe with rows as strides from a given series?
Difficiulty Level: L3

In [None]:
L = pd.Series(range(15))

# 37. How to get the nrows, ncolumns, datatype, summary stats of each column of a dataframe? Also get the array and list equivalent.
Difficulty Level: L2

Get the number of rows, columns, datatype and summary statistics of each column of the Cars93 dataset. Also get the numpy array and list equivalent of the dataframe.

In [356]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', usecols=['crim', 'medv'])

In [357]:
df.head(3)

Unnamed: 0,crim,medv
0,0.00632,24.0
1,0.02731,21.6
2,0.02729,34.7


In [358]:
#  number of rows and columns
print(df.shape)

(506, 2)


In [359]:
# datatypes
print(df.dtypes)

crim    float64
medv    float64
dtype: object


In [360]:
# how many columns under each dtype
print(df.get_dtype_counts())

float64    2
dtype: int64


In [361]:
print(df.dtypes.value_counts())


float64    2
dtype: int64


In [362]:
# summary statistics
df_stats = df.describe()
df_stats

Unnamed: 0,crim,medv
count,506.0,506.0
mean,3.613524,22.532806
std,8.601545,9.197104
min,0.00632,5.0
25%,0.082045,17.025
50%,0.25651,21.2
75%,3.677082,25.0
max,88.9762,50.0


In [363]:
# numpy array 
df_arr = df.values
df_arr

array([[6.3200e-03, 2.4000e+01],
       [2.7310e-02, 2.1600e+01],
       [2.7290e-02, 3.4700e+01],
       ...,
       [6.0760e-02, 2.3900e+01],
       [1.0959e-01, 2.2000e+01],
       [4.7410e-02, 1.1900e+01]])

In [364]:
# list
df_list = df.values.tolist()
df_list

[[0.00632, 24.0],
 [0.02731, 21.6],
 [0.02729, 34.7],
 [0.032369999999999996, 33.4],
 [0.06905, 36.2],
 [0.02985, 28.7],
 [0.08829, 22.9],
 [0.14455, 27.1],
 [0.21124, 16.5],
 [0.17004, 18.9],
 [0.22489, 15.0],
 [0.11747, 18.9],
 [0.09378, 21.7],
 [0.62976, 20.4],
 [0.6379600000000001, 18.2],
 [0.62739, 19.9],
 [1.05393, 23.1],
 [0.7842, 17.5],
 [0.80271, 20.2],
 [0.7258, 18.2],
 [1.25179, 13.6],
 [0.8520399999999999, 19.6],
 [1.2324700000000002, 15.2],
 [0.9884299999999999, 14.5],
 [0.75026, 15.6],
 [0.84054, 13.9],
 [0.67191, 16.6],
 [0.9557700000000001, 14.8],
 [0.77299, 18.4],
 [1.00245, 21.0],
 [1.13081, 12.7],
 [1.3547200000000001, 14.5],
 [1.38799, 13.2],
 [1.15172, 13.1],
 [1.6128200000000001, 13.5],
 [0.06417, 18.9],
 [0.09744, 20.0],
 [0.08014, 21.0],
 [0.17505, 24.7],
 [0.027630000000000002, 30.8],
 [0.033589999999999995, 34.9],
 [0.12744, 26.6],
 [0.1415, 25.3],
 [0.15936, 24.7],
 [0.12269000000000001, 21.2],
 [0.17142000000000002, 19.3],
 [0.18836, 20.0],
 [0.2292699999999

# 38. How to extract the row and column number of a particular cell with given criterion?

In [365]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i


### Which manufacturer, model and type has the highest Price? What is the row and column number of the cell with the highest Price value?

In [366]:
df["Price"].max()

61.9

In [367]:
# Create a boolean as a filter
mask = df["Price"] == df["Price"].max()

In [368]:
# Use .loc() method to with the filter and extract the particular columns
df.loc[mask, ["Manufacturer", "Model", "Type"]]

Unnamed: 0,Manufacturer,Model,Type
58,Mercedes-Benz,300E,Midsize


# 39. How to rename a specific columns in a dataframe?
Difficulty Level: L2

Rename the column Type as CarType in df and replace the ‘.’ in column names with ‘_’.

In [369]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
print(df.columns)

Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')


In [370]:
old = df.columns.tolist()

In [371]:
new = [o.replace('.', '_') for o in old]

In [372]:
dictionary = dict(zip(old, new))
dictionary

{'Manufacturer': 'Manufacturer',
 'Model': 'Model',
 'Type': 'Type',
 'Min.Price': 'Min_Price',
 'Price': 'Price',
 'Max.Price': 'Max_Price',
 'MPG.city': 'MPG_city',
 'MPG.highway': 'MPG_highway',
 'AirBags': 'AirBags',
 'DriveTrain': 'DriveTrain',
 'Cylinders': 'Cylinders',
 'EngineSize': 'EngineSize',
 'Horsepower': 'Horsepower',
 'RPM': 'RPM',
 'Rev.per.mile': 'Rev_per_mile',
 'Man.trans.avail': 'Man_trans_avail',
 'Fuel.tank.capacity': 'Fuel_tank_capacity',
 'Passengers': 'Passengers',
 'Length': 'Length',
 'Wheelbase': 'Wheelbase',
 'Width': 'Width',
 'Turn.circle': 'Turn_circle',
 'Rear.seat.room': 'Rear_seat_room',
 'Luggage.room': 'Luggage_room',
 'Weight': 'Weight',
 'Origin': 'Origin',
 'Make': 'Make'}

In [373]:
dictionary["Type"] = "CarType"
dictionary["Type"]

'CarType'

In [374]:
df.rename(columns = dictionary)

Unnamed: 0,Manufacturer,Model,CarType,Min_Price,Price,Max_Price,MPG_city,MPG_highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn_circle,Rear_seat_room,Luggage_room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i
5,Buick,Century,Midsize,14.2,15.7,17.3,22.0,31.0,Driver only,,...,6.0,189.0,105.0,69.0,41.0,28.0,16.0,,USA,Buick Century
6,Buick,LeSabre,Large,19.9,20.8,,19.0,28.0,Driver only,Front,...,6.0,200.0,111.0,74.0,42.0,30.5,17.0,3470.0,USA,Buick LeSabre
7,Buick,Roadmaster,Large,22.6,23.7,24.9,16.0,25.0,Driver only,Rear,...,6.0,216.0,116.0,78.0,45.0,30.5,21.0,4105.0,USA,Buick Roadmaster
8,Buick,Riviera,Midsize,26.3,26.3,26.3,19.0,27.0,Driver only,Front,...,5.0,198.0,108.0,,41.0,26.5,14.0,3495.0,USA,Buick Riviera
9,Cadillac,DeVille,Large,33.0,34.7,36.3,16.0,25.0,Driver only,Front,...,6.0,206.0,114.0,73.0,43.0,35.0,18.0,3620.0,USA,Cadillac DeVille


#### #  Other Solution


In [None]:
# Step 1:
df=df.rename(columns = {'Type':'CarType'})
# or
df.columns.values[2] = "CarType"

# Step 2:
df.columns = df.columns.map(lambda x: x.replace('.', '_'))

# 40. How to check if a dataframe has any missing values?
Difficulty Level: L1

Check if df has any missing values.

In [375]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [377]:
df.isnull().head(2)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [378]:
df.isnull().values

array([[False, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False]])

In [380]:
df.isnull().any().head(3)

Manufacturer    True
Model           True
Type            True
dtype: bool

In [381]:
df.isnull().values.any()

True

# 41. How to count the number of missing values in each column?
Difficulty Level: L2

Count the number of missing values in each column of df. Which column has the maximum number of missing values?

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [None]:
df.head(3)

In [None]:
df.index.max()

In [None]:
# Returns the dataframe but with booleans if that value is null
df.isnull().head()

In [None]:
# Method chaining with previous method and then summing up all values that are true, i.e axis = 1
df.isnull().sum(axis = 0).head(3)

In [None]:
# Find the index position of the maximum number of null values
df.isnull().sum(axis = 0).idxmax()

# 42. How to replace missing values of multiple numeric columns with the mean?
Difficulty Level: L2

Replace missing values in Min.Price and Max.Price columns with their respective mean.

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [None]:
df.head()

In [None]:
df["Min.Price"].mean()

In [None]:
df["Max.Price"].mean()

In [None]:
min_mean = df["Min.Price"].mean()
max_mean = df["Max.Price"].mean()
df["Min.Price"] = df["Min.Price"].fillna(min_mean)
df["Max.Price"] = df["Max.Price"].fillna(max_mean)
df

In [None]:
# Alternative Solution
df_out = df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x: x.fillna(x.mean()))
print(df_out.head())

# 43. How to use apply function on existing columns with global variables as additional arguments?
Difficulty Level: L3

In df, use apply method to replace the missing values in Min.Price with the column’s mean and those in Max.Price with the column’s median.



In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [None]:
min_mean = df["Min.Price"].mean()

In [None]:
max_med = df["Max.Price"].median()

In [None]:
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian}

# for lambda function supply both x and d, x being the value you are currently on. Fill any null values with the value
# from the dictionary that 
df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x, d: x.fillna(d[x.name](x)), args=(d, ))

In [None]:
df[['Min.Price', 'Max.Price']]

# 44. How to select a specific column from a dataframe as a dataframe instead of a series?
Difficulty Level: L2

Get the first column (a) in df as a dataframe (rather than as a Series).

In [None]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df.head()

In [None]:
df2 = pd.DataFrame(df["a"])
df2

In [None]:
type(df2)

# 45. How to change the order of columns of a dataframe?
Difficulty Level: L3

Actually 3 questions.

1. In df, interchange columns 'a' and 'c'.

2. Create a generic function to interchange two columns, without hardcoding column names.

3. Sort the columns in reverse alphabetical order, that is colume 'e' first through column 'a' last.

In [None]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df.head(3)

In [None]:
# First solution
df[list('cbade')]

In [None]:
# Solution Q2 - No hard coding
def switch_columns(df, col1=None, col2=None):
    # Extract column names as list and assign to variable
    colnames = df.columns.tolist()
    
    # Set variables that are the index of each column name in the colnames list
    i1, i2 = colnames.index(col1), colnames.index(col2)
    
    # switch positions in the list of each column name
    colnames[i2], colnames[i1] = colnames[i1], colnames[i2]
    
    # return a dataframe with the switched columns
    return df[colnames]

# Call function and assign to new dataframe
df1 = switch_columns(df, 'a', 'c')

In [None]:
# Question 3
df.sort_index(axis=1, ascending=False, inplace=True)

# 46. How to set the number of rows and columns displayed in the output?
Difficulty Level: L2

Change the pamdas display settings on printing the dataframe df it shows a maximum of 10 rows and 10 columns.

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [None]:
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

In [None]:
df

In [None]:
# Alternative solution
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)


# 47. How to format or suppress scientific notations in a pandas dataframe?
Difficulty Level: L2

Suppress scientific notations like ‘e-03’ in df and print upto 4 numbers after decimal.

In [382]:
df = pd.DataFrame(np.random.random(4)**10, columns=['random'])

In [383]:
df

Unnamed: 0,random
0,1.325165e-06
1,1.574542e-08
2,0.001184339
3,9.313461e-06


In [384]:
df.round(4)

Unnamed: 0,random
0,0.0
1,0.0
2,0.0012
3,0.0


# 48. How to format all the values in a dataframe as percentages?
Difficulty Level: L2

Format the values in column 'random' of df as percentages.



In [385]:
df = pd.DataFrame(np.random.random(4), columns=['random'])

In [386]:
df

Unnamed: 0,random
0,0.382925
1,0.014589
2,0.45556
3,0.532633


In [387]:
# Create a series with values to 2dp and multiply them all by 100, keep the index the same as the df index
df['random'] = pd.Series(["{0:.2f}%".format(val * 100) for val in df['random']], index = df.index)

In [388]:
df

Unnamed: 0,random
0,38.29%
1,1.46%
2,45.56%
3,53.26%


# 49. How to filter every nth row in a dataframe?
Difficulty Level: L1

From df, filter the 'Manufacturer', 'Model' and 'Type' for every 20th row starting from 1st (row 0).

In [389]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [390]:
df.head(3)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90


In [391]:
df[['Manufacturer', 'Model', 'Type']].iloc[0::20, :]

Unnamed: 0,Manufacturer,Model,Type
0,Acura,Integra,Small
20,Chrysler,LeBaron,Compact
40,Honda,Prelude,Sporty
60,Mercury,Cougar,Midsize
80,Subaru,Loyale,Small


In [392]:
# Alternative solution
df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]

Unnamed: 0,Manufacturer,Model,Type
0,Acura,Integra,Small
20,Chrysler,LeBaron,Compact
40,Honda,Prelude,Sporty
60,Mercury,Cougar,Midsize
80,Subaru,Loyale,Small


# 50. How to create a primary key index by combining relevant columns?
Difficulty Level: L2

In df, Replace NaNs with ‘missing’ in columns 'Manufacturer', 'Model' and 'Type' and create a index as a combination of these three columns and check if the index is a primary key.



In [394]:
df[['Manufacturer', 'Model', 'Type']].isnull().head()

Unnamed: 0,Manufacturer,Model,Type
0,False,False,False
1,True,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [396]:
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna("missing")

# Give the index a name by concatenating the other values in that row
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
df.head(2)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
Acura_Integra_Small,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
missing_Legend_Midsize,missing,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend


# 51. How to get the row number of the nth largest value in a column?
Difficulty Level: L2

Find the row position of the 5th largest value of column 'a' in df.

In [None]:
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
df

In [None]:
df["a"].nlargest(5)[-1:]

In [None]:
df['a'].argsort()

# 53. How to get the last n rows of a dataframe with row sum > 100?
Difficulty Level: L2

Get the last two rows of df whose row sum is greater than 100.

In [None]:
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
df.head(3)

In [None]:
df["sum"] = df.sum(axis=1)
df

In [None]:
mask = df["sum"] > 100
df[mask].sort_values("sum", ascending = False, inplace = True)

In [None]:
df.tail(2)

In [None]:
# Alternative solution
rowsums = df.apply(np.sum, axis=1)
last_two_rows = df.iloc[np.where(rowsums > 100)[0][-2:], :]
last_two_rows

# 54. How to find and cap outliers from a series or dataframe column?
Difficulty Level: L2

Replace all values of ser in the lower 5%ile and greater than 95%ile with respective 5th and 95th %ile value.



In [None]:
ser = pd.Series(np.logspace(-2, 2, 30))
ser

In [None]:
percentiles = np.percentile(ser, q=[5, 95])

In [None]:
fifth, ninetyfifth = percentiles[0], percentiles[1]

In [None]:
ser[ser < fifth] = fifth
ser[ser > ninetyfifth] = ninetyfifth
ser


In [None]:
# Alternative solution
ser = pd.Series(np.logspace(-2, 2, 30))

In [None]:
def cap_outliers(ser, low_perc, high_perc):
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    ser[ser < low] = low
    ser[ser > high] = high
    return(ser)

In [None]:
capped_ser = cap_outliers(ser, .05, .95)

# 55. How to reshape a dataframe to the largest possible square after removing the negative values?
Difficulty Level: L3

Reshape df to the largest possible square with negative values removed. Drop the smallest values if need be. The order of the positive numbers in the result should remain the same as the original.

In [None]:
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
df

In [None]:
# Step 1: remove negative values from arr
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]

In [None]:
# Step 2: find side-length of largest possible square
n = int(np.floor(arr_qualified.shape[0]**.5))

In [None]:
# Step 3: Take top n^2 items without changing positions
top_indexes = np.argsort(arr_qualified)[::-1]
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
print(output)

# 56. How to swap two rows of a dataframe?
Difficulty Level: L2

Swap rows 1 and 2 in df.



In [None]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))
df

In [None]:
df.reindex([0,2,1,3,4])

# Alternative Solution Swap Rows function

In [None]:

def swap_rows(df, i1, i2):
    a, b = df.iloc[i1, :].copy(), df.iloc[i2, :].copy()
    df.iloc[i1, :], df.iloc[i2, :] = b, a
    return df

print(swap_rows(df, 1, 2))

# 57. How to reverse the rows of a dataframe?
Difficulty Level: L2

Reverse all the rows of dataframe df.

In [None]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))

In [None]:
df

In [None]:
new_index = df.index.tolist()

In [None]:
new_index.reverse()


In [None]:
new_index

In [None]:
df.reindex(new_index)

In [None]:
# Alternative Solution 1
df.iloc[::-1, :]

# Alternative Solution 2
print(df.loc[df.index[::-1], :])

# 59. Which column contains the highest number of row-wise maximum values?
Difficulty Level: L2

Obtain the column name with the highest number of row-wise maximum’s in df.

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))

In [None]:
df

In [None]:
# numpy.argmax(a, axis=None, out=None)[source]
# Returns the indices of the maximum values along an axis.

df.apply(np.argmax, axis=1)

In [None]:
df.apply(np.argmax, axis=1).value_counts()

In [None]:
df.apply(np.argmax, axis=1).value_counts().index[0]

# 60. How to create a new column that contains the row number of nearest column by euclidean distance?
Create a new column such that, each row contains the row number of nearest row-record by euclidean distance.

Difficulty Level: L3

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1), columns=list('pqrs'), index=list('abcdefghij'))
df

In [None]:
# init outputs
nearest_rows = []
nearest_distance = []

# iterate rows.
for i, row in df.iterrows():
    curr = row
    rest = df.drop(i)
    e_dists = {}  # init dict to store euclidean dists for current row.
    # iterate rest of rows for current row
    for j, contestant in rest.iterrows():
        # compute euclidean dist and update e_dists
        e_dists.update({j: round(np.linalg.norm(curr.values - contestant.values))})
    # update nearest row to current row and the distance value
    nearest_rows.append(max(e_dists, key=e_dists.get))
    nearest_distance.append(max(e_dists.values()))

df['nearest_row'] = nearest_rows
df['dist'] = nearest_distance

# 61. How to know the maximum possible correlation value of each column against other columns?
Difficulty Level: L2

Compute maximum possible absolute correlation value of each column against other columns in df.

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), columns=list('pqrstuvwxy'), index=list('abcdefgh'))

In [None]:
df

In [None]:
# np.abs converts all negatives to positive
abs_corr = np.abs(df.corr())
abs_corr

In [None]:
max_corr = abs_corr.apply(lambda x: sorted(x)[-2])

In [None]:
np.round(max_corr, 2)

# 62. How to create a column containing the minimum by maximum of each row?
Difficulty Level: L2

Compute the minimum-by-maximum for every row of df.

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [None]:
df.head()

In [None]:
df["max_by_min"] = df.max() * df.min()
df.head()

In [None]:
# Alternate solutions
min_by_max = df.apply(lambda x: np.min(x)*np.max(x), axis=1)
min_by_max = np.min(df, axis=1)*np.max(df, axis=1)

In [None]:
min_by_max 

# 63. How to create a column that contains the penultimate value in each row?
Difficulty Level: L2

Create a new column 'penultimate' which has the second largest value of each row of df.

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [None]:
df.head()

In [None]:
df.values


In [None]:
second_max = []
for i in df.values:
    i.sort()
    second_max.append(i[-2])
second_max

In [None]:
ser = pd.Series(second_max)
ser

In [None]:
df["penultimate"] = ser
df

In [None]:
# Alternative solution

# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

# Solution
out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
df['penultimate'] = out
print(df)

# 64. How to normalize all columns in a dataframe?
Difficulty Level: L2

Normalize all columns of df by subtracting the column mean and divide by standard deviation.
Range all columns of df such that the minimum value in each column is 0 and max is 1.
Don’t use external packages like sklearn.

In [None]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [None]:

df.apply(lambda x: np.abs(((x - x.mean())/x.std()).round(2)))


# 66. How to replace both the diagonals of dataframe with 0?
Difficulty Level: L2

Replace both values in both diagonals of df with 0.

In [129]:
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,33,4,52,5,39,28,62,51,98,8
1,67,69,19,26,11,37,53,43,86,11
2,52,78,68,16,18,13,57,91,46,92
3,74,36,77,46,87,74,51,68,32,42
4,18,8,12,54,64,18,34,78,8,45
5,13,6,7,26,28,68,91,80,92,5
6,23,6,41,38,18,42,76,28,92,81
7,54,73,64,84,27,36,85,67,64,71
8,59,83,37,42,61,61,85,97,49,10
9,12,51,46,28,47,54,61,92,39,92


In [130]:
for i in range(df.shape[0]):
    df.iat[i, i] = 0
    

In [131]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,4,52,5,39,28,62,51,98,8
1,67,0,19,26,11,37,53,43,86,11
2,52,78,0,16,18,13,57,91,46,92
3,74,36,77,0,87,74,51,68,32,42
4,18,8,12,54,0,18,34,78,8,45
5,13,6,7,26,28,0,91,80,92,5
6,23,6,41,38,18,42,0,28,92,81
7,54,73,64,84,27,36,85,0,64,71
8,59,83,37,42,61,61,85,97,0,10
9,12,51,46,28,47,54,61,92,39,0


# 67. How to get the particular group of a groupby dataframe by key?
Difficulty Level: L2

This is a question related to understanding of grouped dataframe. From df_grouped, get the group belonging to 'apple' as a dataframe.

In [150]:
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})



In [152]:
# Group data by column one values
df_grouped = df.groupby(["col1"])

In [156]:
# Get groups where col1 value is equal to apple
df_grouped.get_group('apple')

Unnamed: 0,col1,col2,col3
0,apple,0.670215,9
3,apple,0.856239,8
6,apple,0.613428,6


# 68. How to get the n’th largest value of a column when grouped by another column?
Difficulty Level: L2

In df, find the second largest value of 'taste' for 'banana'

In [163]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'taste': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

In [165]:
df_grouped = df.groupby(df['fruit'])
banana_df = df_grouped.get_group("banana")
banana_df

Unnamed: 0,fruit,taste,price
1,banana,0.080269,2
4,banana,0.635344,11
7,banana,0.984971,11


In [190]:
taste_values = banana_df["taste"].values
taste_values.sort()
taste_values[1]


0.635344412841703

In [191]:
# Alternate solution
df_grpd = df['taste'].groupby(df.fruit)
df_grpd.get_group('banana').sort_values().iloc[-2]

0.635344412841703

# 69. How to compute grouped mean on pandas dataframe and keep the grouped column as another column (not index)?
Difficulty Level: L1

In df, Compute the mean price of every fruit, while keeping the fruit as another column instead of an index.

In [220]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

In [221]:
df['fruit'].astype('category')

0     apple
1    banana
2    orange
3     apple
4    banana
5    orange
6     apple
7    banana
8    orange
Name: fruit, dtype: category
Categories (3, object): [apple, banana, orange]

In [228]:
# Use parameter as_index to prevent fruit becoming index
out = df.groupby('fruit', as_index = False)["price"].mean()

In [229]:
out

Unnamed: 0,fruit,price
0,apple,8.0
1,banana,5.666667
2,orange,13.333333


# 70. How to join two dataframes by 2 columns so they have only the common rows?
Difficulty Level: L2

Join dataframes df1 and df2 by ‘fruit-pazham’ and ‘weight-kilo’.



In [230]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [233]:
df1

Unnamed: 0,fruit,weight,price
0,apple,high,0
1,banana,medium,11
2,orange,low,10
3,apple,high,11
4,banana,medium,2
5,orange,low,13
6,apple,high,14
7,banana,medium,4
8,orange,low,3


In [234]:
df2

Unnamed: 0,pazham,kilo,price
0,apple,high,12
1,orange,low,0
2,pine,high,12
3,apple,low,12
4,orange,high,10
5,pine,low,6


In [236]:
df1.merge(df2, how = "left", left_on = ["fruit", "weight"], right_on = ["pazham", "kilo"], sort = True)
# axis must be set to one to look through columns and not rows

Unnamed: 0,fruit,weight,price_x,pazham,kilo,price_y
0,apple,high,0,apple,high,12.0
1,apple,high,11,apple,high,12.0
2,apple,high,14,apple,high,12.0
3,banana,medium,11,,,
4,banana,medium,2,,,
5,banana,medium,4,,,
6,orange,low,10,orange,low,0.0
7,orange,low,13,orange,low,0.0
8,orange,low,3,orange,low,0.0


In [237]:
df1.merge(df2, how = "left", left_on = ["fruit", "weight"], right_on = ["pazham", "kilo"], sort = True).drop(["pazham", "kilo"], axis = 1)

Unnamed: 0,fruit,weight,price_x,price_y
0,apple,high,0,12.0
1,apple,high,11,12.0
2,apple,high,14,12.0
3,banana,medium,11,
4,banana,medium,2,
5,banana,medium,4,
6,orange,low,10,0.0
7,orange,low,13,0.0
8,orange,low,3,0.0


# 71. How to remove rows from a dataframe that are present in another dataframe?
Difficulty Level: L3

From df1, remove the rows that are present in df2. All three columns must be the same.

In [242]:
df1 = pd.DataFrame({'fruit': ['apple', 'orange', 'banana'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.arange(9)})

df2 = pd.DataFrame({'fruit': ['apple', 'orange', 'pine'] * 2,
                    'weight': ['high', 'medium'] * 3,
                    'price': np.arange(6)})

In [249]:
df1

Unnamed: 0,fruit,weight,price
0,apple,high,0
1,orange,medium,1
2,banana,low,2
3,apple,high,3
4,orange,medium,4
5,banana,low,5
6,apple,high,6
7,orange,medium,7
8,banana,low,8


In [244]:
df2.head(2)

Unnamed: 0,fruit,weight,price
0,apple,high,0
1,orange,medium,1


In [245]:
df1.isin(df2)

Unnamed: 0,fruit,weight,price
0,True,True,True
1,True,True,True
2,False,False,True
3,True,False,True
4,True,False,True
5,False,False,True
6,False,False,False
7,False,False,False
8,False,False,False


In [247]:
df1.isin(df2).all(axis=1)

0     True
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8    False
dtype: bool

In [248]:
df1[~df1.isin(df2).all(axis=1)]

Unnamed: 0,fruit,weight,price
2,banana,low,2
3,apple,high,3
4,orange,medium,4
5,banana,low,5
6,apple,high,6
7,orange,medium,7
8,banana,low,8


# 72. How to get the positions where values of two columns match?

In [250]:
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})

In [251]:
df

Unnamed: 0,fruit1,fruit2
0,apple,banana
1,apple,apple
2,orange,banana
3,banana,apple
4,orange,orange
5,apple,apple
6,banana,apple
7,banana,banana
8,orange,apple
9,apple,banana


In [253]:
np.where(df.fruit1 == df.fruit2)

(array([1, 4, 5, 7]),)

In [255]:
df.index.where(df.fruit1 == df.fruit2)

Float64Index([nan, 1.0, nan, nan, 4.0, 5.0, nan, 7.0, nan, nan], dtype='float64')

# 73. How to create lags and leads of a column in a dataframe?
Difficulty Level: L2

Create two new columns in df, one of which is a lag1 (shift column a down by 1 row) of column ‘a’ and the other is a lead1 (shift column b up by 1 row).

In [256]:
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))
df

Unnamed: 0,a,b,c,d
0,41,93,19,67
1,87,65,29,90
2,65,10,52,47
3,55,67,40,34
4,97,56,73,1


In [257]:
df["lag1"] = df["a"].shift(periods = 1)
df["lead1"] = df["b"].shift(periods = -1)
df

Unnamed: 0,a,b,c,d,lag1,lead1
0,41,93,19,67,,65.0
1,87,65,29,90,41.0,10.0
2,65,10,52,47,87.0,67.0
3,55,67,40,34,65.0,56.0
4,97,56,73,1,55.0,


# 74. How to get the frequency of unique values in the entire dataframe?
Difficulty Level: L2

Get the frequency of unique values in the entire dataframe df.

In [258]:
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd'))
df

Unnamed: 0,a,b,c,d
0,9,2,5,1
1,4,3,2,8
2,2,3,3,5
3,2,5,7,6
4,3,3,2,4


In [266]:
df.values

array([[9, 2, 5, 1],
       [4, 3, 2, 8],
       [2, 3, 3, 5],
       [2, 5, 7, 6],
       [3, 3, 2, 4]])

In [267]:
df.values.flatten()

array([9, 2, 5, 1, 4, 3, 2, 8, 2, 3, 3, 5, 2, 5, 7, 6, 3, 3, 2, 4])

In [269]:
flat_list = df.values.flatten()

In [272]:
flat_list = set(flat_list)
flat_list

{1, 2, 3, 4, 5, 6, 7, 8, 9}

In [273]:
len(flat_list)

9

In [274]:
len(set(df.values.flatten()))

9

# 75. How to split a text column into two separate columns?
Difficulty Level: L2

Split the string column in df to form a dataframe with 3 columns as shown.



In [304]:
df = pd.DataFrame(["STD, City, State",
"33, Kolkata, West Bengal",
"44, Chennai, Tamil Nadu",
"40, Hyderabad, Telengana",
"80, Bangalore, Karnataka"], columns=['row'])

df

Unnamed: 0,row
0,"STD, City, State"
1,"33, Kolkata, West Bengal"
2,"44, Chennai, Tamil Nadu"
3,"40, Hyderabad, Telengana"
4,"80, Bangalore, Karnataka"


In [311]:
# Split the strings out across the new columns
df_out = df.row.str.split(',|\t', expand=True)
# Make first row as header
new_header = df_out.iloc[0]
# Remove column headers from data
df_out = df_out[1:]
# Set new headers as column names
df_out.columns = new_header
df_out

Unnamed: 0,STD,None,None.1
1,33,Kolkata,West Bengal
2,44,Chennai,Tamil Nadu
3,40,Hyderabad,Telengana
4,80,Bangalore,Karnataka


# duhfsk