### 101 Pandas Exercises for Data Analysis
https://www.machinelearningplus.com/python/101-pandas-exercises-python/

In [52]:
import numpy as np
import pandas as pd

### 41. Count the number of missing values in each column of df. Which column has the maximum number of missing values?

In [53]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i


In [54]:
# Solution 1
n_missings_each_col = df.apply(lambda x: x.isnull().sum())
n_missings_each_col

Manufacturer       4
Model              1
Type               3
Min.Price          7
Price              2
                  ..
Rear.seat.room     4
Luggage.room      19
Weight             7
Origin             5
Make               3
Length: 27, dtype: int64

In [55]:
n_missings_each_col.argmax()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


'Luggage.room'

In [56]:
#Solution 2
df.isnull().sum().argmax()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  


'Luggage.room'

### 42. Replace missing values in Min.Price and Max.Price columns with their respective mean.

In [57]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i


In [58]:
# Solution
df_out = df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x: x.fillna(x.mean()))
df_out.head()

Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,21.459091


### 43. In df, use apply method to replace the missing values in Min.Price with the column’s mean and those in Max.Price with the column’s median.

In [59]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [60]:
df_out = df[['Min.Price']] = df[['Min.Price']].apply(lambda x: x.fillna(x.mean()))
df_out.head()

Unnamed: 0,Min.Price
0,12.9
1,29.2
2,25.9
3,17.118605
4,17.118605


In [61]:
temp = df[['Max.Price']].apply(lambda x: x.fillna(x.median()))
df_out['Max.Price'] = temp
df_out.head()

Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,19.15


In [62]:
# Solution ???
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian}
df[['Min.Price', 'Max.Price']] = df[['Min.Price', 'Max.Price']].apply(lambda x, d: x.fillna(d[x.name](x)), args=(d, ))
df[['Min.Price', 'Max.Price']].head()

Unnamed: 0,Min.Price,Max.Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,19.15


### 44. Get the first column (a) in df as a dataframe (rather than as a Series).

In [63]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [64]:
# Solution
type(df[['a']])

pandas.core.frame.DataFrame

In [65]:
type(df.loc[:, ['a']])

pandas.core.frame.DataFrame

In [66]:
type(df.iloc[:, [0]])

pandas.core.frame.DataFrame

In [67]:
type(df.a)

pandas.core.series.Series

In [68]:
type(df['a'])

pandas.core.series.Series

In [69]:
type(df.loc[:, 'a'])

pandas.core.series.Series

In [70]:
type(df.iloc[:, 1])

pandas.core.series.Series

### 45. How to change the order of columns of a dataframe?
Actually 3 questions.

1.In df, interchange columns 'a' and 'c'.

2.Create a generic function to interchange two columns, without hardcoding column names.

3.Sort the columns in reverse alphabetical order, that is colume 'e' first through column 'a' last.

In [71]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df.head()

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [72]:
# Solution Q1
df[list('cbade')]

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


In [73]:
# Solution Q2 - No hard coding
def switch_columns(df, col1=None, col2=None):
    colnames = df.columns.tolist()
    i1, i2 = colnames.index(col1), colnames.index(col2)
    colnames[i2], colnames[i1] = colnames[i1], colnames[i2]
    return df[colnames]

df1 = switch_columns(df, 'a', 'c')
df1.head()

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


In [74]:
# Solution Q3
#df[sorted(df.columns)]
df[sorted(df.columns, reverse=True)]

Unnamed: 0,e,d,c,b,a
0,4,3,2,1,0
1,9,8,7,6,5
2,14,13,12,11,10
3,19,18,17,16,15


In [75]:
df.sort_index(axis=1, ascending=False, inplace=True)
df.head()

Unnamed: 0,e,d,c,b,a
0,4,3,2,1,0
1,9,8,7,6,5
2,14,13,12,11,10
3,19,18,17,16,15


### 46. Change the pamdas display settings on printing the dataframe df it shows a maximum of 10 rows and 10 columns.

In [76]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i


In [78]:
# Solution ???
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
df

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i
...,...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,16.6,19.7,...,34.0,,3960.0,,Volkswagen Eurovan
89,Volkswagen,Passat,Compact,17.6,20.0,...,31.5,14.0,2985.0,non-USA,Volkswagen Passat
90,Volkswagen,Corrado,Sporty,22.9,23.3,...,26.0,15.0,2810.0,non-USA,Volkswagen Corrado
91,Volvo,240,Compact,21.8,22.7,...,29.5,14.0,2985.0,non-USA,Volvo 240


### 47. Suppress scientific notations like ‘e-03’ in df and print upto 4 numbers after decimal.

In [92]:
df = pd.DataFrame(np.random.random(4)**10, columns=['random'])
df

Unnamed: 0,random
0,0.0
1,0.4738
2,0.0
3,0.2646


In [81]:
# Solution 1: Rounding
df.round(4)

Unnamed: 0,random
0,0.0101
1,0.1505
2,0.0007
3,0.0


In [84]:
# Solution 2: Use apply to change format
df.apply(lambda x: '%.4f' % x, axis=1)

0    0.0101
1    0.1505
2    0.0007
3    0.0000
dtype: object

In [85]:
# or
df.applymap(lambda x: '%.4f' % x)

Unnamed: 0,random
0,0.0101
1,0.1505
2,0.0007
3,0.0


In [89]:
# Solution 3: Use set_option
pd.set_option('display.float_format', lambda x: '%.4f' % x)
df

Unnamed: 0,random
0,0.333
1,0.0092
2,0.2309
3,0.0


In [91]:
# Solution 4: Assign display.float_format
pd.options.display.float_format = '{:.4f}'.format
df

Unnamed: 0,random
0,0.002
1,0.0139
2,0.0
3,0.0001


In [94]:
# Reset/undo float formatting
pd.options.display.float_format = None
df

Unnamed: 0,random
0,4.392369e-11
1,0.4738496
2,1.05942e-18
3,0.2646074


### 48. Format the values in column 'random' of df as percentages.

In [95]:
df = pd.DataFrame(np.random.random(4), columns=['random'])
df

Unnamed: 0,random
0,0.051534
1,0.517319
2,0.749627
3,0.987864


In [96]:
# Solution
out = df.style.format({'random': '{0:.2%}'.format,})
out

Unnamed: 0,random
0,5.15%
1,51.73%
2,74.96%
3,98.79%


### 49. From df, filter the 'Manufacturer', 'Model' and 'Type' for every 20th row starting from 1st (row 0).

In [98]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,...,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,...,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,...,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,...,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,...,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,...,27.0,13.0,3640.0,non-USA,BMW 535i


In [102]:
# Solution
df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]

Unnamed: 0,Manufacturer,Model,Type
0,Acura,Integra,Small
20,Chrysler,LeBaron,Compact
40,Honda,Prelude,Sporty
60,Mercury,Cougar,Midsize
80,Subaru,Loyale,Small


### 50. In df, Replace NaNs with ‘missing’ in columns 'Manufacturer', 'Model' and 'Type' and create a index as a combination of these three columns and check if the index is a primary key.

In [103]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', usecols=[0,1,2,3,5])
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
0,Acura,Integra,Small,12.9,18.8
1,,Legend,Midsize,29.2,38.7
2,Audi,90,Compact,25.9,32.3
3,Audi,100,Midsize,,44.6
4,BMW,535i,Midsize,,


In [107]:
# Solution
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
print(df.index.is_unique)

True
